diff --git "a/lora_adapter/checkpoint-5000/trainer_state.json" "b/lora_adapter/checkpoint-5000/trainer_state.json" deleted file mode 100644--- "a/lora_adapter/checkpoint-5000/trainer_state.json" +++ /dev/null @@ -1,7034 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 2.0, - "eval_steps": 500, - "global_step": 5000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.002, - "grad_norm": 0.5721193552017212, - "learning_rate": 9.992e-05, - "loss": 4.2877, - "step": 5 - }, - { - "epoch": 0.004, - "grad_norm": 0.6026411056518555, - "learning_rate": 9.982e-05, - "loss": 4.6802, - "step": 10 - }, - { - "epoch": 0.006, - "grad_norm": 0.9385420680046082, - "learning_rate": 9.972e-05, - "loss": 4.6201, - "step": 15 - }, - { - "epoch": 0.008, - "grad_norm": 0.8009935021400452, - "learning_rate": 9.962e-05, - "loss": 4.7671, - "step": 20 - }, - { - "epoch": 0.01, - "grad_norm": 0.9409578442573547, - "learning_rate": 9.952e-05, - "loss": 4.2347, - "step": 25 - }, - { - "epoch": 0.012, - "grad_norm": 1.1376001834869385, - "learning_rate": 9.942000000000001e-05, - "loss": 4.4625, - "step": 30 - }, - { - "epoch": 0.014, - "grad_norm": 0.9677644371986389, - "learning_rate": 9.932e-05, - "loss": 4.5317, - "step": 35 - }, - { - "epoch": 0.016, - "grad_norm": 0.878607988357544, - "learning_rate": 9.922e-05, - "loss": 4.1702, - "step": 40 - }, - { - "epoch": 0.018, - "grad_norm": 1.034571886062622, - "learning_rate": 9.912e-05, - "loss": 4.215, - "step": 45 - }, - { - "epoch": 0.02, - "grad_norm": 1.0319870710372925, - "learning_rate": 9.902e-05, - "loss": 3.9984, - "step": 50 - }, - { - "epoch": 0.022, - "grad_norm": 0.7936278581619263, - "learning_rate": 9.892e-05, - "loss": 4.1078, - "step": 55 - }, - { - "epoch": 0.024, - "grad_norm": 1.5388593673706055, - "learning_rate": 9.882e-05, - "loss": 4.1454, - "step": 60 - }, - { - "epoch": 0.026, - "grad_norm": 1.1013274192810059, - "learning_rate": 9.872e-05, - "loss": 4.1011, - "step": 65 - }, - { - "epoch": 0.028, - "grad_norm": 1.3863942623138428, - "learning_rate": 9.862e-05, - "loss": 3.8758, - "step": 70 - }, - { - "epoch": 0.03, - "grad_norm": 1.2699391841888428, - "learning_rate": 9.852e-05, - "loss": 3.8447, - "step": 75 - }, - { - "epoch": 0.032, - "grad_norm": 0.79298996925354, - "learning_rate": 9.842e-05, - "loss": 3.6708, - "step": 80 - }, - { - "epoch": 0.034, - "grad_norm": 1.3336719274520874, - "learning_rate": 9.832000000000001e-05, - "loss": 3.8648, - "step": 85 - }, - { - "epoch": 0.036, - "grad_norm": 1.0719950199127197, - "learning_rate": 9.822e-05, - "loss": 3.7916, - "step": 90 - }, - { - "epoch": 0.038, - "grad_norm": 1.332682490348816, - "learning_rate": 9.812e-05, - "loss": 3.6925, - "step": 95 - }, - { - "epoch": 0.04, - "grad_norm": 1.3171230554580688, - "learning_rate": 9.802e-05, - "loss": 3.6201, - "step": 100 - }, - { - "epoch": 0.042, - "grad_norm": 1.0597072839736938, - "learning_rate": 9.792e-05, - "loss": 3.484, - "step": 105 - }, - { - "epoch": 0.044, - "grad_norm": 1.6820316314697266, - "learning_rate": 9.782e-05, - "loss": 3.6541, - "step": 110 - }, - { - "epoch": 0.046, - "grad_norm": 1.7244327068328857, - "learning_rate": 9.772e-05, - "loss": 3.5441, - "step": 115 - }, - { - "epoch": 0.048, - "grad_norm": 1.0304560661315918, - "learning_rate": 9.762e-05, - "loss": 3.5992, - "step": 120 - }, - { - "epoch": 0.05, - "grad_norm": 1.675391435623169, - "learning_rate": 9.752e-05, - "loss": 3.1433, - "step": 125 - }, - { - "epoch": 0.052, - "grad_norm": 1.9963089227676392, - "learning_rate": 9.742e-05, - "loss": 3.3042, - "step": 130 - }, - { - "epoch": 0.054, - "grad_norm": 1.8973188400268555, - "learning_rate": 9.732e-05, - "loss": 3.3942, - "step": 135 - }, - { - "epoch": 0.056, - "grad_norm": 1.1776793003082275, - "learning_rate": 9.722e-05, - "loss": 3.1565, - "step": 140 - }, - { - "epoch": 0.058, - "grad_norm": 1.6588083505630493, - "learning_rate": 9.712e-05, - "loss": 3.2037, - "step": 145 - }, - { - "epoch": 0.06, - "grad_norm": 1.866132140159607, - "learning_rate": 9.702e-05, - "loss": 2.921, - "step": 150 - }, - { - "epoch": 0.062, - "grad_norm": 0.8898491263389587, - "learning_rate": 9.692e-05, - "loss": 3.0541, - "step": 155 - }, - { - "epoch": 0.064, - "grad_norm": 1.8436152935028076, - "learning_rate": 9.682e-05, - "loss": 2.7864, - "step": 160 - }, - { - "epoch": 0.066, - "grad_norm": 2.3928751945495605, - "learning_rate": 9.672e-05, - "loss": 3.0799, - "step": 165 - }, - { - "epoch": 0.068, - "grad_norm": 1.4375264644622803, - "learning_rate": 9.661999999999999e-05, - "loss": 2.9754, - "step": 170 - }, - { - "epoch": 0.07, - "grad_norm": 1.478073239326477, - "learning_rate": 9.652e-05, - "loss": 2.8644, - "step": 175 - }, - { - "epoch": 0.072, - "grad_norm": 1.5689969062805176, - "learning_rate": 9.642e-05, - "loss": 2.9735, - "step": 180 - }, - { - "epoch": 0.074, - "grad_norm": 1.9494465589523315, - "learning_rate": 9.632e-05, - "loss": 2.6551, - "step": 185 - }, - { - "epoch": 0.076, - "grad_norm": 2.043407678604126, - "learning_rate": 9.622000000000001e-05, - "loss": 2.6535, - "step": 190 - }, - { - "epoch": 0.078, - "grad_norm": 1.8407542705535889, - "learning_rate": 9.612000000000001e-05, - "loss": 2.7985, - "step": 195 - }, - { - "epoch": 0.08, - "grad_norm": 1.5500164031982422, - "learning_rate": 9.602e-05, - "loss": 2.9799, - "step": 200 - }, - { - "epoch": 0.082, - "grad_norm": 1.3006932735443115, - "learning_rate": 9.592e-05, - "loss": 2.9563, - "step": 205 - }, - { - "epoch": 0.084, - "grad_norm": 1.2256354093551636, - "learning_rate": 9.582000000000001e-05, - "loss": 2.9478, - "step": 210 - }, - { - "epoch": 0.086, - "grad_norm": 2.3953299522399902, - "learning_rate": 9.572000000000001e-05, - "loss": 2.8945, - "step": 215 - }, - { - "epoch": 0.088, - "grad_norm": 2.034975051879883, - "learning_rate": 9.562000000000001e-05, - "loss": 2.839, - "step": 220 - }, - { - "epoch": 0.09, - "grad_norm": 2.116765260696411, - "learning_rate": 9.552000000000001e-05, - "loss": 2.626, - "step": 225 - }, - { - "epoch": 0.092, - "grad_norm": 1.7377326488494873, - "learning_rate": 9.542e-05, - "loss": 3.0082, - "step": 230 - }, - { - "epoch": 0.094, - "grad_norm": 1.8839207887649536, - "learning_rate": 9.532000000000002e-05, - "loss": 2.7061, - "step": 235 - }, - { - "epoch": 0.096, - "grad_norm": 1.8325484991073608, - "learning_rate": 9.522000000000001e-05, - "loss": 2.6903, - "step": 240 - }, - { - "epoch": 0.098, - "grad_norm": 1.7984235286712646, - "learning_rate": 9.512000000000001e-05, - "loss": 2.7144, - "step": 245 - }, - { - "epoch": 0.1, - "grad_norm": 2.731910228729248, - "learning_rate": 9.502000000000001e-05, - "loss": 2.6156, - "step": 250 - }, - { - "epoch": 0.102, - "grad_norm": 2.2913668155670166, - "learning_rate": 9.492e-05, - "loss": 2.4733, - "step": 255 - }, - { - "epoch": 0.104, - "grad_norm": 1.8068524599075317, - "learning_rate": 9.482e-05, - "loss": 2.7326, - "step": 260 - }, - { - "epoch": 0.106, - "grad_norm": 2.2460227012634277, - "learning_rate": 9.472000000000001e-05, - "loss": 2.7199, - "step": 265 - }, - { - "epoch": 0.108, - "grad_norm": 2.186492443084717, - "learning_rate": 9.462000000000001e-05, - "loss": 2.7873, - "step": 270 - }, - { - "epoch": 0.11, - "grad_norm": 2.345064401626587, - "learning_rate": 9.452000000000001e-05, - "loss": 2.5964, - "step": 275 - }, - { - "epoch": 0.112, - "grad_norm": 1.6393128633499146, - "learning_rate": 9.442000000000001e-05, - "loss": 2.7022, - "step": 280 - }, - { - "epoch": 0.114, - "grad_norm": 1.9504517316818237, - "learning_rate": 9.432e-05, - "loss": 2.526, - "step": 285 - }, - { - "epoch": 0.116, - "grad_norm": 3.769509792327881, - "learning_rate": 9.422e-05, - "loss": 2.4051, - "step": 290 - }, - { - "epoch": 0.118, - "grad_norm": 2.109177589416504, - "learning_rate": 9.412000000000001e-05, - "loss": 2.3615, - "step": 295 - }, - { - "epoch": 0.12, - "grad_norm": 6.674826145172119, - "learning_rate": 9.402000000000001e-05, - "loss": 2.5718, - "step": 300 - }, - { - "epoch": 0.122, - "grad_norm": 2.5551745891571045, - "learning_rate": 9.392000000000001e-05, - "loss": 2.5388, - "step": 305 - }, - { - "epoch": 0.124, - "grad_norm": 2.7368383407592773, - "learning_rate": 9.382e-05, - "loss": 2.1562, - "step": 310 - }, - { - "epoch": 0.126, - "grad_norm": 2.9764292240142822, - "learning_rate": 9.372e-05, - "loss": 2.4115, - "step": 315 - }, - { - "epoch": 0.128, - "grad_norm": 2.150486469268799, - "learning_rate": 9.362e-05, - "loss": 2.4289, - "step": 320 - }, - { - "epoch": 0.13, - "grad_norm": 3.41752028465271, - "learning_rate": 9.352000000000001e-05, - "loss": 2.4018, - "step": 325 - }, - { - "epoch": 0.132, - "grad_norm": 2.62450909614563, - "learning_rate": 9.342000000000001e-05, - "loss": 2.404, - "step": 330 - }, - { - "epoch": 0.134, - "grad_norm": 2.1548142433166504, - "learning_rate": 9.332000000000001e-05, - "loss": 2.766, - "step": 335 - }, - { - "epoch": 0.136, - "grad_norm": 2.3468611240386963, - "learning_rate": 9.322e-05, - "loss": 2.4288, - "step": 340 - }, - { - "epoch": 0.138, - "grad_norm": 1.9857568740844727, - "learning_rate": 9.312e-05, - "loss": 2.0464, - "step": 345 - }, - { - "epoch": 0.14, - "grad_norm": 1.7904646396636963, - "learning_rate": 9.302e-05, - "loss": 2.5532, - "step": 350 - }, - { - "epoch": 0.142, - "grad_norm": 1.6434996128082275, - "learning_rate": 9.292000000000001e-05, - "loss": 2.2769, - "step": 355 - }, - { - "epoch": 0.144, - "grad_norm": 2.023183584213257, - "learning_rate": 9.282000000000001e-05, - "loss": 2.37, - "step": 360 - }, - { - "epoch": 0.146, - "grad_norm": 1.925668478012085, - "learning_rate": 9.272e-05, - "loss": 2.7774, - "step": 365 - }, - { - "epoch": 0.148, - "grad_norm": 3.1799802780151367, - "learning_rate": 9.262e-05, - "loss": 2.4829, - "step": 370 - }, - { - "epoch": 0.15, - "grad_norm": 2.7041819095611572, - "learning_rate": 9.252e-05, - "loss": 2.3482, - "step": 375 - }, - { - "epoch": 0.152, - "grad_norm": 2.807724952697754, - "learning_rate": 9.242000000000001e-05, - "loss": 2.0214, - "step": 380 - }, - { - "epoch": 0.154, - "grad_norm": 2.2531774044036865, - "learning_rate": 9.232000000000001e-05, - "loss": 2.93, - "step": 385 - }, - { - "epoch": 0.156, - "grad_norm": 2.0609052181243896, - "learning_rate": 9.222000000000001e-05, - "loss": 1.9283, - "step": 390 - }, - { - "epoch": 0.158, - "grad_norm": 2.284008502960205, - "learning_rate": 9.212e-05, - "loss": 2.2357, - "step": 395 - }, - { - "epoch": 0.16, - "grad_norm": 2.8613440990448, - "learning_rate": 9.202e-05, - "loss": 2.1285, - "step": 400 - }, - { - "epoch": 0.162, - "grad_norm": 2.23891544342041, - "learning_rate": 9.192e-05, - "loss": 2.2739, - "step": 405 - }, - { - "epoch": 0.164, - "grad_norm": 1.527755856513977, - "learning_rate": 9.182000000000001e-05, - "loss": 2.4071, - "step": 410 - }, - { - "epoch": 0.166, - "grad_norm": 1.6973111629486084, - "learning_rate": 9.172000000000001e-05, - "loss": 2.4015, - "step": 415 - }, - { - "epoch": 0.168, - "grad_norm": 3.209406614303589, - "learning_rate": 9.162000000000001e-05, - "loss": 2.4004, - "step": 420 - }, - { - "epoch": 0.17, - "grad_norm": 1.8819735050201416, - "learning_rate": 9.152e-05, - "loss": 2.2514, - "step": 425 - }, - { - "epoch": 0.172, - "grad_norm": 2.637023448944092, - "learning_rate": 9.142e-05, - "loss": 2.0511, - "step": 430 - }, - { - "epoch": 0.174, - "grad_norm": 2.4952168464660645, - "learning_rate": 9.132e-05, - "loss": 2.2291, - "step": 435 - }, - { - "epoch": 0.176, - "grad_norm": 2.280730724334717, - "learning_rate": 9.122000000000001e-05, - "loss": 2.4591, - "step": 440 - }, - { - "epoch": 0.178, - "grad_norm": 1.9758051633834839, - "learning_rate": 9.112000000000001e-05, - "loss": 2.4378, - "step": 445 - }, - { - "epoch": 0.18, - "grad_norm": 2.1086337566375732, - "learning_rate": 9.102e-05, - "loss": 2.2705, - "step": 450 - }, - { - "epoch": 0.182, - "grad_norm": 2.398313045501709, - "learning_rate": 9.092e-05, - "loss": 2.2926, - "step": 455 - }, - { - "epoch": 0.184, - "grad_norm": 3.39194393157959, - "learning_rate": 9.082e-05, - "loss": 2.8741, - "step": 460 - }, - { - "epoch": 0.186, - "grad_norm": 2.1371476650238037, - "learning_rate": 9.072e-05, - "loss": 1.9811, - "step": 465 - }, - { - "epoch": 0.188, - "grad_norm": 2.9003446102142334, - "learning_rate": 9.062000000000001e-05, - "loss": 2.4993, - "step": 470 - }, - { - "epoch": 0.19, - "grad_norm": 2.0266385078430176, - "learning_rate": 9.052000000000001e-05, - "loss": 2.2897, - "step": 475 - }, - { - "epoch": 0.192, - "grad_norm": 1.8421316146850586, - "learning_rate": 9.042e-05, - "loss": 2.0086, - "step": 480 - }, - { - "epoch": 0.194, - "grad_norm": 1.958868145942688, - "learning_rate": 9.032e-05, - "loss": 2.3263, - "step": 485 - }, - { - "epoch": 0.196, - "grad_norm": 2.8556814193725586, - "learning_rate": 9.022e-05, - "loss": 2.3719, - "step": 490 - }, - { - "epoch": 0.198, - "grad_norm": 2.265723705291748, - "learning_rate": 9.012e-05, - "loss": 2.2051, - "step": 495 - }, - { - "epoch": 0.2, - "grad_norm": 1.8368626832962036, - "learning_rate": 9.002000000000001e-05, - "loss": 2.3211, - "step": 500 - }, - { - "epoch": 0.202, - "grad_norm": 3.4433846473693848, - "learning_rate": 8.992e-05, - "loss": 2.0655, - "step": 505 - }, - { - "epoch": 0.204, - "grad_norm": 1.8898130655288696, - "learning_rate": 8.982e-05, - "loss": 1.992, - "step": 510 - }, - { - "epoch": 0.206, - "grad_norm": 3.5473153591156006, - "learning_rate": 8.972e-05, - "loss": 2.1858, - "step": 515 - }, - { - "epoch": 0.208, - "grad_norm": 2.271097183227539, - "learning_rate": 8.962e-05, - "loss": 1.9518, - "step": 520 - }, - { - "epoch": 0.21, - "grad_norm": 1.821327805519104, - "learning_rate": 8.952000000000001e-05, - "loss": 1.9524, - "step": 525 - }, - { - "epoch": 0.212, - "grad_norm": 3.471569776535034, - "learning_rate": 8.942000000000001e-05, - "loss": 1.8348, - "step": 530 - }, - { - "epoch": 0.214, - "grad_norm": 3.1918933391571045, - "learning_rate": 8.932e-05, - "loss": 2.2592, - "step": 535 - }, - { - "epoch": 0.216, - "grad_norm": 2.0800018310546875, - "learning_rate": 8.922e-05, - "loss": 2.3358, - "step": 540 - }, - { - "epoch": 0.218, - "grad_norm": 1.8120659589767456, - "learning_rate": 8.912e-05, - "loss": 2.2089, - "step": 545 - }, - { - "epoch": 0.22, - "grad_norm": 2.169672727584839, - "learning_rate": 8.902e-05, - "loss": 2.3545, - "step": 550 - }, - { - "epoch": 0.222, - "grad_norm": 1.9190467596054077, - "learning_rate": 8.892000000000001e-05, - "loss": 2.2975, - "step": 555 - }, - { - "epoch": 0.224, - "grad_norm": 2.399026870727539, - "learning_rate": 8.882000000000001e-05, - "loss": 2.3177, - "step": 560 - }, - { - "epoch": 0.226, - "grad_norm": 1.993609070777893, - "learning_rate": 8.872e-05, - "loss": 2.412, - "step": 565 - }, - { - "epoch": 0.228, - "grad_norm": 4.1268720626831055, - "learning_rate": 8.862e-05, - "loss": 2.3971, - "step": 570 - }, - { - "epoch": 0.23, - "grad_norm": 2.6726512908935547, - "learning_rate": 8.852e-05, - "loss": 2.294, - "step": 575 - }, - { - "epoch": 0.232, - "grad_norm": 2.2172746658325195, - "learning_rate": 8.842e-05, - "loss": 2.355, - "step": 580 - }, - { - "epoch": 0.234, - "grad_norm": 2.61527943611145, - "learning_rate": 8.832000000000001e-05, - "loss": 1.83, - "step": 585 - }, - { - "epoch": 0.236, - "grad_norm": 1.6478010416030884, - "learning_rate": 8.822e-05, - "loss": 2.1412, - "step": 590 - }, - { - "epoch": 0.238, - "grad_norm": 2.563441038131714, - "learning_rate": 8.812e-05, - "loss": 2.2381, - "step": 595 - }, - { - "epoch": 0.24, - "grad_norm": 3.079211473464966, - "learning_rate": 8.802e-05, - "loss": 2.2569, - "step": 600 - }, - { - "epoch": 0.242, - "grad_norm": 1.9616568088531494, - "learning_rate": 8.792e-05, - "loss": 2.2858, - "step": 605 - }, - { - "epoch": 0.244, - "grad_norm": 2.6890292167663574, - "learning_rate": 8.782e-05, - "loss": 2.0128, - "step": 610 - }, - { - "epoch": 0.246, - "grad_norm": 1.2593388557434082, - "learning_rate": 8.772000000000001e-05, - "loss": 2.4054, - "step": 615 - }, - { - "epoch": 0.248, - "grad_norm": 2.716627836227417, - "learning_rate": 8.762e-05, - "loss": 2.5457, - "step": 620 - }, - { - "epoch": 0.25, - "grad_norm": 2.6016945838928223, - "learning_rate": 8.752e-05, - "loss": 1.6912, - "step": 625 - }, - { - "epoch": 0.252, - "grad_norm": 2.391510248184204, - "learning_rate": 8.742e-05, - "loss": 2.0171, - "step": 630 - }, - { - "epoch": 0.254, - "grad_norm": 4.822355270385742, - "learning_rate": 8.732e-05, - "loss": 2.1439, - "step": 635 - }, - { - "epoch": 0.256, - "grad_norm": 3.8465750217437744, - "learning_rate": 8.722e-05, - "loss": 2.0739, - "step": 640 - }, - { - "epoch": 0.258, - "grad_norm": 2.866173267364502, - "learning_rate": 8.712e-05, - "loss": 2.0621, - "step": 645 - }, - { - "epoch": 0.26, - "grad_norm": 2.4506778717041016, - "learning_rate": 8.702e-05, - "loss": 2.0337, - "step": 650 - }, - { - "epoch": 0.262, - "grad_norm": 2.4373891353607178, - "learning_rate": 8.692e-05, - "loss": 1.7654, - "step": 655 - }, - { - "epoch": 0.264, - "grad_norm": 2.212902784347534, - "learning_rate": 8.682e-05, - "loss": 2.1709, - "step": 660 - }, - { - "epoch": 0.266, - "grad_norm": 2.6106960773468018, - "learning_rate": 8.672e-05, - "loss": 1.9015, - "step": 665 - }, - { - "epoch": 0.268, - "grad_norm": 4.304783344268799, - "learning_rate": 8.662000000000001e-05, - "loss": 2.0843, - "step": 670 - }, - { - "epoch": 0.27, - "grad_norm": 2.9099340438842773, - "learning_rate": 8.652e-05, - "loss": 2.2098, - "step": 675 - }, - { - "epoch": 0.272, - "grad_norm": 2.6931354999542236, - "learning_rate": 8.642e-05, - "loss": 2.1349, - "step": 680 - }, - { - "epoch": 0.274, - "grad_norm": 3.630815029144287, - "learning_rate": 8.632e-05, - "loss": 1.7593, - "step": 685 - }, - { - "epoch": 0.276, - "grad_norm": 2.0120015144348145, - "learning_rate": 8.622e-05, - "loss": 2.1293, - "step": 690 - }, - { - "epoch": 0.278, - "grad_norm": 3.897691249847412, - "learning_rate": 8.612e-05, - "loss": 2.1552, - "step": 695 - }, - { - "epoch": 0.28, - "grad_norm": 2.266237735748291, - "learning_rate": 8.602e-05, - "loss": 2.2244, - "step": 700 - }, - { - "epoch": 0.282, - "grad_norm": 2.100522994995117, - "learning_rate": 8.592e-05, - "loss": 2.3361, - "step": 705 - }, - { - "epoch": 0.284, - "grad_norm": 2.1430091857910156, - "learning_rate": 8.582e-05, - "loss": 1.7879, - "step": 710 - }, - { - "epoch": 0.286, - "grad_norm": 3.2257421016693115, - "learning_rate": 8.572e-05, - "loss": 2.0216, - "step": 715 - }, - { - "epoch": 0.288, - "grad_norm": 0.9987928867340088, - "learning_rate": 8.562e-05, - "loss": 2.3699, - "step": 720 - }, - { - "epoch": 0.29, - "grad_norm": 3.250732421875, - "learning_rate": 8.552e-05, - "loss": 1.7009, - "step": 725 - }, - { - "epoch": 0.292, - "grad_norm": 2.7594077587127686, - "learning_rate": 8.542e-05, - "loss": 1.829, - "step": 730 - }, - { - "epoch": 0.294, - "grad_norm": 3.0348315238952637, - "learning_rate": 8.532e-05, - "loss": 1.4677, - "step": 735 - }, - { - "epoch": 0.296, - "grad_norm": 2.9564616680145264, - "learning_rate": 8.522e-05, - "loss": 1.7962, - "step": 740 - }, - { - "epoch": 0.298, - "grad_norm": 2.6723451614379883, - "learning_rate": 8.512e-05, - "loss": 2.4121, - "step": 745 - }, - { - "epoch": 0.3, - "grad_norm": 3.3210055828094482, - "learning_rate": 8.502e-05, - "loss": 2.1947, - "step": 750 - }, - { - "epoch": 0.302, - "grad_norm": 2.0533103942871094, - "learning_rate": 8.492e-05, - "loss": 2.1698, - "step": 755 - }, - { - "epoch": 0.304, - "grad_norm": 1.7164925336837769, - "learning_rate": 8.482e-05, - "loss": 2.3975, - "step": 760 - }, - { - "epoch": 0.306, - "grad_norm": 2.3715977668762207, - "learning_rate": 8.472e-05, - "loss": 2.0064, - "step": 765 - }, - { - "epoch": 0.308, - "grad_norm": 2.326876640319824, - "learning_rate": 8.462e-05, - "loss": 1.8805, - "step": 770 - }, - { - "epoch": 0.31, - "grad_norm": 2.4446003437042236, - "learning_rate": 8.452e-05, - "loss": 2.0861, - "step": 775 - }, - { - "epoch": 0.312, - "grad_norm": 3.457144021987915, - "learning_rate": 8.442e-05, - "loss": 2.3564, - "step": 780 - }, - { - "epoch": 0.314, - "grad_norm": 2.255930185317993, - "learning_rate": 8.431999999999999e-05, - "loss": 2.1533, - "step": 785 - }, - { - "epoch": 0.316, - "grad_norm": 1.9043174982070923, - "learning_rate": 8.422e-05, - "loss": 1.914, - "step": 790 - }, - { - "epoch": 0.318, - "grad_norm": 3.0527002811431885, - "learning_rate": 8.412e-05, - "loss": 1.9351, - "step": 795 - }, - { - "epoch": 0.32, - "grad_norm": 3.707892417907715, - "learning_rate": 8.402e-05, - "loss": 2.0129, - "step": 800 - }, - { - "epoch": 0.322, - "grad_norm": 1.6021428108215332, - "learning_rate": 8.392e-05, - "loss": 2.0383, - "step": 805 - }, - { - "epoch": 0.324, - "grad_norm": 2.2315077781677246, - "learning_rate": 8.382e-05, - "loss": 1.8572, - "step": 810 - }, - { - "epoch": 0.326, - "grad_norm": 2.0886893272399902, - "learning_rate": 8.372e-05, - "loss": 2.389, - "step": 815 - }, - { - "epoch": 0.328, - "grad_norm": 2.5066492557525635, - "learning_rate": 8.362000000000002e-05, - "loss": 2.3126, - "step": 820 - }, - { - "epoch": 0.33, - "grad_norm": 2.559074640274048, - "learning_rate": 8.352000000000001e-05, - "loss": 1.8435, - "step": 825 - }, - { - "epoch": 0.332, - "grad_norm": 1.2982532978057861, - "learning_rate": 8.342000000000001e-05, - "loss": 2.6958, - "step": 830 - }, - { - "epoch": 0.334, - "grad_norm": 2.9500558376312256, - "learning_rate": 8.332000000000001e-05, - "loss": 2.2249, - "step": 835 - }, - { - "epoch": 0.336, - "grad_norm": 1.1935762166976929, - "learning_rate": 8.322e-05, - "loss": 2.1226, - "step": 840 - }, - { - "epoch": 0.338, - "grad_norm": 2.153440237045288, - "learning_rate": 8.312e-05, - "loss": 2.1367, - "step": 845 - }, - { - "epoch": 0.34, - "grad_norm": 3.4815332889556885, - "learning_rate": 8.302000000000001e-05, - "loss": 1.7813, - "step": 850 - }, - { - "epoch": 0.342, - "grad_norm": 2.8280904293060303, - "learning_rate": 8.292000000000001e-05, - "loss": 1.9479, - "step": 855 - }, - { - "epoch": 0.344, - "grad_norm": 3.511687994003296, - "learning_rate": 8.282000000000001e-05, - "loss": 1.9082, - "step": 860 - }, - { - "epoch": 0.346, - "grad_norm": 2.669370651245117, - "learning_rate": 8.272000000000001e-05, - "loss": 1.5332, - "step": 865 - }, - { - "epoch": 0.348, - "grad_norm": 2.840242862701416, - "learning_rate": 8.262e-05, - "loss": 2.3229, - "step": 870 - }, - { - "epoch": 0.35, - "grad_norm": 3.331766128540039, - "learning_rate": 8.252e-05, - "loss": 2.0155, - "step": 875 - }, - { - "epoch": 0.352, - "grad_norm": 4.060706615447998, - "learning_rate": 8.242000000000001e-05, - "loss": 1.7354, - "step": 880 - }, - { - "epoch": 0.354, - "grad_norm": 2.9245781898498535, - "learning_rate": 8.232000000000001e-05, - "loss": 2.1195, - "step": 885 - }, - { - "epoch": 0.356, - "grad_norm": 2.2486793994903564, - "learning_rate": 8.222000000000001e-05, - "loss": 2.0922, - "step": 890 - }, - { - "epoch": 0.358, - "grad_norm": 1.3685901165008545, - "learning_rate": 8.212e-05, - "loss": 2.0711, - "step": 895 - }, - { - "epoch": 0.36, - "grad_norm": 3.810460090637207, - "learning_rate": 8.202e-05, - "loss": 1.6854, - "step": 900 - }, - { - "epoch": 0.362, - "grad_norm": 2.693786382675171, - "learning_rate": 8.192e-05, - "loss": 1.9812, - "step": 905 - }, - { - "epoch": 0.364, - "grad_norm": 3.220974922180176, - "learning_rate": 8.182000000000001e-05, - "loss": 2.1331, - "step": 910 - }, - { - "epoch": 0.366, - "grad_norm": 3.7384660243988037, - "learning_rate": 8.172000000000001e-05, - "loss": 1.713, - "step": 915 - }, - { - "epoch": 0.368, - "grad_norm": 2.024315118789673, - "learning_rate": 8.162000000000001e-05, - "loss": 2.2023, - "step": 920 - }, - { - "epoch": 0.37, - "grad_norm": 3.1162705421447754, - "learning_rate": 8.152e-05, - "loss": 1.77, - "step": 925 - }, - { - "epoch": 0.372, - "grad_norm": 2.4156429767608643, - "learning_rate": 8.142e-05, - "loss": 1.9556, - "step": 930 - }, - { - "epoch": 0.374, - "grad_norm": 1.5801384449005127, - "learning_rate": 8.132e-05, - "loss": 1.9622, - "step": 935 - }, - { - "epoch": 0.376, - "grad_norm": 3.660128355026245, - "learning_rate": 8.122000000000001e-05, - "loss": 2.078, - "step": 940 - }, - { - "epoch": 0.378, - "grad_norm": 1.9089343547821045, - "learning_rate": 8.112000000000001e-05, - "loss": 1.9397, - "step": 945 - }, - { - "epoch": 0.38, - "grad_norm": 2.250739812850952, - "learning_rate": 8.102000000000001e-05, - "loss": 1.6644, - "step": 950 - }, - { - "epoch": 0.382, - "grad_norm": 2.162501573562622, - "learning_rate": 8.092e-05, - "loss": 1.7254, - "step": 955 - }, - { - "epoch": 0.384, - "grad_norm": 1.6305783987045288, - "learning_rate": 8.082e-05, - "loss": 1.7052, - "step": 960 - }, - { - "epoch": 0.386, - "grad_norm": 3.8243024349212646, - "learning_rate": 8.072000000000001e-05, - "loss": 1.6534, - "step": 965 - }, - { - "epoch": 0.388, - "grad_norm": 2.9563748836517334, - "learning_rate": 8.062000000000001e-05, - "loss": 2.0002, - "step": 970 - }, - { - "epoch": 0.39, - "grad_norm": 2.350604772567749, - "learning_rate": 8.052000000000001e-05, - "loss": 1.8192, - "step": 975 - }, - { - "epoch": 0.392, - "grad_norm": 1.9382598400115967, - "learning_rate": 8.042e-05, - "loss": 2.1386, - "step": 980 - }, - { - "epoch": 0.394, - "grad_norm": 3.3442025184631348, - "learning_rate": 8.032e-05, - "loss": 1.7758, - "step": 985 - }, - { - "epoch": 0.396, - "grad_norm": 4.59849214553833, - "learning_rate": 8.022e-05, - "loss": 1.9987, - "step": 990 - }, - { - "epoch": 0.398, - "grad_norm": 1.7831141948699951, - "learning_rate": 8.012000000000001e-05, - "loss": 1.8504, - "step": 995 - }, - { - "epoch": 0.4, - "grad_norm": 3.119198799133301, - "learning_rate": 8.002000000000001e-05, - "loss": 2.1055, - "step": 1000 - }, - { - "epoch": 0.402, - "grad_norm": 4.341230869293213, - "learning_rate": 7.992000000000001e-05, - "loss": 1.9915, - "step": 1005 - }, - { - "epoch": 0.404, - "grad_norm": 3.653338670730591, - "learning_rate": 7.982e-05, - "loss": 1.9072, - "step": 1010 - }, - { - "epoch": 0.406, - "grad_norm": 2.365283489227295, - "learning_rate": 7.972e-05, - "loss": 2.1189, - "step": 1015 - }, - { - "epoch": 0.408, - "grad_norm": 2.3448755741119385, - "learning_rate": 7.962e-05, - "loss": 1.5658, - "step": 1020 - }, - { - "epoch": 0.41, - "grad_norm": 3.2361137866973877, - "learning_rate": 7.952000000000001e-05, - "loss": 1.5764, - "step": 1025 - }, - { - "epoch": 0.412, - "grad_norm": 4.448095798492432, - "learning_rate": 7.942000000000001e-05, - "loss": 1.9814, - "step": 1030 - }, - { - "epoch": 0.414, - "grad_norm": 1.5654709339141846, - "learning_rate": 7.932e-05, - "loss": 1.8629, - "step": 1035 - }, - { - "epoch": 0.416, - "grad_norm": 3.3745901584625244, - "learning_rate": 7.922e-05, - "loss": 2.1952, - "step": 1040 - }, - { - "epoch": 0.418, - "grad_norm": 2.3770949840545654, - "learning_rate": 7.912e-05, - "loss": 1.9977, - "step": 1045 - }, - { - "epoch": 0.42, - "grad_norm": 3.179367780685425, - "learning_rate": 7.902e-05, - "loss": 1.9814, - "step": 1050 - }, - { - "epoch": 0.422, - "grad_norm": 1.5007638931274414, - "learning_rate": 7.892000000000001e-05, - "loss": 1.8761, - "step": 1055 - }, - { - "epoch": 0.424, - "grad_norm": 3.5575854778289795, - "learning_rate": 7.882000000000001e-05, - "loss": 1.789, - "step": 1060 - }, - { - "epoch": 0.426, - "grad_norm": 1.8852957487106323, - "learning_rate": 7.872e-05, - "loss": 2.178, - "step": 1065 - }, - { - "epoch": 0.428, - "grad_norm": 2.534390449523926, - "learning_rate": 7.862e-05, - "loss": 1.9272, - "step": 1070 - }, - { - "epoch": 0.43, - "grad_norm": 3.5568392276763916, - "learning_rate": 7.852e-05, - "loss": 2.116, - "step": 1075 - }, - { - "epoch": 0.432, - "grad_norm": 2.170743942260742, - "learning_rate": 7.842e-05, - "loss": 1.4085, - "step": 1080 - }, - { - "epoch": 0.434, - "grad_norm": 2.4826807975769043, - "learning_rate": 7.832000000000001e-05, - "loss": 1.6083, - "step": 1085 - }, - { - "epoch": 0.436, - "grad_norm": 3.557332992553711, - "learning_rate": 7.822e-05, - "loss": 2.0262, - "step": 1090 - }, - { - "epoch": 0.438, - "grad_norm": 2.6044585704803467, - "learning_rate": 7.812e-05, - "loss": 1.9665, - "step": 1095 - }, - { - "epoch": 0.44, - "grad_norm": 2.431857109069824, - "learning_rate": 7.802e-05, - "loss": 1.9879, - "step": 1100 - }, - { - "epoch": 0.442, - "grad_norm": 3.814208507537842, - "learning_rate": 7.792e-05, - "loss": 1.6894, - "step": 1105 - }, - { - "epoch": 0.444, - "grad_norm": 2.7338225841522217, - "learning_rate": 7.782000000000001e-05, - "loss": 1.7777, - "step": 1110 - }, - { - "epoch": 0.446, - "grad_norm": 2.560375690460205, - "learning_rate": 7.772000000000001e-05, - "loss": 2.0086, - "step": 1115 - }, - { - "epoch": 0.448, - "grad_norm": 2.316746950149536, - "learning_rate": 7.762e-05, - "loss": 1.7457, - "step": 1120 - }, - { - "epoch": 0.45, - "grad_norm": 1.6756999492645264, - "learning_rate": 7.752e-05, - "loss": 2.0588, - "step": 1125 - }, - { - "epoch": 0.452, - "grad_norm": 1.4262984991073608, - "learning_rate": 7.742e-05, - "loss": 1.9309, - "step": 1130 - }, - { - "epoch": 0.454, - "grad_norm": 3.5977210998535156, - "learning_rate": 7.732e-05, - "loss": 1.7672, - "step": 1135 - }, - { - "epoch": 0.456, - "grad_norm": 2.7261245250701904, - "learning_rate": 7.722000000000001e-05, - "loss": 1.5192, - "step": 1140 - }, - { - "epoch": 0.458, - "grad_norm": 2.7008583545684814, - "learning_rate": 7.712000000000001e-05, - "loss": 2.0424, - "step": 1145 - }, - { - "epoch": 0.46, - "grad_norm": 2.377896785736084, - "learning_rate": 7.702e-05, - "loss": 2.0002, - "step": 1150 - }, - { - "epoch": 0.462, - "grad_norm": 4.894864082336426, - "learning_rate": 7.692e-05, - "loss": 2.1725, - "step": 1155 - }, - { - "epoch": 0.464, - "grad_norm": 1.4119629859924316, - "learning_rate": 7.682e-05, - "loss": 2.177, - "step": 1160 - }, - { - "epoch": 0.466, - "grad_norm": 2.613739013671875, - "learning_rate": 7.672e-05, - "loss": 2.093, - "step": 1165 - }, - { - "epoch": 0.468, - "grad_norm": 2.0441625118255615, - "learning_rate": 7.662000000000001e-05, - "loss": 1.98, - "step": 1170 - }, - { - "epoch": 0.47, - "grad_norm": 3.4278924465179443, - "learning_rate": 7.652e-05, - "loss": 1.7976, - "step": 1175 - }, - { - "epoch": 0.472, - "grad_norm": 2.316985607147217, - "learning_rate": 7.642e-05, - "loss": 2.0487, - "step": 1180 - }, - { - "epoch": 0.474, - "grad_norm": 2.847053050994873, - "learning_rate": 7.632e-05, - "loss": 1.8201, - "step": 1185 - }, - { - "epoch": 0.476, - "grad_norm": 2.258514404296875, - "learning_rate": 7.622e-05, - "loss": 1.8056, - "step": 1190 - }, - { - "epoch": 0.478, - "grad_norm": 1.729820728302002, - "learning_rate": 7.612e-05, - "loss": 1.7724, - "step": 1195 - }, - { - "epoch": 0.48, - "grad_norm": 3.0825610160827637, - "learning_rate": 7.602000000000001e-05, - "loss": 1.9275, - "step": 1200 - }, - { - "epoch": 0.482, - "grad_norm": 3.6028025150299072, - "learning_rate": 7.592e-05, - "loss": 1.7892, - "step": 1205 - }, - { - "epoch": 0.484, - "grad_norm": 3.5654330253601074, - "learning_rate": 7.582e-05, - "loss": 2.3649, - "step": 1210 - }, - { - "epoch": 0.486, - "grad_norm": 3.2018349170684814, - "learning_rate": 7.572e-05, - "loss": 1.7233, - "step": 1215 - }, - { - "epoch": 0.488, - "grad_norm": 2.509002923965454, - "learning_rate": 7.562e-05, - "loss": 1.7338, - "step": 1220 - }, - { - "epoch": 0.49, - "grad_norm": 3.320098876953125, - "learning_rate": 7.552e-05, - "loss": 2.0038, - "step": 1225 - }, - { - "epoch": 0.492, - "grad_norm": 3.109086036682129, - "learning_rate": 7.542e-05, - "loss": 1.724, - "step": 1230 - }, - { - "epoch": 0.494, - "grad_norm": 2.193565607070923, - "learning_rate": 7.532e-05, - "loss": 1.9984, - "step": 1235 - }, - { - "epoch": 0.496, - "grad_norm": 1.5994617938995361, - "learning_rate": 7.522e-05, - "loss": 1.4454, - "step": 1240 - }, - { - "epoch": 0.498, - "grad_norm": 4.096536159515381, - "learning_rate": 7.512e-05, - "loss": 1.9554, - "step": 1245 - }, - { - "epoch": 0.5, - "grad_norm": 4.227677822113037, - "learning_rate": 7.502e-05, - "loss": 2.1382, - "step": 1250 - }, - { - "epoch": 0.502, - "grad_norm": 3.4727842807769775, - "learning_rate": 7.492000000000001e-05, - "loss": 1.5761, - "step": 1255 - }, - { - "epoch": 0.504, - "grad_norm": 3.6935126781463623, - "learning_rate": 7.482e-05, - "loss": 1.845, - "step": 1260 - }, - { - "epoch": 0.506, - "grad_norm": 2.6635711193084717, - "learning_rate": 7.472e-05, - "loss": 1.9839, - "step": 1265 - }, - { - "epoch": 0.508, - "grad_norm": 3.7328500747680664, - "learning_rate": 7.462e-05, - "loss": 1.9438, - "step": 1270 - }, - { - "epoch": 0.51, - "grad_norm": 2.842043161392212, - "learning_rate": 7.452e-05, - "loss": 1.7112, - "step": 1275 - }, - { - "epoch": 0.512, - "grad_norm": 2.5873022079467773, - "learning_rate": 7.442e-05, - "loss": 1.7037, - "step": 1280 - }, - { - "epoch": 0.514, - "grad_norm": 2.5171470642089844, - "learning_rate": 7.432e-05, - "loss": 2.0828, - "step": 1285 - }, - { - "epoch": 0.516, - "grad_norm": 2.580310344696045, - "learning_rate": 7.422e-05, - "loss": 1.9703, - "step": 1290 - }, - { - "epoch": 0.518, - "grad_norm": 1.925465703010559, - "learning_rate": 7.412e-05, - "loss": 1.9266, - "step": 1295 - }, - { - "epoch": 0.52, - "grad_norm": 4.212243556976318, - "learning_rate": 7.402e-05, - "loss": 1.816, - "step": 1300 - }, - { - "epoch": 0.522, - "grad_norm": 2.8834757804870605, - "learning_rate": 7.392e-05, - "loss": 1.7435, - "step": 1305 - }, - { - "epoch": 0.524, - "grad_norm": 3.207301616668701, - "learning_rate": 7.382e-05, - "loss": 1.6266, - "step": 1310 - }, - { - "epoch": 0.526, - "grad_norm": 2.595672369003296, - "learning_rate": 7.372e-05, - "loss": 2.1611, - "step": 1315 - }, - { - "epoch": 0.528, - "grad_norm": 1.9702566862106323, - "learning_rate": 7.362e-05, - "loss": 1.874, - "step": 1320 - }, - { - "epoch": 0.53, - "grad_norm": 3.2945854663848877, - "learning_rate": 7.352e-05, - "loss": 2.385, - "step": 1325 - }, - { - "epoch": 0.532, - "grad_norm": 2.8158018589019775, - "learning_rate": 7.342e-05, - "loss": 1.8912, - "step": 1330 - }, - { - "epoch": 0.534, - "grad_norm": 3.153384208679199, - "learning_rate": 7.332e-05, - "loss": 1.8591, - "step": 1335 - }, - { - "epoch": 0.536, - "grad_norm": 2.0991859436035156, - "learning_rate": 7.322e-05, - "loss": 2.4344, - "step": 1340 - }, - { - "epoch": 0.538, - "grad_norm": 1.6609746217727661, - "learning_rate": 7.312e-05, - "loss": 1.6431, - "step": 1345 - }, - { - "epoch": 0.54, - "grad_norm": 1.7339993715286255, - "learning_rate": 7.302e-05, - "loss": 1.8644, - "step": 1350 - }, - { - "epoch": 0.542, - "grad_norm": 2.7158915996551514, - "learning_rate": 7.292e-05, - "loss": 1.7384, - "step": 1355 - }, - { - "epoch": 0.544, - "grad_norm": 3.752121925354004, - "learning_rate": 7.282e-05, - "loss": 1.6989, - "step": 1360 - }, - { - "epoch": 0.546, - "grad_norm": 0.895588755607605, - "learning_rate": 7.272e-05, - "loss": 1.99, - "step": 1365 - }, - { - "epoch": 0.548, - "grad_norm": 3.2313334941864014, - "learning_rate": 7.261999999999999e-05, - "loss": 1.7486, - "step": 1370 - }, - { - "epoch": 0.55, - "grad_norm": 3.4713807106018066, - "learning_rate": 7.252e-05, - "loss": 1.6347, - "step": 1375 - }, - { - "epoch": 0.552, - "grad_norm": 2.7429184913635254, - "learning_rate": 7.242e-05, - "loss": 1.8079, - "step": 1380 - }, - { - "epoch": 0.554, - "grad_norm": 1.5747346878051758, - "learning_rate": 7.232e-05, - "loss": 1.5241, - "step": 1385 - }, - { - "epoch": 0.556, - "grad_norm": 2.867905855178833, - "learning_rate": 7.222e-05, - "loss": 1.8958, - "step": 1390 - }, - { - "epoch": 0.558, - "grad_norm": 2.3015518188476562, - "learning_rate": 7.212e-05, - "loss": 1.7197, - "step": 1395 - }, - { - "epoch": 0.56, - "grad_norm": 1.6140376329421997, - "learning_rate": 7.202e-05, - "loss": 1.8053, - "step": 1400 - }, - { - "epoch": 0.562, - "grad_norm": 3.653310537338257, - "learning_rate": 7.192e-05, - "loss": 1.739, - "step": 1405 - }, - { - "epoch": 0.564, - "grad_norm": 2.1771411895751953, - "learning_rate": 7.182e-05, - "loss": 1.8199, - "step": 1410 - }, - { - "epoch": 0.566, - "grad_norm": 3.141714096069336, - "learning_rate": 7.172e-05, - "loss": 1.782, - "step": 1415 - }, - { - "epoch": 0.568, - "grad_norm": 3.9781055450439453, - "learning_rate": 7.162e-05, - "loss": 1.9008, - "step": 1420 - }, - { - "epoch": 0.57, - "grad_norm": 2.663086175918579, - "learning_rate": 7.151999999999999e-05, - "loss": 1.787, - "step": 1425 - }, - { - "epoch": 0.572, - "grad_norm": 2.78171443939209, - "learning_rate": 7.142e-05, - "loss": 1.676, - "step": 1430 - }, - { - "epoch": 0.574, - "grad_norm": 1.9540828466415405, - "learning_rate": 7.132e-05, - "loss": 2.553, - "step": 1435 - }, - { - "epoch": 0.576, - "grad_norm": 3.7563962936401367, - "learning_rate": 7.122000000000001e-05, - "loss": 1.614, - "step": 1440 - }, - { - "epoch": 0.578, - "grad_norm": 3.0696017742156982, - "learning_rate": 7.112000000000001e-05, - "loss": 1.6421, - "step": 1445 - }, - { - "epoch": 0.58, - "grad_norm": 2.7918848991394043, - "learning_rate": 7.102000000000001e-05, - "loss": 1.576, - "step": 1450 - }, - { - "epoch": 0.582, - "grad_norm": 2.9208178520202637, - "learning_rate": 7.092e-05, - "loss": 1.7068, - "step": 1455 - }, - { - "epoch": 0.584, - "grad_norm": 2.821730375289917, - "learning_rate": 7.082e-05, - "loss": 1.9337, - "step": 1460 - }, - { - "epoch": 0.586, - "grad_norm": 3.104081392288208, - "learning_rate": 7.072000000000001e-05, - "loss": 1.6916, - "step": 1465 - }, - { - "epoch": 0.588, - "grad_norm": 4.225072860717773, - "learning_rate": 7.062000000000001e-05, - "loss": 1.489, - "step": 1470 - }, - { - "epoch": 0.59, - "grad_norm": 1.777544379234314, - "learning_rate": 7.052000000000001e-05, - "loss": 2.5044, - "step": 1475 - }, - { - "epoch": 0.592, - "grad_norm": 3.047288179397583, - "learning_rate": 7.042000000000001e-05, - "loss": 1.7485, - "step": 1480 - }, - { - "epoch": 0.594, - "grad_norm": 2.2908759117126465, - "learning_rate": 7.032e-05, - "loss": 1.5557, - "step": 1485 - }, - { - "epoch": 0.596, - "grad_norm": 3.3206658363342285, - "learning_rate": 7.022e-05, - "loss": 1.707, - "step": 1490 - }, - { - "epoch": 0.598, - "grad_norm": 6.7620649337768555, - "learning_rate": 7.012000000000001e-05, - "loss": 1.7839, - "step": 1495 - }, - { - "epoch": 0.6, - "grad_norm": 2.4363317489624023, - "learning_rate": 7.002000000000001e-05, - "loss": 2.006, - "step": 1500 - }, - { - "epoch": 0.602, - "grad_norm": 1.6987566947937012, - "learning_rate": 6.992000000000001e-05, - "loss": 1.701, - "step": 1505 - }, - { - "epoch": 0.604, - "grad_norm": 1.0138988494873047, - "learning_rate": 6.982e-05, - "loss": 2.0307, - "step": 1510 - }, - { - "epoch": 0.606, - "grad_norm": 3.704721689224243, - "learning_rate": 6.972e-05, - "loss": 1.9313, - "step": 1515 - }, - { - "epoch": 0.608, - "grad_norm": 2.189314126968384, - "learning_rate": 6.962e-05, - "loss": 2.195, - "step": 1520 - }, - { - "epoch": 0.61, - "grad_norm": 2.160581111907959, - "learning_rate": 6.952000000000001e-05, - "loss": 1.8127, - "step": 1525 - }, - { - "epoch": 0.612, - "grad_norm": 2.969454288482666, - "learning_rate": 6.942000000000001e-05, - "loss": 1.8863, - "step": 1530 - }, - { - "epoch": 0.614, - "grad_norm": 3.452462673187256, - "learning_rate": 6.932000000000001e-05, - "loss": 1.8243, - "step": 1535 - }, - { - "epoch": 0.616, - "grad_norm": 4.208456039428711, - "learning_rate": 6.922e-05, - "loss": 1.72, - "step": 1540 - }, - { - "epoch": 0.618, - "grad_norm": 2.2857871055603027, - "learning_rate": 6.912e-05, - "loss": 1.886, - "step": 1545 - }, - { - "epoch": 0.62, - "grad_norm": 2.4010958671569824, - "learning_rate": 6.902000000000001e-05, - "loss": 2.0313, - "step": 1550 - }, - { - "epoch": 0.622, - "grad_norm": 3.4712297916412354, - "learning_rate": 6.892000000000001e-05, - "loss": 1.5378, - "step": 1555 - }, - { - "epoch": 0.624, - "grad_norm": 2.614377975463867, - "learning_rate": 6.882000000000001e-05, - "loss": 1.5747, - "step": 1560 - }, - { - "epoch": 0.626, - "grad_norm": 1.621139407157898, - "learning_rate": 6.872e-05, - "loss": 2.2916, - "step": 1565 - }, - { - "epoch": 0.628, - "grad_norm": 2.306574821472168, - "learning_rate": 6.862e-05, - "loss": 1.7473, - "step": 1570 - }, - { - "epoch": 0.63, - "grad_norm": 2.851588010787964, - "learning_rate": 6.852e-05, - "loss": 1.5369, - "step": 1575 - }, - { - "epoch": 0.632, - "grad_norm": 3.665318489074707, - "learning_rate": 6.842000000000001e-05, - "loss": 1.7895, - "step": 1580 - }, - { - "epoch": 0.634, - "grad_norm": 1.9340227842330933, - "learning_rate": 6.832000000000001e-05, - "loss": 1.9506, - "step": 1585 - }, - { - "epoch": 0.636, - "grad_norm": 4.726400375366211, - "learning_rate": 6.822000000000001e-05, - "loss": 1.8055, - "step": 1590 - }, - { - "epoch": 0.638, - "grad_norm": 3.3782994747161865, - "learning_rate": 6.812e-05, - "loss": 1.9607, - "step": 1595 - }, - { - "epoch": 0.64, - "grad_norm": 2.157594680786133, - "learning_rate": 6.802e-05, - "loss": 1.9568, - "step": 1600 - }, - { - "epoch": 0.642, - "grad_norm": 2.580761671066284, - "learning_rate": 6.792e-05, - "loss": 1.8217, - "step": 1605 - }, - { - "epoch": 0.644, - "grad_norm": 2.2638015747070312, - "learning_rate": 6.782000000000001e-05, - "loss": 1.6837, - "step": 1610 - }, - { - "epoch": 0.646, - "grad_norm": 4.926771640777588, - "learning_rate": 6.772000000000001e-05, - "loss": 1.8462, - "step": 1615 - }, - { - "epoch": 0.648, - "grad_norm": 2.017150640487671, - "learning_rate": 6.762e-05, - "loss": 2.0979, - "step": 1620 - }, - { - "epoch": 0.65, - "grad_norm": 1.7009762525558472, - "learning_rate": 6.752e-05, - "loss": 1.9508, - "step": 1625 - }, - { - "epoch": 0.652, - "grad_norm": 1.5154443979263306, - "learning_rate": 6.742e-05, - "loss": 1.8678, - "step": 1630 - }, - { - "epoch": 0.654, - "grad_norm": 2.348085403442383, - "learning_rate": 6.732e-05, - "loss": 2.0632, - "step": 1635 - }, - { - "epoch": 0.656, - "grad_norm": 3.450380802154541, - "learning_rate": 6.722000000000001e-05, - "loss": 1.8161, - "step": 1640 - }, - { - "epoch": 0.658, - "grad_norm": 1.0829286575317383, - "learning_rate": 6.712000000000001e-05, - "loss": 1.9894, - "step": 1645 - }, - { - "epoch": 0.66, - "grad_norm": 2.454120397567749, - "learning_rate": 6.702e-05, - "loss": 1.4593, - "step": 1650 - }, - { - "epoch": 0.662, - "grad_norm": 1.4079653024673462, - "learning_rate": 6.692e-05, - "loss": 1.6048, - "step": 1655 - }, - { - "epoch": 0.664, - "grad_norm": 2.143089771270752, - "learning_rate": 6.682e-05, - "loss": 1.8546, - "step": 1660 - }, - { - "epoch": 0.666, - "grad_norm": 1.7809556722640991, - "learning_rate": 6.672e-05, - "loss": 1.8759, - "step": 1665 - }, - { - "epoch": 0.668, - "grad_norm": 2.6478631496429443, - "learning_rate": 6.662000000000001e-05, - "loss": 2.2062, - "step": 1670 - }, - { - "epoch": 0.67, - "grad_norm": 3.3029139041900635, - "learning_rate": 6.652000000000001e-05, - "loss": 1.6157, - "step": 1675 - }, - { - "epoch": 0.672, - "grad_norm": 2.268291473388672, - "learning_rate": 6.642e-05, - "loss": 1.7665, - "step": 1680 - }, - { - "epoch": 0.674, - "grad_norm": 2.053265333175659, - "learning_rate": 6.632e-05, - "loss": 2.01, - "step": 1685 - }, - { - "epoch": 0.676, - "grad_norm": 2.9823215007781982, - "learning_rate": 6.622e-05, - "loss": 2.2441, - "step": 1690 - }, - { - "epoch": 0.678, - "grad_norm": 2.4951868057250977, - "learning_rate": 6.612000000000001e-05, - "loss": 1.7005, - "step": 1695 - }, - { - "epoch": 0.68, - "grad_norm": 3.276228666305542, - "learning_rate": 6.602000000000001e-05, - "loss": 1.7218, - "step": 1700 - }, - { - "epoch": 0.682, - "grad_norm": 1.6981475353240967, - "learning_rate": 6.592e-05, - "loss": 1.8756, - "step": 1705 - }, - { - "epoch": 0.684, - "grad_norm": 2.3083853721618652, - "learning_rate": 6.582e-05, - "loss": 1.7134, - "step": 1710 - }, - { - "epoch": 0.686, - "grad_norm": 1.466787576675415, - "learning_rate": 6.572e-05, - "loss": 1.758, - "step": 1715 - }, - { - "epoch": 0.688, - "grad_norm": 3.2987775802612305, - "learning_rate": 6.562e-05, - "loss": 1.8357, - "step": 1720 - }, - { - "epoch": 0.69, - "grad_norm": 2.7337427139282227, - "learning_rate": 6.552000000000001e-05, - "loss": 1.9261, - "step": 1725 - }, - { - "epoch": 0.692, - "grad_norm": 3.676628828048706, - "learning_rate": 6.542000000000001e-05, - "loss": 2.2404, - "step": 1730 - }, - { - "epoch": 0.694, - "grad_norm": 1.8547945022583008, - "learning_rate": 6.532e-05, - "loss": 1.5531, - "step": 1735 - }, - { - "epoch": 0.696, - "grad_norm": 1.6941248178482056, - "learning_rate": 6.522e-05, - "loss": 1.7762, - "step": 1740 - }, - { - "epoch": 0.698, - "grad_norm": 1.8873628377914429, - "learning_rate": 6.512e-05, - "loss": 1.8979, - "step": 1745 - }, - { - "epoch": 0.7, - "grad_norm": 2.069035768508911, - "learning_rate": 6.502e-05, - "loss": 1.6585, - "step": 1750 - }, - { - "epoch": 0.702, - "grad_norm": 2.0181164741516113, - "learning_rate": 6.492000000000001e-05, - "loss": 1.5298, - "step": 1755 - }, - { - "epoch": 0.704, - "grad_norm": 3.213226795196533, - "learning_rate": 6.482e-05, - "loss": 1.8443, - "step": 1760 - }, - { - "epoch": 0.706, - "grad_norm": 1.1691619157791138, - "learning_rate": 6.472e-05, - "loss": 2.0895, - "step": 1765 - }, - { - "epoch": 0.708, - "grad_norm": 2.166172504425049, - "learning_rate": 6.462e-05, - "loss": 2.0047, - "step": 1770 - }, - { - "epoch": 0.71, - "grad_norm": 3.0072996616363525, - "learning_rate": 6.452e-05, - "loss": 1.7831, - "step": 1775 - }, - { - "epoch": 0.712, - "grad_norm": 2.720421552658081, - "learning_rate": 6.442e-05, - "loss": 1.8452, - "step": 1780 - }, - { - "epoch": 0.714, - "grad_norm": 2.536058187484741, - "learning_rate": 6.432000000000001e-05, - "loss": 1.7563, - "step": 1785 - }, - { - "epoch": 0.716, - "grad_norm": 3.408418893814087, - "learning_rate": 6.422e-05, - "loss": 1.6771, - "step": 1790 - }, - { - "epoch": 0.718, - "grad_norm": 2.075005531311035, - "learning_rate": 6.412e-05, - "loss": 2.1428, - "step": 1795 - }, - { - "epoch": 0.72, - "grad_norm": 2.7794342041015625, - "learning_rate": 6.402e-05, - "loss": 1.7375, - "step": 1800 - }, - { - "epoch": 0.722, - "grad_norm": 3.188624382019043, - "learning_rate": 6.392e-05, - "loss": 1.5951, - "step": 1805 - }, - { - "epoch": 0.724, - "grad_norm": 2.1974058151245117, - "learning_rate": 6.382e-05, - "loss": 1.9184, - "step": 1810 - }, - { - "epoch": 0.726, - "grad_norm": 2.495058298110962, - "learning_rate": 6.372e-05, - "loss": 1.7634, - "step": 1815 - }, - { - "epoch": 0.728, - "grad_norm": 3.094088077545166, - "learning_rate": 6.362e-05, - "loss": 1.8355, - "step": 1820 - }, - { - "epoch": 0.73, - "grad_norm": 2.500934600830078, - "learning_rate": 6.352e-05, - "loss": 1.4541, - "step": 1825 - }, - { - "epoch": 0.732, - "grad_norm": 2.872494697570801, - "learning_rate": 6.342e-05, - "loss": 1.7752, - "step": 1830 - }, - { - "epoch": 0.734, - "grad_norm": 1.8021352291107178, - "learning_rate": 6.332e-05, - "loss": 1.8278, - "step": 1835 - }, - { - "epoch": 0.736, - "grad_norm": 2.14013409614563, - "learning_rate": 6.322000000000001e-05, - "loss": 1.728, - "step": 1840 - }, - { - "epoch": 0.738, - "grad_norm": 1.6599818468093872, - "learning_rate": 6.312e-05, - "loss": 2.1892, - "step": 1845 - }, - { - "epoch": 0.74, - "grad_norm": 4.102724552154541, - "learning_rate": 6.302e-05, - "loss": 2.011, - "step": 1850 - }, - { - "epoch": 0.742, - "grad_norm": 1.7305388450622559, - "learning_rate": 6.292e-05, - "loss": 1.7146, - "step": 1855 - }, - { - "epoch": 0.744, - "grad_norm": 2.732679843902588, - "learning_rate": 6.282e-05, - "loss": 2.1723, - "step": 1860 - }, - { - "epoch": 0.746, - "grad_norm": 2.7860026359558105, - "learning_rate": 6.272e-05, - "loss": 1.3846, - "step": 1865 - }, - { - "epoch": 0.748, - "grad_norm": 2.3102917671203613, - "learning_rate": 6.262000000000001e-05, - "loss": 2.3062, - "step": 1870 - }, - { - "epoch": 0.75, - "grad_norm": 2.2898411750793457, - "learning_rate": 6.252e-05, - "loss": 1.8194, - "step": 1875 - }, - { - "epoch": 0.752, - "grad_norm": 2.242110252380371, - "learning_rate": 6.242e-05, - "loss": 1.3548, - "step": 1880 - }, - { - "epoch": 0.754, - "grad_norm": 2.670325994491577, - "learning_rate": 6.232e-05, - "loss": 1.7741, - "step": 1885 - }, - { - "epoch": 0.756, - "grad_norm": 2.8892014026641846, - "learning_rate": 6.222e-05, - "loss": 1.8173, - "step": 1890 - }, - { - "epoch": 0.758, - "grad_norm": 2.0819385051727295, - "learning_rate": 6.212e-05, - "loss": 1.8424, - "step": 1895 - }, - { - "epoch": 0.76, - "grad_norm": 3.9723422527313232, - "learning_rate": 6.202e-05, - "loss": 1.6035, - "step": 1900 - }, - { - "epoch": 0.762, - "grad_norm": 2.007082939147949, - "learning_rate": 6.192e-05, - "loss": 2.0778, - "step": 1905 - }, - { - "epoch": 0.764, - "grad_norm": 3.79123854637146, - "learning_rate": 6.182e-05, - "loss": 1.9806, - "step": 1910 - }, - { - "epoch": 0.766, - "grad_norm": 3.2290866374969482, - "learning_rate": 6.172e-05, - "loss": 1.8257, - "step": 1915 - }, - { - "epoch": 0.768, - "grad_norm": 1.8563956022262573, - "learning_rate": 6.162e-05, - "loss": 1.8678, - "step": 1920 - }, - { - "epoch": 0.77, - "grad_norm": 2.831134080886841, - "learning_rate": 6.152e-05, - "loss": 2.0049, - "step": 1925 - }, - { - "epoch": 0.772, - "grad_norm": 3.1902923583984375, - "learning_rate": 6.142e-05, - "loss": 1.5629, - "step": 1930 - }, - { - "epoch": 0.774, - "grad_norm": 2.6706533432006836, - "learning_rate": 6.132e-05, - "loss": 1.7534, - "step": 1935 - }, - { - "epoch": 0.776, - "grad_norm": 1.5922584533691406, - "learning_rate": 6.122e-05, - "loss": 1.6197, - "step": 1940 - }, - { - "epoch": 0.778, - "grad_norm": 3.367527723312378, - "learning_rate": 6.112e-05, - "loss": 1.7022, - "step": 1945 - }, - { - "epoch": 0.78, - "grad_norm": 2.544776678085327, - "learning_rate": 6.102e-05, - "loss": 2.0928, - "step": 1950 - }, - { - "epoch": 0.782, - "grad_norm": 1.8083670139312744, - "learning_rate": 6.092e-05, - "loss": 1.8053, - "step": 1955 - }, - { - "epoch": 0.784, - "grad_norm": 5.398744583129883, - "learning_rate": 6.082e-05, - "loss": 1.8233, - "step": 1960 - }, - { - "epoch": 0.786, - "grad_norm": 2.380007743835449, - "learning_rate": 6.072e-05, - "loss": 1.3794, - "step": 1965 - }, - { - "epoch": 0.788, - "grad_norm": 2.977511405944824, - "learning_rate": 6.062e-05, - "loss": 1.8151, - "step": 1970 - }, - { - "epoch": 0.79, - "grad_norm": 1.6027389764785767, - "learning_rate": 6.0519999999999997e-05, - "loss": 1.4474, - "step": 1975 - }, - { - "epoch": 0.792, - "grad_norm": 1.7922685146331787, - "learning_rate": 6.042e-05, - "loss": 1.4798, - "step": 1980 - }, - { - "epoch": 0.794, - "grad_norm": 4.0504984855651855, - "learning_rate": 6.032e-05, - "loss": 2.069, - "step": 1985 - }, - { - "epoch": 0.796, - "grad_norm": 1.401548147201538, - "learning_rate": 6.0219999999999996e-05, - "loss": 1.8933, - "step": 1990 - }, - { - "epoch": 0.798, - "grad_norm": 1.408260464668274, - "learning_rate": 6.012e-05, - "loss": 1.9556, - "step": 1995 - }, - { - "epoch": 0.8, - "grad_norm": 2.128838062286377, - "learning_rate": 6.002e-05, - "loss": 1.6432, - "step": 2000 - }, - { - "epoch": 0.802, - "grad_norm": 7.282062530517578, - "learning_rate": 5.9919999999999996e-05, - "loss": 2.1569, - "step": 2005 - }, - { - "epoch": 0.804, - "grad_norm": 2.412156343460083, - "learning_rate": 5.982e-05, - "loss": 1.4548, - "step": 2010 - }, - { - "epoch": 0.806, - "grad_norm": 2.9918742179870605, - "learning_rate": 5.972e-05, - "loss": 1.5009, - "step": 2015 - }, - { - "epoch": 0.808, - "grad_norm": 5.301854610443115, - "learning_rate": 5.9619999999999995e-05, - "loss": 1.5879, - "step": 2020 - }, - { - "epoch": 0.81, - "grad_norm": 3.3276255130767822, - "learning_rate": 5.952e-05, - "loss": 1.5994, - "step": 2025 - }, - { - "epoch": 0.812, - "grad_norm": 2.128038167953491, - "learning_rate": 5.942e-05, - "loss": 1.8374, - "step": 2030 - }, - { - "epoch": 0.814, - "grad_norm": 3.896848201751709, - "learning_rate": 5.9319999999999994e-05, - "loss": 1.5896, - "step": 2035 - }, - { - "epoch": 0.816, - "grad_norm": 2.371381998062134, - "learning_rate": 5.922e-05, - "loss": 1.7849, - "step": 2040 - }, - { - "epoch": 0.818, - "grad_norm": 1.7761462926864624, - "learning_rate": 5.9119999999999996e-05, - "loss": 2.2341, - "step": 2045 - }, - { - "epoch": 0.82, - "grad_norm": 2.826425552368164, - "learning_rate": 5.902e-05, - "loss": 2.1281, - "step": 2050 - }, - { - "epoch": 0.822, - "grad_norm": 3.5838959217071533, - "learning_rate": 5.892e-05, - "loss": 1.8984, - "step": 2055 - }, - { - "epoch": 0.824, - "grad_norm": 3.9069666862487793, - "learning_rate": 5.8819999999999996e-05, - "loss": 1.8578, - "step": 2060 - }, - { - "epoch": 0.826, - "grad_norm": 4.064440727233887, - "learning_rate": 5.872000000000001e-05, - "loss": 2.0205, - "step": 2065 - }, - { - "epoch": 0.828, - "grad_norm": 1.290831208229065, - "learning_rate": 5.862000000000001e-05, - "loss": 1.8112, - "step": 2070 - }, - { - "epoch": 0.83, - "grad_norm": 2.8391001224517822, - "learning_rate": 5.852000000000001e-05, - "loss": 1.3297, - "step": 2075 - }, - { - "epoch": 0.832, - "grad_norm": 2.2486915588378906, - "learning_rate": 5.8420000000000006e-05, - "loss": 1.5082, - "step": 2080 - }, - { - "epoch": 0.834, - "grad_norm": 2.228530168533325, - "learning_rate": 5.832000000000001e-05, - "loss": 2.0064, - "step": 2085 - }, - { - "epoch": 0.836, - "grad_norm": 2.0774176120758057, - "learning_rate": 5.822000000000001e-05, - "loss": 1.5593, - "step": 2090 - }, - { - "epoch": 0.838, - "grad_norm": 3.9520459175109863, - "learning_rate": 5.8120000000000006e-05, - "loss": 1.3591, - "step": 2095 - }, - { - "epoch": 0.84, - "grad_norm": 2.112677574157715, - "learning_rate": 5.802000000000001e-05, - "loss": 2.1816, - "step": 2100 - }, - { - "epoch": 0.842, - "grad_norm": 2.870356798171997, - "learning_rate": 5.792000000000001e-05, - "loss": 1.9012, - "step": 2105 - }, - { - "epoch": 0.844, - "grad_norm": 2.8879733085632324, - "learning_rate": 5.7820000000000005e-05, - "loss": 1.604, - "step": 2110 - }, - { - "epoch": 0.846, - "grad_norm": 2.116102933883667, - "learning_rate": 5.772000000000001e-05, - "loss": 1.5525, - "step": 2115 - }, - { - "epoch": 0.848, - "grad_norm": 4.587926387786865, - "learning_rate": 5.762000000000001e-05, - "loss": 2.0804, - "step": 2120 - }, - { - "epoch": 0.85, - "grad_norm": 1.983154058456421, - "learning_rate": 5.7520000000000005e-05, - "loss": 1.4631, - "step": 2125 - }, - { - "epoch": 0.852, - "grad_norm": 1.5361416339874268, - "learning_rate": 5.742000000000001e-05, - "loss": 2.3421, - "step": 2130 - }, - { - "epoch": 0.854, - "grad_norm": 1.5888581275939941, - "learning_rate": 5.732000000000001e-05, - "loss": 1.5937, - "step": 2135 - }, - { - "epoch": 0.856, - "grad_norm": 2.2069616317749023, - "learning_rate": 5.7220000000000004e-05, - "loss": 1.7698, - "step": 2140 - }, - { - "epoch": 0.858, - "grad_norm": 3.34380841255188, - "learning_rate": 5.712000000000001e-05, - "loss": 2.0116, - "step": 2145 - }, - { - "epoch": 0.86, - "grad_norm": 2.184051513671875, - "learning_rate": 5.7020000000000006e-05, - "loss": 1.8469, - "step": 2150 - }, - { - "epoch": 0.862, - "grad_norm": 4.115564823150635, - "learning_rate": 5.6920000000000004e-05, - "loss": 1.6461, - "step": 2155 - }, - { - "epoch": 0.864, - "grad_norm": 3.084815263748169, - "learning_rate": 5.682000000000001e-05, - "loss": 1.5599, - "step": 2160 - }, - { - "epoch": 0.866, - "grad_norm": 2.8951117992401123, - "learning_rate": 5.6720000000000006e-05, - "loss": 2.0385, - "step": 2165 - }, - { - "epoch": 0.868, - "grad_norm": 2.4090707302093506, - "learning_rate": 5.6620000000000003e-05, - "loss": 1.74, - "step": 2170 - }, - { - "epoch": 0.87, - "grad_norm": 2.6545732021331787, - "learning_rate": 5.652000000000001e-05, - "loss": 2.3722, - "step": 2175 - }, - { - "epoch": 0.872, - "grad_norm": 2.1310207843780518, - "learning_rate": 5.6420000000000005e-05, - "loss": 2.0919, - "step": 2180 - }, - { - "epoch": 0.874, - "grad_norm": 1.826372504234314, - "learning_rate": 5.632e-05, - "loss": 1.8353, - "step": 2185 - }, - { - "epoch": 0.876, - "grad_norm": 3.4520180225372314, - "learning_rate": 5.622000000000001e-05, - "loss": 1.8989, - "step": 2190 - }, - { - "epoch": 0.878, - "grad_norm": 3.487771511077881, - "learning_rate": 5.6120000000000005e-05, - "loss": 2.0489, - "step": 2195 - }, - { - "epoch": 0.88, - "grad_norm": 2.4317750930786133, - "learning_rate": 5.602000000000001e-05, - "loss": 1.5238, - "step": 2200 - }, - { - "epoch": 0.882, - "grad_norm": 4.03161096572876, - "learning_rate": 5.592000000000001e-05, - "loss": 2.0312, - "step": 2205 - }, - { - "epoch": 0.884, - "grad_norm": 1.701350450515747, - "learning_rate": 5.5820000000000004e-05, - "loss": 1.6582, - "step": 2210 - }, - { - "epoch": 0.886, - "grad_norm": 2.434293746948242, - "learning_rate": 5.572000000000001e-05, - "loss": 2.1474, - "step": 2215 - }, - { - "epoch": 0.888, - "grad_norm": 2.668346405029297, - "learning_rate": 5.5620000000000006e-05, - "loss": 1.7028, - "step": 2220 - }, - { - "epoch": 0.89, - "grad_norm": 2.782132148742676, - "learning_rate": 5.5520000000000004e-05, - "loss": 1.5188, - "step": 2225 - }, - { - "epoch": 0.892, - "grad_norm": 3.1809840202331543, - "learning_rate": 5.542000000000001e-05, - "loss": 1.5867, - "step": 2230 - }, - { - "epoch": 0.894, - "grad_norm": 3.710517644882202, - "learning_rate": 5.5320000000000006e-05, - "loss": 1.6012, - "step": 2235 - }, - { - "epoch": 0.896, - "grad_norm": 2.689161539077759, - "learning_rate": 5.522e-05, - "loss": 1.6461, - "step": 2240 - }, - { - "epoch": 0.898, - "grad_norm": 3.879901647567749, - "learning_rate": 5.512000000000001e-05, - "loss": 1.8078, - "step": 2245 - }, - { - "epoch": 0.9, - "grad_norm": 3.5880234241485596, - "learning_rate": 5.5020000000000005e-05, - "loss": 1.862, - "step": 2250 - }, - { - "epoch": 0.902, - "grad_norm": 2.162250518798828, - "learning_rate": 5.492e-05, - "loss": 1.7578, - "step": 2255 - }, - { - "epoch": 0.904, - "grad_norm": 2.5121278762817383, - "learning_rate": 5.482000000000001e-05, - "loss": 1.9823, - "step": 2260 - }, - { - "epoch": 0.906, - "grad_norm": 2.9544060230255127, - "learning_rate": 5.4720000000000005e-05, - "loss": 1.6525, - "step": 2265 - }, - { - "epoch": 0.908, - "grad_norm": 3.3571219444274902, - "learning_rate": 5.462e-05, - "loss": 1.5033, - "step": 2270 - }, - { - "epoch": 0.91, - "grad_norm": 2.5898938179016113, - "learning_rate": 5.4520000000000007e-05, - "loss": 1.7722, - "step": 2275 - }, - { - "epoch": 0.912, - "grad_norm": 3.3335447311401367, - "learning_rate": 5.4420000000000004e-05, - "loss": 1.6362, - "step": 2280 - }, - { - "epoch": 0.914, - "grad_norm": 2.584991455078125, - "learning_rate": 5.432e-05, - "loss": 1.4556, - "step": 2285 - }, - { - "epoch": 0.916, - "grad_norm": 2.4838953018188477, - "learning_rate": 5.4220000000000006e-05, - "loss": 1.4268, - "step": 2290 - }, - { - "epoch": 0.918, - "grad_norm": 2.082561492919922, - "learning_rate": 5.4120000000000004e-05, - "loss": 1.6695, - "step": 2295 - }, - { - "epoch": 0.92, - "grad_norm": 3.49015474319458, - "learning_rate": 5.402e-05, - "loss": 1.8325, - "step": 2300 - }, - { - "epoch": 0.922, - "grad_norm": 4.535400867462158, - "learning_rate": 5.3920000000000006e-05, - "loss": 1.7432, - "step": 2305 - }, - { - "epoch": 0.924, - "grad_norm": 1.199286699295044, - "learning_rate": 5.382e-05, - "loss": 2.2751, - "step": 2310 - }, - { - "epoch": 0.926, - "grad_norm": 3.7484588623046875, - "learning_rate": 5.372e-05, - "loss": 2.0561, - "step": 2315 - }, - { - "epoch": 0.928, - "grad_norm": 2.494021415710449, - "learning_rate": 5.3620000000000005e-05, - "loss": 1.7586, - "step": 2320 - }, - { - "epoch": 0.93, - "grad_norm": 1.4161405563354492, - "learning_rate": 5.352e-05, - "loss": 1.8513, - "step": 2325 - }, - { - "epoch": 0.932, - "grad_norm": 3.006577253341675, - "learning_rate": 5.342e-05, - "loss": 1.9067, - "step": 2330 - }, - { - "epoch": 0.934, - "grad_norm": 2.625708818435669, - "learning_rate": 5.3320000000000004e-05, - "loss": 1.4276, - "step": 2335 - }, - { - "epoch": 0.936, - "grad_norm": 2.3370842933654785, - "learning_rate": 5.322e-05, - "loss": 2.1078, - "step": 2340 - }, - { - "epoch": 0.938, - "grad_norm": 2.641144275665283, - "learning_rate": 5.3120000000000006e-05, - "loss": 1.3618, - "step": 2345 - }, - { - "epoch": 0.94, - "grad_norm": 1.7367668151855469, - "learning_rate": 5.3020000000000004e-05, - "loss": 2.147, - "step": 2350 - }, - { - "epoch": 0.942, - "grad_norm": 2.7725813388824463, - "learning_rate": 5.292e-05, - "loss": 1.437, - "step": 2355 - }, - { - "epoch": 0.944, - "grad_norm": 4.516371250152588, - "learning_rate": 5.2820000000000006e-05, - "loss": 1.7548, - "step": 2360 - }, - { - "epoch": 0.946, - "grad_norm": 3.1467254161834717, - "learning_rate": 5.2720000000000003e-05, - "loss": 1.5239, - "step": 2365 - }, - { - "epoch": 0.948, - "grad_norm": 3.392289638519287, - "learning_rate": 5.262e-05, - "loss": 1.6646, - "step": 2370 - }, - { - "epoch": 0.95, - "grad_norm": 2.6524746417999268, - "learning_rate": 5.2520000000000005e-05, - "loss": 1.6977, - "step": 2375 - }, - { - "epoch": 0.952, - "grad_norm": 1.8809561729431152, - "learning_rate": 5.242e-05, - "loss": 1.4686, - "step": 2380 - }, - { - "epoch": 0.954, - "grad_norm": 2.859346866607666, - "learning_rate": 5.232e-05, - "loss": 1.9559, - "step": 2385 - }, - { - "epoch": 0.956, - "grad_norm": 2.9633779525756836, - "learning_rate": 5.2220000000000005e-05, - "loss": 1.907, - "step": 2390 - }, - { - "epoch": 0.958, - "grad_norm": 2.6979637145996094, - "learning_rate": 5.212e-05, - "loss": 1.3605, - "step": 2395 - }, - { - "epoch": 0.96, - "grad_norm": 3.2229700088500977, - "learning_rate": 5.202e-05, - "loss": 1.4891, - "step": 2400 - }, - { - "epoch": 0.962, - "grad_norm": 2.6224522590637207, - "learning_rate": 5.1920000000000004e-05, - "loss": 1.6005, - "step": 2405 - }, - { - "epoch": 0.964, - "grad_norm": 2.480083703994751, - "learning_rate": 5.182e-05, - "loss": 1.596, - "step": 2410 - }, - { - "epoch": 0.966, - "grad_norm": 2.6120476722717285, - "learning_rate": 5.172e-05, - "loss": 2.1357, - "step": 2415 - }, - { - "epoch": 0.968, - "grad_norm": 1.8930892944335938, - "learning_rate": 5.1620000000000004e-05, - "loss": 1.8591, - "step": 2420 - }, - { - "epoch": 0.97, - "grad_norm": 2.999755382537842, - "learning_rate": 5.152e-05, - "loss": 1.46, - "step": 2425 - }, - { - "epoch": 0.972, - "grad_norm": 3.370266914367676, - "learning_rate": 5.142e-05, - "loss": 1.7493, - "step": 2430 - }, - { - "epoch": 0.974, - "grad_norm": 1.9898550510406494, - "learning_rate": 5.132e-05, - "loss": 1.7027, - "step": 2435 - }, - { - "epoch": 0.976, - "grad_norm": 1.545696496963501, - "learning_rate": 5.122e-05, - "loss": 1.6076, - "step": 2440 - }, - { - "epoch": 0.978, - "grad_norm": 2.1743006706237793, - "learning_rate": 5.112e-05, - "loss": 1.6397, - "step": 2445 - }, - { - "epoch": 0.98, - "grad_norm": 3.9286975860595703, - "learning_rate": 5.102e-05, - "loss": 1.9747, - "step": 2450 - }, - { - "epoch": 0.982, - "grad_norm": 3.640699863433838, - "learning_rate": 5.092e-05, - "loss": 2.0213, - "step": 2455 - }, - { - "epoch": 0.984, - "grad_norm": 2.4696404933929443, - "learning_rate": 5.082e-05, - "loss": 1.677, - "step": 2460 - }, - { - "epoch": 0.986, - "grad_norm": 3.111293077468872, - "learning_rate": 5.072e-05, - "loss": 1.9945, - "step": 2465 - }, - { - "epoch": 0.988, - "grad_norm": 2.899752616882324, - "learning_rate": 5.062e-05, - "loss": 1.8826, - "step": 2470 - }, - { - "epoch": 0.99, - "grad_norm": 1.4491517543792725, - "learning_rate": 5.052e-05, - "loss": 1.765, - "step": 2475 - }, - { - "epoch": 0.992, - "grad_norm": 1.7043366432189941, - "learning_rate": 5.042e-05, - "loss": 1.8315, - "step": 2480 - }, - { - "epoch": 0.994, - "grad_norm": 1.644760251045227, - "learning_rate": 5.032e-05, - "loss": 1.7612, - "step": 2485 - }, - { - "epoch": 0.996, - "grad_norm": 2.3809268474578857, - "learning_rate": 5.0220000000000004e-05, - "loss": 1.6422, - "step": 2490 - }, - { - "epoch": 0.998, - "grad_norm": 1.5746747255325317, - "learning_rate": 5.012e-05, - "loss": 1.7717, - "step": 2495 - }, - { - "epoch": 1.0, - "grad_norm": 1.9237031936645508, - "learning_rate": 5.002e-05, - "loss": 1.6089, - "step": 2500 - }, - { - "epoch": 1.002, - "grad_norm": 1.744374394416809, - "learning_rate": 4.992e-05, - "loss": 1.6755, - "step": 2505 - }, - { - "epoch": 1.004, - "grad_norm": 3.569504499435425, - "learning_rate": 4.982e-05, - "loss": 1.7663, - "step": 2510 - }, - { - "epoch": 1.006, - "grad_norm": 2.9554648399353027, - "learning_rate": 4.972e-05, - "loss": 2.0938, - "step": 2515 - }, - { - "epoch": 1.008, - "grad_norm": 5.585300922393799, - "learning_rate": 4.962e-05, - "loss": 1.4909, - "step": 2520 - }, - { - "epoch": 1.01, - "grad_norm": 2.8298325538635254, - "learning_rate": 4.952e-05, - "loss": 1.9067, - "step": 2525 - }, - { - "epoch": 1.012, - "grad_norm": 2.182053327560425, - "learning_rate": 4.942e-05, - "loss": 1.7318, - "step": 2530 - }, - { - "epoch": 1.014, - "grad_norm": 3.053006649017334, - "learning_rate": 4.932e-05, - "loss": 2.0501, - "step": 2535 - }, - { - "epoch": 1.016, - "grad_norm": 1.7813453674316406, - "learning_rate": 4.9220000000000006e-05, - "loss": 1.6234, - "step": 2540 - }, - { - "epoch": 1.018, - "grad_norm": 2.6586945056915283, - "learning_rate": 4.9120000000000004e-05, - "loss": 1.928, - "step": 2545 - }, - { - "epoch": 1.02, - "grad_norm": 2.4950873851776123, - "learning_rate": 4.902e-05, - "loss": 2.0764, - "step": 2550 - }, - { - "epoch": 1.022, - "grad_norm": 4.417685031890869, - "learning_rate": 4.8920000000000006e-05, - "loss": 2.0479, - "step": 2555 - }, - { - "epoch": 1.024, - "grad_norm": 2.933239698410034, - "learning_rate": 4.8820000000000004e-05, - "loss": 1.856, - "step": 2560 - }, - { - "epoch": 1.026, - "grad_norm": 1.873593807220459, - "learning_rate": 4.872000000000001e-05, - "loss": 2.118, - "step": 2565 - }, - { - "epoch": 1.028, - "grad_norm": 3.928393840789795, - "learning_rate": 4.8620000000000005e-05, - "loss": 1.7692, - "step": 2570 - }, - { - "epoch": 1.03, - "grad_norm": 2.2509684562683105, - "learning_rate": 4.852e-05, - "loss": 1.9995, - "step": 2575 - }, - { - "epoch": 1.032, - "grad_norm": 1.9440803527832031, - "learning_rate": 4.842000000000001e-05, - "loss": 1.8849, - "step": 2580 - }, - { - "epoch": 1.034, - "grad_norm": 3.265815019607544, - "learning_rate": 4.8320000000000005e-05, - "loss": 2.0061, - "step": 2585 - }, - { - "epoch": 1.036, - "grad_norm": 2.920482873916626, - "learning_rate": 4.822e-05, - "loss": 1.758, - "step": 2590 - }, - { - "epoch": 1.038, - "grad_norm": 2.102724313735962, - "learning_rate": 4.812000000000001e-05, - "loss": 1.7924, - "step": 2595 - }, - { - "epoch": 1.04, - "grad_norm": 2.5475990772247314, - "learning_rate": 4.8020000000000004e-05, - "loss": 1.7618, - "step": 2600 - }, - { - "epoch": 1.042, - "grad_norm": 2.3291637897491455, - "learning_rate": 4.792e-05, - "loss": 1.5242, - "step": 2605 - }, - { - "epoch": 1.044, - "grad_norm": 4.129797458648682, - "learning_rate": 4.7820000000000006e-05, - "loss": 1.4526, - "step": 2610 - }, - { - "epoch": 1.046, - "grad_norm": 2.763880968093872, - "learning_rate": 4.7720000000000004e-05, - "loss": 1.5913, - "step": 2615 - }, - { - "epoch": 1.048, - "grad_norm": 3.5403544902801514, - "learning_rate": 4.762e-05, - "loss": 1.5221, - "step": 2620 - }, - { - "epoch": 1.05, - "grad_norm": 5.005568027496338, - "learning_rate": 4.7520000000000006e-05, - "loss": 1.5918, - "step": 2625 - }, - { - "epoch": 1.052, - "grad_norm": 1.223071813583374, - "learning_rate": 4.742e-05, - "loss": 2.3062, - "step": 2630 - }, - { - "epoch": 1.054, - "grad_norm": 2.459355115890503, - "learning_rate": 4.732e-05, - "loss": 1.3783, - "step": 2635 - }, - { - "epoch": 1.056, - "grad_norm": 1.4663161039352417, - "learning_rate": 4.7220000000000005e-05, - "loss": 1.7848, - "step": 2640 - }, - { - "epoch": 1.058, - "grad_norm": 3.654409885406494, - "learning_rate": 4.712e-05, - "loss": 1.6248, - "step": 2645 - }, - { - "epoch": 1.06, - "grad_norm": 1.8045170307159424, - "learning_rate": 4.702e-05, - "loss": 1.7528, - "step": 2650 - }, - { - "epoch": 1.062, - "grad_norm": 1.64960515499115, - "learning_rate": 4.6920000000000005e-05, - "loss": 1.471, - "step": 2655 - }, - { - "epoch": 1.064, - "grad_norm": 2.820439577102661, - "learning_rate": 4.682e-05, - "loss": 2.0777, - "step": 2660 - }, - { - "epoch": 1.066, - "grad_norm": 2.804915428161621, - "learning_rate": 4.672e-05, - "loss": 1.512, - "step": 2665 - }, - { - "epoch": 1.068, - "grad_norm": 4.686802387237549, - "learning_rate": 4.6620000000000004e-05, - "loss": 1.5203, - "step": 2670 - }, - { - "epoch": 1.07, - "grad_norm": 2.2953226566314697, - "learning_rate": 4.652e-05, - "loss": 1.6024, - "step": 2675 - }, - { - "epoch": 1.072, - "grad_norm": 1.3032490015029907, - "learning_rate": 4.642e-05, - "loss": 2.141, - "step": 2680 - }, - { - "epoch": 1.074, - "grad_norm": 2.539076328277588, - "learning_rate": 4.6320000000000004e-05, - "loss": 1.4758, - "step": 2685 - }, - { - "epoch": 1.076, - "grad_norm": 2.2188971042633057, - "learning_rate": 4.622e-05, - "loss": 1.2609, - "step": 2690 - }, - { - "epoch": 1.078, - "grad_norm": 1.9796382188796997, - "learning_rate": 4.612e-05, - "loss": 1.5076, - "step": 2695 - }, - { - "epoch": 1.08, - "grad_norm": 1.668433666229248, - "learning_rate": 4.602e-05, - "loss": 1.5317, - "step": 2700 - }, - { - "epoch": 1.082, - "grad_norm": 2.701046943664551, - "learning_rate": 4.592e-05, - "loss": 1.5925, - "step": 2705 - }, - { - "epoch": 1.084, - "grad_norm": 2.438528299331665, - "learning_rate": 4.5820000000000005e-05, - "loss": 2.1409, - "step": 2710 - }, - { - "epoch": 1.086, - "grad_norm": 3.4532997608184814, - "learning_rate": 4.572e-05, - "loss": 2.0558, - "step": 2715 - }, - { - "epoch": 1.088, - "grad_norm": 4.009062767028809, - "learning_rate": 4.562e-05, - "loss": 2.0329, - "step": 2720 - }, - { - "epoch": 1.09, - "grad_norm": 1.6068295240402222, - "learning_rate": 4.5520000000000005e-05, - "loss": 1.9527, - "step": 2725 - }, - { - "epoch": 1.092, - "grad_norm": 1.442806363105774, - "learning_rate": 4.542e-05, - "loss": 2.0331, - "step": 2730 - }, - { - "epoch": 1.094, - "grad_norm": 2.4630472660064697, - "learning_rate": 4.532e-05, - "loss": 1.9652, - "step": 2735 - }, - { - "epoch": 1.096, - "grad_norm": 1.4572904109954834, - "learning_rate": 4.5220000000000004e-05, - "loss": 1.4349, - "step": 2740 - }, - { - "epoch": 1.098, - "grad_norm": 2.3821866512298584, - "learning_rate": 4.512e-05, - "loss": 1.9615, - "step": 2745 - }, - { - "epoch": 1.1, - "grad_norm": 2.3642356395721436, - "learning_rate": 4.502e-05, - "loss": 2.1396, - "step": 2750 - }, - { - "epoch": 1.102, - "grad_norm": 4.464609622955322, - "learning_rate": 4.4920000000000004e-05, - "loss": 2.1359, - "step": 2755 - }, - { - "epoch": 1.104, - "grad_norm": 2.425525665283203, - "learning_rate": 4.482e-05, - "loss": 1.4996, - "step": 2760 - }, - { - "epoch": 1.106, - "grad_norm": 1.7285360097885132, - "learning_rate": 4.472e-05, - "loss": 1.8631, - "step": 2765 - }, - { - "epoch": 1.108, - "grad_norm": 3.584416151046753, - "learning_rate": 4.462e-05, - "loss": 1.834, - "step": 2770 - }, - { - "epoch": 1.11, - "grad_norm": 2.855907678604126, - "learning_rate": 4.452e-05, - "loss": 1.9311, - "step": 2775 - }, - { - "epoch": 1.112, - "grad_norm": 2.8832621574401855, - "learning_rate": 4.442e-05, - "loss": 1.3509, - "step": 2780 - }, - { - "epoch": 1.114, - "grad_norm": 2.4656975269317627, - "learning_rate": 4.432e-05, - "loss": 1.7867, - "step": 2785 - }, - { - "epoch": 1.116, - "grad_norm": 1.789852499961853, - "learning_rate": 4.422e-05, - "loss": 1.6699, - "step": 2790 - }, - { - "epoch": 1.1179999999999999, - "grad_norm": 2.1377182006835938, - "learning_rate": 4.412e-05, - "loss": 1.3266, - "step": 2795 - }, - { - "epoch": 1.12, - "grad_norm": 2.461942672729492, - "learning_rate": 4.402e-05, - "loss": 1.8908, - "step": 2800 - }, - { - "epoch": 1.1219999999999999, - "grad_norm": 2.461036205291748, - "learning_rate": 4.392e-05, - "loss": 1.6533, - "step": 2805 - }, - { - "epoch": 1.124, - "grad_norm": 2.4012019634246826, - "learning_rate": 4.382e-05, - "loss": 1.923, - "step": 2810 - }, - { - "epoch": 1.126, - "grad_norm": 2.6008200645446777, - "learning_rate": 4.372e-05, - "loss": 1.9895, - "step": 2815 - }, - { - "epoch": 1.1280000000000001, - "grad_norm": 1.665590524673462, - "learning_rate": 4.362e-05, - "loss": 1.9219, - "step": 2820 - }, - { - "epoch": 1.13, - "grad_norm": 2.002432346343994, - "learning_rate": 4.352e-05, - "loss": 2.068, - "step": 2825 - }, - { - "epoch": 1.1320000000000001, - "grad_norm": 2.9861936569213867, - "learning_rate": 4.342e-05, - "loss": 1.6668, - "step": 2830 - }, - { - "epoch": 1.134, - "grad_norm": 2.0071372985839844, - "learning_rate": 4.332e-05, - "loss": 1.7266, - "step": 2835 - }, - { - "epoch": 1.1360000000000001, - "grad_norm": 2.5981099605560303, - "learning_rate": 4.3219999999999996e-05, - "loss": 2.1248, - "step": 2840 - }, - { - "epoch": 1.138, - "grad_norm": 2.252606153488159, - "learning_rate": 4.312000000000001e-05, - "loss": 2.2304, - "step": 2845 - }, - { - "epoch": 1.1400000000000001, - "grad_norm": 2.1472439765930176, - "learning_rate": 4.3020000000000005e-05, - "loss": 1.7398, - "step": 2850 - }, - { - "epoch": 1.142, - "grad_norm": 2.2459940910339355, - "learning_rate": 4.292e-05, - "loss": 1.503, - "step": 2855 - }, - { - "epoch": 1.144, - "grad_norm": 2.640773296356201, - "learning_rate": 4.282000000000001e-05, - "loss": 1.8056, - "step": 2860 - }, - { - "epoch": 1.146, - "grad_norm": 2.06899356842041, - "learning_rate": 4.2720000000000004e-05, - "loss": 1.3898, - "step": 2865 - }, - { - "epoch": 1.148, - "grad_norm": 1.788794994354248, - "learning_rate": 4.262e-05, - "loss": 1.5943, - "step": 2870 - }, - { - "epoch": 1.15, - "grad_norm": 2.640416383743286, - "learning_rate": 4.2520000000000006e-05, - "loss": 1.8443, - "step": 2875 - }, - { - "epoch": 1.152, - "grad_norm": 2.1008048057556152, - "learning_rate": 4.2420000000000004e-05, - "loss": 1.745, - "step": 2880 - }, - { - "epoch": 1.154, - "grad_norm": 2.0371053218841553, - "learning_rate": 4.232e-05, - "loss": 1.8983, - "step": 2885 - }, - { - "epoch": 1.156, - "grad_norm": 2.6234545707702637, - "learning_rate": 4.2220000000000006e-05, - "loss": 1.531, - "step": 2890 - }, - { - "epoch": 1.158, - "grad_norm": 2.050816535949707, - "learning_rate": 4.212e-05, - "loss": 1.4937, - "step": 2895 - }, - { - "epoch": 1.16, - "grad_norm": 2.1251163482666016, - "learning_rate": 4.202e-05, - "loss": 1.8614, - "step": 2900 - }, - { - "epoch": 1.162, - "grad_norm": 5.432298183441162, - "learning_rate": 4.1920000000000005e-05, - "loss": 1.6035, - "step": 2905 - }, - { - "epoch": 1.164, - "grad_norm": 1.9369288682937622, - "learning_rate": 4.182e-05, - "loss": 2.1265, - "step": 2910 - }, - { - "epoch": 1.166, - "grad_norm": 1.594335675239563, - "learning_rate": 4.172e-05, - "loss": 1.6807, - "step": 2915 - }, - { - "epoch": 1.168, - "grad_norm": 3.008535861968994, - "learning_rate": 4.1620000000000005e-05, - "loss": 1.5361, - "step": 2920 - }, - { - "epoch": 1.17, - "grad_norm": 2.3314995765686035, - "learning_rate": 4.152e-05, - "loss": 1.5224, - "step": 2925 - }, - { - "epoch": 1.172, - "grad_norm": 1.2118889093399048, - "learning_rate": 4.142000000000001e-05, - "loss": 1.8489, - "step": 2930 - }, - { - "epoch": 1.174, - "grad_norm": 3.8027493953704834, - "learning_rate": 4.1320000000000004e-05, - "loss": 1.7373, - "step": 2935 - }, - { - "epoch": 1.176, - "grad_norm": 2.39216685295105, - "learning_rate": 4.122e-05, - "loss": 1.7364, - "step": 2940 - }, - { - "epoch": 1.178, - "grad_norm": 2.981109142303467, - "learning_rate": 4.1120000000000006e-05, - "loss": 1.605, - "step": 2945 - }, - { - "epoch": 1.18, - "grad_norm": 2.1557092666625977, - "learning_rate": 4.1020000000000004e-05, - "loss": 1.5599, - "step": 2950 - }, - { - "epoch": 1.182, - "grad_norm": 1.6797981262207031, - "learning_rate": 4.092e-05, - "loss": 1.7236, - "step": 2955 - }, - { - "epoch": 1.184, - "grad_norm": 1.842079997062683, - "learning_rate": 4.0820000000000006e-05, - "loss": 1.7207, - "step": 2960 - }, - { - "epoch": 1.186, - "grad_norm": 1.6276612281799316, - "learning_rate": 4.072e-05, - "loss": 1.7635, - "step": 2965 - }, - { - "epoch": 1.188, - "grad_norm": 2.686825752258301, - "learning_rate": 4.062e-05, - "loss": 1.7592, - "step": 2970 - }, - { - "epoch": 1.19, - "grad_norm": 1.549454689025879, - "learning_rate": 4.0520000000000005e-05, - "loss": 1.6635, - "step": 2975 - }, - { - "epoch": 1.192, - "grad_norm": 1.4507205486297607, - "learning_rate": 4.042e-05, - "loss": 1.769, - "step": 2980 - }, - { - "epoch": 1.194, - "grad_norm": 2.5181939601898193, - "learning_rate": 4.032e-05, - "loss": 1.6546, - "step": 2985 - }, - { - "epoch": 1.196, - "grad_norm": 1.9741522073745728, - "learning_rate": 4.0220000000000005e-05, - "loss": 2.1699, - "step": 2990 - }, - { - "epoch": 1.198, - "grad_norm": 3.139752149581909, - "learning_rate": 4.012e-05, - "loss": 1.533, - "step": 2995 - }, - { - "epoch": 1.2, - "grad_norm": 1.711753487586975, - "learning_rate": 4.002e-05, - "loss": 1.5864, - "step": 3000 - }, - { - "epoch": 1.202, - "grad_norm": 3.6022934913635254, - "learning_rate": 3.9920000000000004e-05, - "loss": 1.8351, - "step": 3005 - }, - { - "epoch": 1.204, - "grad_norm": 1.8488565683364868, - "learning_rate": 3.982e-05, - "loss": 1.7007, - "step": 3010 - }, - { - "epoch": 1.206, - "grad_norm": 2.834606409072876, - "learning_rate": 3.972e-05, - "loss": 1.56, - "step": 3015 - }, - { - "epoch": 1.208, - "grad_norm": 3.387948751449585, - "learning_rate": 3.9620000000000004e-05, - "loss": 1.9211, - "step": 3020 - }, - { - "epoch": 1.21, - "grad_norm": 1.9332804679870605, - "learning_rate": 3.952e-05, - "loss": 1.5022, - "step": 3025 - }, - { - "epoch": 1.212, - "grad_norm": 1.5052354335784912, - "learning_rate": 3.942e-05, - "loss": 1.4375, - "step": 3030 - }, - { - "epoch": 1.214, - "grad_norm": 2.833865165710449, - "learning_rate": 3.932e-05, - "loss": 1.8459, - "step": 3035 - }, - { - "epoch": 1.216, - "grad_norm": 3.088143825531006, - "learning_rate": 3.922e-05, - "loss": 1.286, - "step": 3040 - }, - { - "epoch": 1.218, - "grad_norm": 1.8268078565597534, - "learning_rate": 3.912e-05, - "loss": 1.6514, - "step": 3045 - }, - { - "epoch": 1.22, - "grad_norm": 3.714958906173706, - "learning_rate": 3.902e-05, - "loss": 1.9956, - "step": 3050 - }, - { - "epoch": 1.222, - "grad_norm": 1.789611577987671, - "learning_rate": 3.892e-05, - "loss": 1.8499, - "step": 3055 - }, - { - "epoch": 1.224, - "grad_norm": 6.017509937286377, - "learning_rate": 3.882e-05, - "loss": 1.3559, - "step": 3060 - }, - { - "epoch": 1.226, - "grad_norm": 3.293497323989868, - "learning_rate": 3.872e-05, - "loss": 1.6115, - "step": 3065 - }, - { - "epoch": 1.228, - "grad_norm": 2.0134570598602295, - "learning_rate": 3.862e-05, - "loss": 1.601, - "step": 3070 - }, - { - "epoch": 1.23, - "grad_norm": 2.3811817169189453, - "learning_rate": 3.8520000000000004e-05, - "loss": 1.8027, - "step": 3075 - }, - { - "epoch": 1.232, - "grad_norm": 2.415764570236206, - "learning_rate": 3.842e-05, - "loss": 1.796, - "step": 3080 - }, - { - "epoch": 1.234, - "grad_norm": 4.111828804016113, - "learning_rate": 3.832e-05, - "loss": 1.5326, - "step": 3085 - }, - { - "epoch": 1.236, - "grad_norm": 2.0376603603363037, - "learning_rate": 3.822e-05, - "loss": 1.887, - "step": 3090 - }, - { - "epoch": 1.238, - "grad_norm": 2.478423833847046, - "learning_rate": 3.812e-05, - "loss": 1.6035, - "step": 3095 - }, - { - "epoch": 1.24, - "grad_norm": 2.4536828994750977, - "learning_rate": 3.802e-05, - "loss": 1.7419, - "step": 3100 - }, - { - "epoch": 1.242, - "grad_norm": 3.351297378540039, - "learning_rate": 3.792e-05, - "loss": 1.5085, - "step": 3105 - }, - { - "epoch": 1.244, - "grad_norm": 3.1790854930877686, - "learning_rate": 3.782e-05, - "loss": 2.0231, - "step": 3110 - }, - { - "epoch": 1.246, - "grad_norm": 3.5371744632720947, - "learning_rate": 3.772e-05, - "loss": 1.536, - "step": 3115 - }, - { - "epoch": 1.248, - "grad_norm": 2.3733763694763184, - "learning_rate": 3.762e-05, - "loss": 1.8053, - "step": 3120 - }, - { - "epoch": 1.25, - "grad_norm": 3.0253279209136963, - "learning_rate": 3.752e-05, - "loss": 1.6949, - "step": 3125 - }, - { - "epoch": 1.252, - "grad_norm": 3.237069845199585, - "learning_rate": 3.742e-05, - "loss": 2.0501, - "step": 3130 - }, - { - "epoch": 1.254, - "grad_norm": 2.2193691730499268, - "learning_rate": 3.732e-05, - "loss": 1.9702, - "step": 3135 - }, - { - "epoch": 1.256, - "grad_norm": 1.867637038230896, - "learning_rate": 3.722e-05, - "loss": 1.4139, - "step": 3140 - }, - { - "epoch": 1.258, - "grad_norm": 1.434982180595398, - "learning_rate": 3.712e-05, - "loss": 2.0195, - "step": 3145 - }, - { - "epoch": 1.26, - "grad_norm": 3.4245541095733643, - "learning_rate": 3.702e-05, - "loss": 1.3227, - "step": 3150 - }, - { - "epoch": 1.262, - "grad_norm": 2.501880168914795, - "learning_rate": 3.692e-05, - "loss": 1.8618, - "step": 3155 - }, - { - "epoch": 1.264, - "grad_norm": 2.0601398944854736, - "learning_rate": 3.682e-05, - "loss": 1.5895, - "step": 3160 - }, - { - "epoch": 1.266, - "grad_norm": 1.748300313949585, - "learning_rate": 3.672000000000001e-05, - "loss": 1.9179, - "step": 3165 - }, - { - "epoch": 1.268, - "grad_norm": 3.1191680431365967, - "learning_rate": 3.6620000000000005e-05, - "loss": 1.6142, - "step": 3170 - }, - { - "epoch": 1.27, - "grad_norm": 3.330815076828003, - "learning_rate": 3.652e-05, - "loss": 1.3749, - "step": 3175 - }, - { - "epoch": 1.272, - "grad_norm": 2.0404210090637207, - "learning_rate": 3.642000000000001e-05, - "loss": 1.6633, - "step": 3180 - }, - { - "epoch": 1.274, - "grad_norm": 3.1883702278137207, - "learning_rate": 3.6320000000000005e-05, - "loss": 1.6506, - "step": 3185 - }, - { - "epoch": 1.276, - "grad_norm": 2.241130828857422, - "learning_rate": 3.622e-05, - "loss": 1.5869, - "step": 3190 - }, - { - "epoch": 1.278, - "grad_norm": 2.0807995796203613, - "learning_rate": 3.6120000000000007e-05, - "loss": 1.5948, - "step": 3195 - }, - { - "epoch": 1.28, - "grad_norm": 3.549083709716797, - "learning_rate": 3.6020000000000004e-05, - "loss": 1.3417, - "step": 3200 - }, - { - "epoch": 1.282, - "grad_norm": 3.833611011505127, - "learning_rate": 3.592e-05, - "loss": 1.9591, - "step": 3205 - }, - { - "epoch": 1.284, - "grad_norm": 3.573197364807129, - "learning_rate": 3.5820000000000006e-05, - "loss": 1.5512, - "step": 3210 - }, - { - "epoch": 1.286, - "grad_norm": 2.6214630603790283, - "learning_rate": 3.5720000000000004e-05, - "loss": 1.6256, - "step": 3215 - }, - { - "epoch": 1.288, - "grad_norm": 3.9147443771362305, - "learning_rate": 3.562e-05, - "loss": 1.9456, - "step": 3220 - }, - { - "epoch": 1.29, - "grad_norm": 1.9649760723114014, - "learning_rate": 3.5520000000000006e-05, - "loss": 1.7262, - "step": 3225 - }, - { - "epoch": 1.292, - "grad_norm": 1.7031080722808838, - "learning_rate": 3.542e-05, - "loss": 1.4894, - "step": 3230 - }, - { - "epoch": 1.294, - "grad_norm": 2.9356372356414795, - "learning_rate": 3.532e-05, - "loss": 1.7021, - "step": 3235 - }, - { - "epoch": 1.296, - "grad_norm": 1.2076226472854614, - "learning_rate": 3.5220000000000005e-05, - "loss": 1.9649, - "step": 3240 - }, - { - "epoch": 1.298, - "grad_norm": 1.673506736755371, - "learning_rate": 3.512e-05, - "loss": 1.7253, - "step": 3245 - }, - { - "epoch": 1.3, - "grad_norm": 2.7352523803710938, - "learning_rate": 3.502e-05, - "loss": 1.3556, - "step": 3250 - }, - { - "epoch": 1.302, - "grad_norm": 3.243248224258423, - "learning_rate": 3.4920000000000004e-05, - "loss": 2.0573, - "step": 3255 - }, - { - "epoch": 1.304, - "grad_norm": 2.341780424118042, - "learning_rate": 3.482e-05, - "loss": 1.5711, - "step": 3260 - }, - { - "epoch": 1.306, - "grad_norm": 1.6308125257492065, - "learning_rate": 3.472e-05, - "loss": 1.3108, - "step": 3265 - }, - { - "epoch": 1.308, - "grad_norm": 3.9599924087524414, - "learning_rate": 3.4620000000000004e-05, - "loss": 2.3412, - "step": 3270 - }, - { - "epoch": 1.31, - "grad_norm": 1.9203848838806152, - "learning_rate": 3.452e-05, - "loss": 1.809, - "step": 3275 - }, - { - "epoch": 1.312, - "grad_norm": 2.534126043319702, - "learning_rate": 3.442e-05, - "loss": 1.8763, - "step": 3280 - }, - { - "epoch": 1.314, - "grad_norm": 1.4974746704101562, - "learning_rate": 3.4320000000000003e-05, - "loss": 1.7825, - "step": 3285 - }, - { - "epoch": 1.316, - "grad_norm": 2.634657859802246, - "learning_rate": 3.422e-05, - "loss": 1.4188, - "step": 3290 - }, - { - "epoch": 1.318, - "grad_norm": 2.7322049140930176, - "learning_rate": 3.412e-05, - "loss": 1.3707, - "step": 3295 - }, - { - "epoch": 1.32, - "grad_norm": 2.851452350616455, - "learning_rate": 3.402e-05, - "loss": 2.0354, - "step": 3300 - }, - { - "epoch": 1.322, - "grad_norm": 1.9547468423843384, - "learning_rate": 3.392e-05, - "loss": 1.9157, - "step": 3305 - }, - { - "epoch": 1.324, - "grad_norm": 2.0569794178009033, - "learning_rate": 3.3820000000000005e-05, - "loss": 1.7756, - "step": 3310 - }, - { - "epoch": 1.326, - "grad_norm": 2.3757388591766357, - "learning_rate": 3.372e-05, - "loss": 1.4369, - "step": 3315 - }, - { - "epoch": 1.328, - "grad_norm": 1.42741858959198, - "learning_rate": 3.362e-05, - "loss": 1.7811, - "step": 3320 - }, - { - "epoch": 1.33, - "grad_norm": 2.4347784519195557, - "learning_rate": 3.3520000000000004e-05, - "loss": 1.6027, - "step": 3325 - }, - { - "epoch": 1.332, - "grad_norm": 2.2193026542663574, - "learning_rate": 3.342e-05, - "loss": 1.2162, - "step": 3330 - }, - { - "epoch": 1.334, - "grad_norm": 2.4380321502685547, - "learning_rate": 3.332e-05, - "loss": 1.827, - "step": 3335 - }, - { - "epoch": 1.336, - "grad_norm": 1.8248051404953003, - "learning_rate": 3.3220000000000004e-05, - "loss": 1.8291, - "step": 3340 - }, - { - "epoch": 1.338, - "grad_norm": 2.8554930686950684, - "learning_rate": 3.312e-05, - "loss": 1.7322, - "step": 3345 - }, - { - "epoch": 1.34, - "grad_norm": 2.3782663345336914, - "learning_rate": 3.302e-05, - "loss": 1.765, - "step": 3350 - }, - { - "epoch": 1.342, - "grad_norm": 3.31634521484375, - "learning_rate": 3.292e-05, - "loss": 2.2507, - "step": 3355 - }, - { - "epoch": 1.3439999999999999, - "grad_norm": 2.7563440799713135, - "learning_rate": 3.282e-05, - "loss": 1.7399, - "step": 3360 - }, - { - "epoch": 1.346, - "grad_norm": 2.4215128421783447, - "learning_rate": 3.272e-05, - "loss": 1.5532, - "step": 3365 - }, - { - "epoch": 1.3479999999999999, - "grad_norm": 4.562981128692627, - "learning_rate": 3.262e-05, - "loss": 2.2052, - "step": 3370 - }, - { - "epoch": 1.35, - "grad_norm": 4.050610542297363, - "learning_rate": 3.252e-05, - "loss": 1.7591, - "step": 3375 - }, - { - "epoch": 1.3519999999999999, - "grad_norm": 1.5045493841171265, - "learning_rate": 3.242e-05, - "loss": 1.6352, - "step": 3380 - }, - { - "epoch": 1.354, - "grad_norm": 3.1320605278015137, - "learning_rate": 3.232e-05, - "loss": 2.1547, - "step": 3385 - }, - { - "epoch": 1.3559999999999999, - "grad_norm": 2.063000440597534, - "learning_rate": 3.222e-05, - "loss": 1.6313, - "step": 3390 - }, - { - "epoch": 1.358, - "grad_norm": 2.053938627243042, - "learning_rate": 3.212e-05, - "loss": 1.7112, - "step": 3395 - }, - { - "epoch": 1.3599999999999999, - "grad_norm": 2.418640613555908, - "learning_rate": 3.202e-05, - "loss": 1.4625, - "step": 3400 - }, - { - "epoch": 1.362, - "grad_norm": 2.6165764331817627, - "learning_rate": 3.192e-05, - "loss": 1.5001, - "step": 3405 - }, - { - "epoch": 1.3639999999999999, - "grad_norm": 2.136032819747925, - "learning_rate": 3.182e-05, - "loss": 1.4127, - "step": 3410 - }, - { - "epoch": 1.366, - "grad_norm": 2.4939239025115967, - "learning_rate": 3.172e-05, - "loss": 1.7215, - "step": 3415 - }, - { - "epoch": 1.3679999999999999, - "grad_norm": 1.5277928113937378, - "learning_rate": 3.162e-05, - "loss": 1.9664, - "step": 3420 - }, - { - "epoch": 1.37, - "grad_norm": 1.9791117906570435, - "learning_rate": 3.1519999999999996e-05, - "loss": 2.0302, - "step": 3425 - }, - { - "epoch": 1.3719999999999999, - "grad_norm": 1.7643859386444092, - "learning_rate": 3.142e-05, - "loss": 1.3785, - "step": 3430 - }, - { - "epoch": 1.374, - "grad_norm": 2.171893835067749, - "learning_rate": 3.132e-05, - "loss": 2.0154, - "step": 3435 - }, - { - "epoch": 1.376, - "grad_norm": 3.0752100944519043, - "learning_rate": 3.122e-05, - "loss": 1.698, - "step": 3440 - }, - { - "epoch": 1.3780000000000001, - "grad_norm": 1.4980363845825195, - "learning_rate": 3.112e-05, - "loss": 1.8132, - "step": 3445 - }, - { - "epoch": 1.38, - "grad_norm": 2.5946907997131348, - "learning_rate": 3.102e-05, - "loss": 1.6172, - "step": 3450 - }, - { - "epoch": 1.3820000000000001, - "grad_norm": 1.66837739944458, - "learning_rate": 3.092e-05, - "loss": 1.5751, - "step": 3455 - }, - { - "epoch": 1.384, - "grad_norm": 2.9651939868927, - "learning_rate": 3.082e-05, - "loss": 1.635, - "step": 3460 - }, - { - "epoch": 1.3860000000000001, - "grad_norm": 3.3698246479034424, - "learning_rate": 3.072e-05, - "loss": 1.8372, - "step": 3465 - }, - { - "epoch": 1.388, - "grad_norm": 1.5941882133483887, - "learning_rate": 3.062e-05, - "loss": 1.2569, - "step": 3470 - }, - { - "epoch": 1.3900000000000001, - "grad_norm": 1.779807448387146, - "learning_rate": 3.0520000000000006e-05, - "loss": 1.9385, - "step": 3475 - }, - { - "epoch": 1.392, - "grad_norm": 1.706462025642395, - "learning_rate": 3.0420000000000004e-05, - "loss": 1.774, - "step": 3480 - }, - { - "epoch": 1.3940000000000001, - "grad_norm": 3.015301465988159, - "learning_rate": 3.0320000000000004e-05, - "loss": 1.8036, - "step": 3485 - }, - { - "epoch": 1.396, - "grad_norm": 1.6445960998535156, - "learning_rate": 3.0220000000000005e-05, - "loss": 1.7795, - "step": 3490 - }, - { - "epoch": 1.3980000000000001, - "grad_norm": 2.8763880729675293, - "learning_rate": 3.0120000000000003e-05, - "loss": 1.5474, - "step": 3495 - }, - { - "epoch": 1.4, - "grad_norm": 2.344465732574463, - "learning_rate": 3.0020000000000004e-05, - "loss": 1.522, - "step": 3500 - }, - { - "epoch": 1.4020000000000001, - "grad_norm": 1.5552650690078735, - "learning_rate": 2.9920000000000005e-05, - "loss": 1.793, - "step": 3505 - }, - { - "epoch": 1.404, - "grad_norm": 2.935708999633789, - "learning_rate": 2.9820000000000002e-05, - "loss": 1.6886, - "step": 3510 - }, - { - "epoch": 1.4060000000000001, - "grad_norm": 2.3221919536590576, - "learning_rate": 2.9720000000000003e-05, - "loss": 1.7718, - "step": 3515 - }, - { - "epoch": 1.408, - "grad_norm": 3.25675106048584, - "learning_rate": 2.9620000000000004e-05, - "loss": 1.8605, - "step": 3520 - }, - { - "epoch": 1.41, - "grad_norm": 2.037717580795288, - "learning_rate": 2.9520000000000002e-05, - "loss": 1.7505, - "step": 3525 - }, - { - "epoch": 1.412, - "grad_norm": 1.540372610092163, - "learning_rate": 2.9420000000000003e-05, - "loss": 1.9402, - "step": 3530 - }, - { - "epoch": 1.414, - "grad_norm": 4.895864963531494, - "learning_rate": 2.9320000000000004e-05, - "loss": 1.6301, - "step": 3535 - }, - { - "epoch": 1.416, - "grad_norm": 3.14306640625, - "learning_rate": 2.922e-05, - "loss": 1.745, - "step": 3540 - }, - { - "epoch": 1.418, - "grad_norm": 3.124401092529297, - "learning_rate": 2.9120000000000002e-05, - "loss": 1.524, - "step": 3545 - }, - { - "epoch": 1.42, - "grad_norm": 2.047973394393921, - "learning_rate": 2.9020000000000003e-05, - "loss": 1.5372, - "step": 3550 - }, - { - "epoch": 1.422, - "grad_norm": 3.139409303665161, - "learning_rate": 2.8920000000000004e-05, - "loss": 1.5339, - "step": 3555 - }, - { - "epoch": 1.424, - "grad_norm": 1.6611011028289795, - "learning_rate": 2.8820000000000002e-05, - "loss": 1.6589, - "step": 3560 - }, - { - "epoch": 1.426, - "grad_norm": 3.551090955734253, - "learning_rate": 2.8720000000000003e-05, - "loss": 1.8029, - "step": 3565 - }, - { - "epoch": 1.428, - "grad_norm": 2.0900986194610596, - "learning_rate": 2.8620000000000004e-05, - "loss": 1.7191, - "step": 3570 - }, - { - "epoch": 1.43, - "grad_norm": 2.1817214488983154, - "learning_rate": 2.852e-05, - "loss": 1.7378, - "step": 3575 - }, - { - "epoch": 1.432, - "grad_norm": 2.5889766216278076, - "learning_rate": 2.8420000000000002e-05, - "loss": 1.974, - "step": 3580 - }, - { - "epoch": 1.434, - "grad_norm": 2.858646869659424, - "learning_rate": 2.8320000000000003e-05, - "loss": 1.7808, - "step": 3585 - }, - { - "epoch": 1.436, - "grad_norm": 1.6242611408233643, - "learning_rate": 2.822e-05, - "loss": 1.9002, - "step": 3590 - }, - { - "epoch": 1.438, - "grad_norm": 1.697709560394287, - "learning_rate": 2.8120000000000002e-05, - "loss": 1.7337, - "step": 3595 - }, - { - "epoch": 1.44, - "grad_norm": 3.6512959003448486, - "learning_rate": 2.8020000000000003e-05, - "loss": 1.9432, - "step": 3600 - }, - { - "epoch": 1.442, - "grad_norm": 1.8855968713760376, - "learning_rate": 2.792e-05, - "loss": 1.932, - "step": 3605 - }, - { - "epoch": 1.444, - "grad_norm": 2.7723069190979004, - "learning_rate": 2.782e-05, - "loss": 1.9459, - "step": 3610 - }, - { - "epoch": 1.446, - "grad_norm": 3.542825698852539, - "learning_rate": 2.7720000000000002e-05, - "loss": 1.1829, - "step": 3615 - }, - { - "epoch": 1.448, - "grad_norm": 3.098975419998169, - "learning_rate": 2.762e-05, - "loss": 1.5365, - "step": 3620 - }, - { - "epoch": 1.45, - "grad_norm": 1.659569501876831, - "learning_rate": 2.752e-05, - "loss": 1.6387, - "step": 3625 - }, - { - "epoch": 1.452, - "grad_norm": 2.3048620223999023, - "learning_rate": 2.7420000000000002e-05, - "loss": 1.3838, - "step": 3630 - }, - { - "epoch": 1.454, - "grad_norm": 3.800442934036255, - "learning_rate": 2.7320000000000003e-05, - "loss": 1.7217, - "step": 3635 - }, - { - "epoch": 1.456, - "grad_norm": 3.683425188064575, - "learning_rate": 2.722e-05, - "loss": 1.687, - "step": 3640 - }, - { - "epoch": 1.458, - "grad_norm": 1.6311277151107788, - "learning_rate": 2.712e-05, - "loss": 1.4124, - "step": 3645 - }, - { - "epoch": 1.46, - "grad_norm": 2.518873453140259, - "learning_rate": 2.7020000000000002e-05, - "loss": 2.0765, - "step": 3650 - }, - { - "epoch": 1.462, - "grad_norm": 1.3586816787719727, - "learning_rate": 2.692e-05, - "loss": 1.609, - "step": 3655 - }, - { - "epoch": 1.464, - "grad_norm": 2.318878650665283, - "learning_rate": 2.682e-05, - "loss": 1.6128, - "step": 3660 - }, - { - "epoch": 1.466, - "grad_norm": 2.112661361694336, - "learning_rate": 2.672e-05, - "loss": 1.4784, - "step": 3665 - }, - { - "epoch": 1.468, - "grad_norm": 1.8799687623977661, - "learning_rate": 2.662e-05, - "loss": 1.7146, - "step": 3670 - }, - { - "epoch": 1.47, - "grad_norm": 1.65269136428833, - "learning_rate": 2.652e-05, - "loss": 1.7689, - "step": 3675 - }, - { - "epoch": 1.472, - "grad_norm": 3.373119354248047, - "learning_rate": 2.642e-05, - "loss": 2.1181, - "step": 3680 - }, - { - "epoch": 1.474, - "grad_norm": 1.9318503141403198, - "learning_rate": 2.632e-05, - "loss": 1.6434, - "step": 3685 - }, - { - "epoch": 1.476, - "grad_norm": 3.8110852241516113, - "learning_rate": 2.622e-05, - "loss": 1.7684, - "step": 3690 - }, - { - "epoch": 1.478, - "grad_norm": 2.4162673950195312, - "learning_rate": 2.612e-05, - "loss": 2.0078, - "step": 3695 - }, - { - "epoch": 1.48, - "grad_norm": 1.7543604373931885, - "learning_rate": 2.602e-05, - "loss": 1.463, - "step": 3700 - }, - { - "epoch": 1.482, - "grad_norm": 1.7655296325683594, - "learning_rate": 2.592e-05, - "loss": 1.8911, - "step": 3705 - }, - { - "epoch": 1.484, - "grad_norm": 1.8368079662322998, - "learning_rate": 2.582e-05, - "loss": 1.7181, - "step": 3710 - }, - { - "epoch": 1.486, - "grad_norm": 2.1669936180114746, - "learning_rate": 2.572e-05, - "loss": 1.1812, - "step": 3715 - }, - { - "epoch": 1.488, - "grad_norm": 2.718224048614502, - "learning_rate": 2.562e-05, - "loss": 1.505, - "step": 3720 - }, - { - "epoch": 1.49, - "grad_norm": 2.1724534034729004, - "learning_rate": 2.552e-05, - "loss": 1.5774, - "step": 3725 - }, - { - "epoch": 1.492, - "grad_norm": 3.7047829627990723, - "learning_rate": 2.542e-05, - "loss": 1.7732, - "step": 3730 - }, - { - "epoch": 1.494, - "grad_norm": 2.8415119647979736, - "learning_rate": 2.5319999999999998e-05, - "loss": 2.0226, - "step": 3735 - }, - { - "epoch": 1.496, - "grad_norm": 2.6273069381713867, - "learning_rate": 2.522e-05, - "loss": 1.8373, - "step": 3740 - }, - { - "epoch": 1.498, - "grad_norm": 3.203504800796509, - "learning_rate": 2.512e-05, - "loss": 1.7035, - "step": 3745 - }, - { - "epoch": 1.5, - "grad_norm": 1.9744776487350464, - "learning_rate": 2.5019999999999998e-05, - "loss": 1.2184, - "step": 3750 - }, - { - "epoch": 1.502, - "grad_norm": 2.1510095596313477, - "learning_rate": 2.4920000000000002e-05, - "loss": 1.9375, - "step": 3755 - }, - { - "epoch": 1.504, - "grad_norm": 1.908165693283081, - "learning_rate": 2.4820000000000003e-05, - "loss": 1.2754, - "step": 3760 - }, - { - "epoch": 1.506, - "grad_norm": 3.470750570297241, - "learning_rate": 2.472e-05, - "loss": 1.6948, - "step": 3765 - }, - { - "epoch": 1.508, - "grad_norm": 1.6852209568023682, - "learning_rate": 2.462e-05, - "loss": 1.5546, - "step": 3770 - }, - { - "epoch": 1.51, - "grad_norm": 2.090461254119873, - "learning_rate": 2.4520000000000002e-05, - "loss": 2.1736, - "step": 3775 - }, - { - "epoch": 1.512, - "grad_norm": 1.2094796895980835, - "learning_rate": 2.442e-05, - "loss": 1.8343, - "step": 3780 - }, - { - "epoch": 1.514, - "grad_norm": 1.9683138132095337, - "learning_rate": 2.432e-05, - "loss": 1.8438, - "step": 3785 - }, - { - "epoch": 1.516, - "grad_norm": 2.4170594215393066, - "learning_rate": 2.4220000000000002e-05, - "loss": 1.686, - "step": 3790 - }, - { - "epoch": 1.518, - "grad_norm": 3.0025670528411865, - "learning_rate": 2.412e-05, - "loss": 1.8884, - "step": 3795 - }, - { - "epoch": 1.52, - "grad_norm": 3.286987543106079, - "learning_rate": 2.402e-05, - "loss": 1.7867, - "step": 3800 - }, - { - "epoch": 1.522, - "grad_norm": 3.0857369899749756, - "learning_rate": 2.392e-05, - "loss": 1.8645, - "step": 3805 - }, - { - "epoch": 1.524, - "grad_norm": 3.057649850845337, - "learning_rate": 2.3820000000000002e-05, - "loss": 1.4398, - "step": 3810 - }, - { - "epoch": 1.526, - "grad_norm": 1.4437475204467773, - "learning_rate": 2.372e-05, - "loss": 1.2692, - "step": 3815 - }, - { - "epoch": 1.528, - "grad_norm": 2.2691140174865723, - "learning_rate": 2.362e-05, - "loss": 1.5942, - "step": 3820 - }, - { - "epoch": 1.53, - "grad_norm": 1.6148360967636108, - "learning_rate": 2.3520000000000002e-05, - "loss": 1.9141, - "step": 3825 - }, - { - "epoch": 1.532, - "grad_norm": 1.8552954196929932, - "learning_rate": 2.342e-05, - "loss": 1.6889, - "step": 3830 - }, - { - "epoch": 1.534, - "grad_norm": 1.9940555095672607, - "learning_rate": 2.332e-05, - "loss": 2.0081, - "step": 3835 - }, - { - "epoch": 1.536, - "grad_norm": 2.539701223373413, - "learning_rate": 2.322e-05, - "loss": 1.756, - "step": 3840 - }, - { - "epoch": 1.538, - "grad_norm": 1.4828253984451294, - "learning_rate": 2.312e-05, - "loss": 1.8582, - "step": 3845 - }, - { - "epoch": 1.54, - "grad_norm": 1.9143613576889038, - "learning_rate": 2.302e-05, - "loss": 1.9463, - "step": 3850 - }, - { - "epoch": 1.542, - "grad_norm": 4.518005847930908, - "learning_rate": 2.292e-05, - "loss": 1.7049, - "step": 3855 - }, - { - "epoch": 1.544, - "grad_norm": 1.5553964376449585, - "learning_rate": 2.282e-05, - "loss": 1.4211, - "step": 3860 - }, - { - "epoch": 1.546, - "grad_norm": 3.2309939861297607, - "learning_rate": 2.2720000000000003e-05, - "loss": 1.6491, - "step": 3865 - }, - { - "epoch": 1.548, - "grad_norm": 4.333251953125, - "learning_rate": 2.2620000000000004e-05, - "loss": 1.7037, - "step": 3870 - }, - { - "epoch": 1.55, - "grad_norm": 1.44475519657135, - "learning_rate": 2.252e-05, - "loss": 2.0214, - "step": 3875 - }, - { - "epoch": 1.552, - "grad_norm": 3.0167627334594727, - "learning_rate": 2.2420000000000002e-05, - "loss": 1.9037, - "step": 3880 - }, - { - "epoch": 1.554, - "grad_norm": 1.7566245794296265, - "learning_rate": 2.2320000000000003e-05, - "loss": 1.5334, - "step": 3885 - }, - { - "epoch": 1.556, - "grad_norm": 1.9002455472946167, - "learning_rate": 2.222e-05, - "loss": 1.437, - "step": 3890 - }, - { - "epoch": 1.558, - "grad_norm": 2.522775888442993, - "learning_rate": 2.212e-05, - "loss": 1.6923, - "step": 3895 - }, - { - "epoch": 1.56, - "grad_norm": 3.733776569366455, - "learning_rate": 2.2020000000000003e-05, - "loss": 1.563, - "step": 3900 - }, - { - "epoch": 1.562, - "grad_norm": 1.627078890800476, - "learning_rate": 2.192e-05, - "loss": 2.1088, - "step": 3905 - }, - { - "epoch": 1.564, - "grad_norm": 1.7470308542251587, - "learning_rate": 2.182e-05, - "loss": 1.6227, - "step": 3910 - }, - { - "epoch": 1.5659999999999998, - "grad_norm": 2.38482666015625, - "learning_rate": 2.1720000000000002e-05, - "loss": 1.5989, - "step": 3915 - }, - { - "epoch": 1.568, - "grad_norm": 2.3000121116638184, - "learning_rate": 2.162e-05, - "loss": 1.7368, - "step": 3920 - }, - { - "epoch": 1.5699999999999998, - "grad_norm": 1.9650893211364746, - "learning_rate": 2.152e-05, - "loss": 1.7341, - "step": 3925 - }, - { - "epoch": 1.572, - "grad_norm": 2.2604129314422607, - "learning_rate": 2.142e-05, - "loss": 1.8987, - "step": 3930 - }, - { - "epoch": 1.5739999999999998, - "grad_norm": 2.943741798400879, - "learning_rate": 2.1320000000000003e-05, - "loss": 1.4595, - "step": 3935 - }, - { - "epoch": 1.576, - "grad_norm": 2.6729559898376465, - "learning_rate": 2.122e-05, - "loss": 1.7886, - "step": 3940 - }, - { - "epoch": 1.5779999999999998, - "grad_norm": 1.6457605361938477, - "learning_rate": 2.112e-05, - "loss": 1.5454, - "step": 3945 - }, - { - "epoch": 1.58, - "grad_norm": 1.680810570716858, - "learning_rate": 2.1020000000000002e-05, - "loss": 1.8204, - "step": 3950 - }, - { - "epoch": 1.5819999999999999, - "grad_norm": 3.292214870452881, - "learning_rate": 2.092e-05, - "loss": 1.424, - "step": 3955 - }, - { - "epoch": 1.584, - "grad_norm": 3.2848258018493652, - "learning_rate": 2.082e-05, - "loss": 1.346, - "step": 3960 - }, - { - "epoch": 1.5859999999999999, - "grad_norm": 2.960726737976074, - "learning_rate": 2.072e-05, - "loss": 1.6039, - "step": 3965 - }, - { - "epoch": 1.588, - "grad_norm": 3.5445568561553955, - "learning_rate": 2.062e-05, - "loss": 1.4933, - "step": 3970 - }, - { - "epoch": 1.5899999999999999, - "grad_norm": 3.332059383392334, - "learning_rate": 2.052e-05, - "loss": 1.7562, - "step": 3975 - }, - { - "epoch": 1.592, - "grad_norm": 4.794506072998047, - "learning_rate": 2.042e-05, - "loss": 1.5818, - "step": 3980 - }, - { - "epoch": 1.5939999999999999, - "grad_norm": 1.6709805727005005, - "learning_rate": 2.032e-05, - "loss": 1.8961, - "step": 3985 - }, - { - "epoch": 1.596, - "grad_norm": 2.365255117416382, - "learning_rate": 2.022e-05, - "loss": 1.5733, - "step": 3990 - }, - { - "epoch": 1.5979999999999999, - "grad_norm": 2.1693356037139893, - "learning_rate": 2.012e-05, - "loss": 1.574, - "step": 3995 - }, - { - "epoch": 1.6, - "grad_norm": 3.2418808937072754, - "learning_rate": 2.002e-05, - "loss": 1.5204, - "step": 4000 - }, - { - "epoch": 1.6019999999999999, - "grad_norm": 3.40478777885437, - "learning_rate": 1.992e-05, - "loss": 1.6627, - "step": 4005 - }, - { - "epoch": 1.604, - "grad_norm": 2.168471336364746, - "learning_rate": 1.982e-05, - "loss": 1.4472, - "step": 4010 - }, - { - "epoch": 1.6059999999999999, - "grad_norm": 2.151547908782959, - "learning_rate": 1.972e-05, - "loss": 1.693, - "step": 4015 - }, - { - "epoch": 1.608, - "grad_norm": 3.2873220443725586, - "learning_rate": 1.9620000000000002e-05, - "loss": 1.4235, - "step": 4020 - }, - { - "epoch": 1.6099999999999999, - "grad_norm": 2.6067519187927246, - "learning_rate": 1.9520000000000003e-05, - "loss": 1.7043, - "step": 4025 - }, - { - "epoch": 1.612, - "grad_norm": 3.547511339187622, - "learning_rate": 1.942e-05, - "loss": 2.0793, - "step": 4030 - }, - { - "epoch": 1.6139999999999999, - "grad_norm": 4.516319751739502, - "learning_rate": 1.932e-05, - "loss": 2.0383, - "step": 4035 - }, - { - "epoch": 1.616, - "grad_norm": 2.20668888092041, - "learning_rate": 1.9220000000000002e-05, - "loss": 1.5755, - "step": 4040 - }, - { - "epoch": 1.6179999999999999, - "grad_norm": 1.0403096675872803, - "learning_rate": 1.9120000000000003e-05, - "loss": 1.9345, - "step": 4045 - }, - { - "epoch": 1.62, - "grad_norm": 2.0750057697296143, - "learning_rate": 1.902e-05, - "loss": 1.7293, - "step": 4050 - }, - { - "epoch": 1.6219999999999999, - "grad_norm": 2.791288137435913, - "learning_rate": 1.8920000000000002e-05, - "loss": 1.6003, - "step": 4055 - }, - { - "epoch": 1.624, - "grad_norm": 2.4617395401000977, - "learning_rate": 1.8820000000000003e-05, - "loss": 1.5905, - "step": 4060 - }, - { - "epoch": 1.626, - "grad_norm": 2.102631092071533, - "learning_rate": 1.872e-05, - "loss": 2.1144, - "step": 4065 - }, - { - "epoch": 1.6280000000000001, - "grad_norm": 1.4703980684280396, - "learning_rate": 1.862e-05, - "loss": 1.708, - "step": 4070 - }, - { - "epoch": 1.63, - "grad_norm": 1.3034230470657349, - "learning_rate": 1.8520000000000002e-05, - "loss": 1.774, - "step": 4075 - }, - { - "epoch": 1.6320000000000001, - "grad_norm": 1.516777753829956, - "learning_rate": 1.842e-05, - "loss": 1.542, - "step": 4080 - }, - { - "epoch": 1.634, - "grad_norm": 1.5036194324493408, - "learning_rate": 1.832e-05, - "loss": 1.8689, - "step": 4085 - }, - { - "epoch": 1.6360000000000001, - "grad_norm": 4.367713928222656, - "learning_rate": 1.8220000000000002e-05, - "loss": 1.7957, - "step": 4090 - }, - { - "epoch": 1.638, - "grad_norm": 2.325582265853882, - "learning_rate": 1.812e-05, - "loss": 1.3961, - "step": 4095 - }, - { - "epoch": 1.6400000000000001, - "grad_norm": 2.2588531970977783, - "learning_rate": 1.802e-05, - "loss": 1.571, - "step": 4100 - }, - { - "epoch": 1.642, - "grad_norm": 3.0167784690856934, - "learning_rate": 1.792e-05, - "loss": 1.7083, - "step": 4105 - }, - { - "epoch": 1.6440000000000001, - "grad_norm": 2.339507818222046, - "learning_rate": 1.7820000000000002e-05, - "loss": 1.5699, - "step": 4110 - }, - { - "epoch": 1.646, - "grad_norm": 5.661227226257324, - "learning_rate": 1.772e-05, - "loss": 1.6432, - "step": 4115 - }, - { - "epoch": 1.6480000000000001, - "grad_norm": 2.84700870513916, - "learning_rate": 1.762e-05, - "loss": 1.462, - "step": 4120 - }, - { - "epoch": 1.65, - "grad_norm": 5.066640853881836, - "learning_rate": 1.752e-05, - "loss": 1.6516, - "step": 4125 - }, - { - "epoch": 1.6520000000000001, - "grad_norm": 2.449711799621582, - "learning_rate": 1.742e-05, - "loss": 2.0891, - "step": 4130 - }, - { - "epoch": 1.654, - "grad_norm": 2.4270060062408447, - "learning_rate": 1.732e-05, - "loss": 1.7712, - "step": 4135 - }, - { - "epoch": 1.6560000000000001, - "grad_norm": 2.409130334854126, - "learning_rate": 1.722e-05, - "loss": 1.5898, - "step": 4140 - }, - { - "epoch": 1.658, - "grad_norm": 4.040567398071289, - "learning_rate": 1.712e-05, - "loss": 1.3267, - "step": 4145 - }, - { - "epoch": 1.6600000000000001, - "grad_norm": 3.253746271133423, - "learning_rate": 1.702e-05, - "loss": 1.2956, - "step": 4150 - }, - { - "epoch": 1.662, - "grad_norm": 5.615586280822754, - "learning_rate": 1.692e-05, - "loss": 2.0444, - "step": 4155 - }, - { - "epoch": 1.6640000000000001, - "grad_norm": 2.2834932804107666, - "learning_rate": 1.6819999999999998e-05, - "loss": 1.7731, - "step": 4160 - }, - { - "epoch": 1.666, - "grad_norm": 2.3564233779907227, - "learning_rate": 1.672e-05, - "loss": 1.3126, - "step": 4165 - }, - { - "epoch": 1.6680000000000001, - "grad_norm": 3.5475549697875977, - "learning_rate": 1.662e-05, - "loss": 2.0114, - "step": 4170 - }, - { - "epoch": 1.67, - "grad_norm": 2.0407814979553223, - "learning_rate": 1.652e-05, - "loss": 1.6024, - "step": 4175 - }, - { - "epoch": 1.6720000000000002, - "grad_norm": 1.4806127548217773, - "learning_rate": 1.6420000000000002e-05, - "loss": 1.7259, - "step": 4180 - }, - { - "epoch": 1.674, - "grad_norm": 4.753927707672119, - "learning_rate": 1.6320000000000003e-05, - "loss": 2.2048, - "step": 4185 - }, - { - "epoch": 1.6760000000000002, - "grad_norm": 1.6219843626022339, - "learning_rate": 1.622e-05, - "loss": 1.7626, - "step": 4190 - }, - { - "epoch": 1.678, - "grad_norm": 2.0497982501983643, - "learning_rate": 1.612e-05, - "loss": 1.7135, - "step": 4195 - }, - { - "epoch": 1.6800000000000002, - "grad_norm": 3.37640643119812, - "learning_rate": 1.6020000000000002e-05, - "loss": 2.0108, - "step": 4200 - }, - { - "epoch": 1.682, - "grad_norm": 2.971378803253174, - "learning_rate": 1.592e-05, - "loss": 1.7901, - "step": 4205 - }, - { - "epoch": 1.6840000000000002, - "grad_norm": 2.1310267448425293, - "learning_rate": 1.582e-05, - "loss": 1.9743, - "step": 4210 - }, - { - "epoch": 1.686, - "grad_norm": 1.3451064825057983, - "learning_rate": 1.5720000000000002e-05, - "loss": 1.652, - "step": 4215 - }, - { - "epoch": 1.688, - "grad_norm": 2.2098116874694824, - "learning_rate": 1.5620000000000003e-05, - "loss": 2.1535, - "step": 4220 - }, - { - "epoch": 1.69, - "grad_norm": 3.1020560264587402, - "learning_rate": 1.552e-05, - "loss": 1.7025, - "step": 4225 - }, - { - "epoch": 1.692, - "grad_norm": 1.9873038530349731, - "learning_rate": 1.542e-05, - "loss": 1.9574, - "step": 4230 - }, - { - "epoch": 1.694, - "grad_norm": 3.3049862384796143, - "learning_rate": 1.5320000000000002e-05, - "loss": 1.5553, - "step": 4235 - }, - { - "epoch": 1.696, - "grad_norm": 2.3514750003814697, - "learning_rate": 1.5220000000000002e-05, - "loss": 1.6479, - "step": 4240 - }, - { - "epoch": 1.698, - "grad_norm": 1.8108309507369995, - "learning_rate": 1.5120000000000001e-05, - "loss": 2.2297, - "step": 4245 - }, - { - "epoch": 1.7, - "grad_norm": 3.397519826889038, - "learning_rate": 1.502e-05, - "loss": 1.9932, - "step": 4250 - }, - { - "epoch": 1.702, - "grad_norm": 2.1872971057891846, - "learning_rate": 1.4920000000000001e-05, - "loss": 1.8624, - "step": 4255 - }, - { - "epoch": 1.704, - "grad_norm": 2.399122953414917, - "learning_rate": 1.482e-05, - "loss": 1.4834, - "step": 4260 - }, - { - "epoch": 1.706, - "grad_norm": 2.538747787475586, - "learning_rate": 1.472e-05, - "loss": 1.6903, - "step": 4265 - }, - { - "epoch": 1.708, - "grad_norm": 2.160836935043335, - "learning_rate": 1.462e-05, - "loss": 1.7909, - "step": 4270 - }, - { - "epoch": 1.71, - "grad_norm": 1.9099202156066895, - "learning_rate": 1.452e-05, - "loss": 1.4634, - "step": 4275 - }, - { - "epoch": 1.712, - "grad_norm": 3.8388030529022217, - "learning_rate": 1.4420000000000001e-05, - "loss": 1.7921, - "step": 4280 - }, - { - "epoch": 1.714, - "grad_norm": 2.9180572032928467, - "learning_rate": 1.432e-05, - "loss": 1.8884, - "step": 4285 - }, - { - "epoch": 1.716, - "grad_norm": 2.863095760345459, - "learning_rate": 1.422e-05, - "loss": 1.972, - "step": 4290 - }, - { - "epoch": 1.718, - "grad_norm": 1.6249314546585083, - "learning_rate": 1.412e-05, - "loss": 1.7654, - "step": 4295 - }, - { - "epoch": 1.72, - "grad_norm": 3.312885284423828, - "learning_rate": 1.402e-05, - "loss": 1.4372, - "step": 4300 - }, - { - "epoch": 1.722, - "grad_norm": 2.0843143463134766, - "learning_rate": 1.3919999999999999e-05, - "loss": 1.676, - "step": 4305 - }, - { - "epoch": 1.724, - "grad_norm": 1.6616442203521729, - "learning_rate": 1.382e-05, - "loss": 1.5794, - "step": 4310 - }, - { - "epoch": 1.726, - "grad_norm": 1.461428165435791, - "learning_rate": 1.3719999999999999e-05, - "loss": 1.825, - "step": 4315 - }, - { - "epoch": 1.728, - "grad_norm": 1.8600455522537231, - "learning_rate": 1.362e-05, - "loss": 2.0891, - "step": 4320 - }, - { - "epoch": 1.73, - "grad_norm": 8.293989181518555, - "learning_rate": 1.352e-05, - "loss": 2.0749, - "step": 4325 - }, - { - "epoch": 1.732, - "grad_norm": 3.663043260574341, - "learning_rate": 1.3420000000000002e-05, - "loss": 1.7226, - "step": 4330 - }, - { - "epoch": 1.734, - "grad_norm": 2.3233726024627686, - "learning_rate": 1.3320000000000001e-05, - "loss": 1.7075, - "step": 4335 - }, - { - "epoch": 1.736, - "grad_norm": 1.6078758239746094, - "learning_rate": 1.3220000000000002e-05, - "loss": 1.8401, - "step": 4340 - }, - { - "epoch": 1.738, - "grad_norm": 2.9710512161254883, - "learning_rate": 1.3120000000000001e-05, - "loss": 1.5777, - "step": 4345 - }, - { - "epoch": 1.74, - "grad_norm": 1.867712378501892, - "learning_rate": 1.3020000000000002e-05, - "loss": 1.9425, - "step": 4350 - }, - { - "epoch": 1.742, - "grad_norm": 1.946547269821167, - "learning_rate": 1.2920000000000002e-05, - "loss": 1.3549, - "step": 4355 - }, - { - "epoch": 1.744, - "grad_norm": 1.9409579038619995, - "learning_rate": 1.2820000000000001e-05, - "loss": 1.9775, - "step": 4360 - }, - { - "epoch": 1.746, - "grad_norm": 2.9414451122283936, - "learning_rate": 1.2720000000000002e-05, - "loss": 1.3631, - "step": 4365 - }, - { - "epoch": 1.748, - "grad_norm": 2.5585744380950928, - "learning_rate": 1.2620000000000001e-05, - "loss": 1.7236, - "step": 4370 - }, - { - "epoch": 1.75, - "grad_norm": 3.3688697814941406, - "learning_rate": 1.252e-05, - "loss": 1.2604, - "step": 4375 - }, - { - "epoch": 1.752, - "grad_norm": 1.681897521018982, - "learning_rate": 1.2420000000000001e-05, - "loss": 1.3617, - "step": 4380 - }, - { - "epoch": 1.754, - "grad_norm": 2.4061717987060547, - "learning_rate": 1.232e-05, - "loss": 1.9308, - "step": 4385 - }, - { - "epoch": 1.756, - "grad_norm": 1.933555006980896, - "learning_rate": 1.2220000000000002e-05, - "loss": 1.3858, - "step": 4390 - }, - { - "epoch": 1.758, - "grad_norm": 2.967866897583008, - "learning_rate": 1.2120000000000001e-05, - "loss": 1.7174, - "step": 4395 - }, - { - "epoch": 1.76, - "grad_norm": 1.6695289611816406, - "learning_rate": 1.202e-05, - "loss": 1.6362, - "step": 4400 - }, - { - "epoch": 1.762, - "grad_norm": 1.535391926765442, - "learning_rate": 1.1920000000000001e-05, - "loss": 1.7996, - "step": 4405 - }, - { - "epoch": 1.764, - "grad_norm": 1.290605068206787, - "learning_rate": 1.182e-05, - "loss": 1.7682, - "step": 4410 - }, - { - "epoch": 1.766, - "grad_norm": 3.6815857887268066, - "learning_rate": 1.172e-05, - "loss": 1.6556, - "step": 4415 - }, - { - "epoch": 1.768, - "grad_norm": 2.138132333755493, - "learning_rate": 1.162e-05, - "loss": 1.7112, - "step": 4420 - }, - { - "epoch": 1.77, - "grad_norm": 2.808025360107422, - "learning_rate": 1.152e-05, - "loss": 1.411, - "step": 4425 - }, - { - "epoch": 1.772, - "grad_norm": 2.5961849689483643, - "learning_rate": 1.142e-05, - "loss": 1.9458, - "step": 4430 - }, - { - "epoch": 1.774, - "grad_norm": 1.5521067380905151, - "learning_rate": 1.132e-05, - "loss": 1.2973, - "step": 4435 - }, - { - "epoch": 1.776, - "grad_norm": 3.609253406524658, - "learning_rate": 1.122e-05, - "loss": 1.9642, - "step": 4440 - }, - { - "epoch": 1.778, - "grad_norm": 2.3775932788848877, - "learning_rate": 1.112e-05, - "loss": 1.5712, - "step": 4445 - }, - { - "epoch": 1.78, - "grad_norm": 2.9409773349761963, - "learning_rate": 1.1020000000000001e-05, - "loss": 1.7596, - "step": 4450 - }, - { - "epoch": 1.782, - "grad_norm": 2.5720837116241455, - "learning_rate": 1.092e-05, - "loss": 1.6529, - "step": 4455 - }, - { - "epoch": 1.784, - "grad_norm": 2.656883478164673, - "learning_rate": 1.0820000000000001e-05, - "loss": 1.7661, - "step": 4460 - }, - { - "epoch": 1.786, - "grad_norm": 2.429783582687378, - "learning_rate": 1.072e-05, - "loss": 1.6219, - "step": 4465 - }, - { - "epoch": 1.788, - "grad_norm": 3.2629878520965576, - "learning_rate": 1.062e-05, - "loss": 1.3955, - "step": 4470 - }, - { - "epoch": 1.79, - "grad_norm": 1.272605061531067, - "learning_rate": 1.0520000000000001e-05, - "loss": 1.9574, - "step": 4475 - }, - { - "epoch": 1.792, - "grad_norm": 2.0347158908843994, - "learning_rate": 1.042e-05, - "loss": 1.445, - "step": 4480 - }, - { - "epoch": 1.794, - "grad_norm": 4.4198737144470215, - "learning_rate": 1.0320000000000001e-05, - "loss": 1.4557, - "step": 4485 - }, - { - "epoch": 1.796, - "grad_norm": 2.2701990604400635, - "learning_rate": 1.022e-05, - "loss": 1.6719, - "step": 4490 - }, - { - "epoch": 1.798, - "grad_norm": 3.7548696994781494, - "learning_rate": 1.012e-05, - "loss": 1.5355, - "step": 4495 - }, - { - "epoch": 1.8, - "grad_norm": 2.8439321517944336, - "learning_rate": 1.002e-05, - "loss": 1.9391, - "step": 4500 - }, - { - "epoch": 1.802, - "grad_norm": 1.495534896850586, - "learning_rate": 9.92e-06, - "loss": 1.8336, - "step": 4505 - }, - { - "epoch": 1.804, - "grad_norm": 2.9073193073272705, - "learning_rate": 9.820000000000001e-06, - "loss": 1.7377, - "step": 4510 - }, - { - "epoch": 1.806, - "grad_norm": 2.329359292984009, - "learning_rate": 9.72e-06, - "loss": 1.6262, - "step": 4515 - }, - { - "epoch": 1.808, - "grad_norm": 1.923293113708496, - "learning_rate": 9.62e-06, - "loss": 1.8473, - "step": 4520 - }, - { - "epoch": 1.81, - "grad_norm": 2.3512725830078125, - "learning_rate": 9.52e-06, - "loss": 1.892, - "step": 4525 - }, - { - "epoch": 1.812, - "grad_norm": 1.9034477472305298, - "learning_rate": 9.420000000000001e-06, - "loss": 1.8625, - "step": 4530 - }, - { - "epoch": 1.814, - "grad_norm": 3.5805766582489014, - "learning_rate": 9.32e-06, - "loss": 1.3666, - "step": 4535 - }, - { - "epoch": 1.8159999999999998, - "grad_norm": 2.953458070755005, - "learning_rate": 9.220000000000002e-06, - "loss": 1.4938, - "step": 4540 - }, - { - "epoch": 1.818, - "grad_norm": 1.7378835678100586, - "learning_rate": 9.12e-06, - "loss": 1.8389, - "step": 4545 - }, - { - "epoch": 1.8199999999999998, - "grad_norm": 1.7877956628799438, - "learning_rate": 9.02e-06, - "loss": 1.1831, - "step": 4550 - }, - { - "epoch": 1.822, - "grad_norm": 3.1905672550201416, - "learning_rate": 8.920000000000001e-06, - "loss": 1.7312, - "step": 4555 - }, - { - "epoch": 1.8239999999999998, - "grad_norm": 2.953925371170044, - "learning_rate": 8.82e-06, - "loss": 1.6129, - "step": 4560 - }, - { - "epoch": 1.826, - "grad_norm": 3.367154121398926, - "learning_rate": 8.720000000000001e-06, - "loss": 1.7215, - "step": 4565 - }, - { - "epoch": 1.8279999999999998, - "grad_norm": 1.696204423904419, - "learning_rate": 8.62e-06, - "loss": 1.5069, - "step": 4570 - }, - { - "epoch": 1.83, - "grad_norm": 2.1172213554382324, - "learning_rate": 8.52e-06, - "loss": 1.4005, - "step": 4575 - }, - { - "epoch": 1.8319999999999999, - "grad_norm": 3.5448379516601562, - "learning_rate": 8.42e-06, - "loss": 1.4161, - "step": 4580 - }, - { - "epoch": 1.834, - "grad_norm": 2.4616730213165283, - "learning_rate": 8.32e-06, - "loss": 1.8131, - "step": 4585 - }, - { - "epoch": 1.8359999999999999, - "grad_norm": 2.7810232639312744, - "learning_rate": 8.22e-06, - "loss": 1.3348, - "step": 4590 - }, - { - "epoch": 1.838, - "grad_norm": 2.5865628719329834, - "learning_rate": 8.12e-06, - "loss": 1.6154, - "step": 4595 - }, - { - "epoch": 1.8399999999999999, - "grad_norm": 2.204624891281128, - "learning_rate": 8.02e-06, - "loss": 2.1785, - "step": 4600 - }, - { - "epoch": 1.842, - "grad_norm": 3.4042181968688965, - "learning_rate": 7.92e-06, - "loss": 1.4708, - "step": 4605 - }, - { - "epoch": 1.8439999999999999, - "grad_norm": 1.8127588033676147, - "learning_rate": 7.820000000000001e-06, - "loss": 1.8008, - "step": 4610 - }, - { - "epoch": 1.846, - "grad_norm": 1.541019082069397, - "learning_rate": 7.72e-06, - "loss": 1.6201, - "step": 4615 - }, - { - "epoch": 1.8479999999999999, - "grad_norm": 2.185657262802124, - "learning_rate": 7.620000000000001e-06, - "loss": 1.702, - "step": 4620 - }, - { - "epoch": 1.85, - "grad_norm": 1.4571075439453125, - "learning_rate": 7.520000000000001e-06, - "loss": 1.5476, - "step": 4625 - }, - { - "epoch": 1.8519999999999999, - "grad_norm": 1.6196515560150146, - "learning_rate": 7.420000000000001e-06, - "loss": 1.4612, - "step": 4630 - }, - { - "epoch": 1.854, - "grad_norm": 1.3069943189620972, - "learning_rate": 7.32e-06, - "loss": 1.9224, - "step": 4635 - }, - { - "epoch": 1.8559999999999999, - "grad_norm": 2.624645948410034, - "learning_rate": 7.22e-06, - "loss": 1.953, - "step": 4640 - }, - { - "epoch": 1.858, - "grad_norm": 2.6739323139190674, - "learning_rate": 7.1200000000000004e-06, - "loss": 2.1286, - "step": 4645 - }, - { - "epoch": 1.8599999999999999, - "grad_norm": 2.8309803009033203, - "learning_rate": 7.0200000000000006e-06, - "loss": 1.7787, - "step": 4650 - }, - { - "epoch": 1.862, - "grad_norm": 2.3118069171905518, - "learning_rate": 6.92e-06, - "loss": 1.4192, - "step": 4655 - }, - { - "epoch": 1.8639999999999999, - "grad_norm": 2.7963597774505615, - "learning_rate": 6.82e-06, - "loss": 1.4506, - "step": 4660 - }, - { - "epoch": 1.866, - "grad_norm": 2.4627857208251953, - "learning_rate": 6.72e-06, - "loss": 1.8756, - "step": 4665 - }, - { - "epoch": 1.8679999999999999, - "grad_norm": 2.761760950088501, - "learning_rate": 6.62e-06, - "loss": 1.6445, - "step": 4670 - }, - { - "epoch": 1.87, - "grad_norm": 2.5045080184936523, - "learning_rate": 6.519999999999999e-06, - "loss": 1.9228, - "step": 4675 - }, - { - "epoch": 1.8719999999999999, - "grad_norm": 1.7712461948394775, - "learning_rate": 6.4199999999999995e-06, - "loss": 1.8878, - "step": 4680 - }, - { - "epoch": 1.874, - "grad_norm": 1.506697177886963, - "learning_rate": 6.320000000000001e-06, - "loss": 1.8037, - "step": 4685 - }, - { - "epoch": 1.876, - "grad_norm": 1.942428708076477, - "learning_rate": 6.22e-06, - "loss": 2.2267, - "step": 4690 - }, - { - "epoch": 1.8780000000000001, - "grad_norm": 2.2255895137786865, - "learning_rate": 6.12e-06, - "loss": 1.5492, - "step": 4695 - }, - { - "epoch": 1.88, - "grad_norm": 2.8011889457702637, - "learning_rate": 6.02e-06, - "loss": 1.6278, - "step": 4700 - }, - { - "epoch": 1.8820000000000001, - "grad_norm": 1.4503179788589478, - "learning_rate": 5.920000000000001e-06, - "loss": 1.7414, - "step": 4705 - }, - { - "epoch": 1.884, - "grad_norm": 3.064669132232666, - "learning_rate": 5.82e-06, - "loss": 1.7617, - "step": 4710 - }, - { - "epoch": 1.8860000000000001, - "grad_norm": 2.1588497161865234, - "learning_rate": 5.72e-06, - "loss": 1.7811, - "step": 4715 - }, - { - "epoch": 1.888, - "grad_norm": 2.479996919631958, - "learning_rate": 5.62e-06, - "loss": 1.6982, - "step": 4720 - }, - { - "epoch": 1.8900000000000001, - "grad_norm": 1.8513116836547852, - "learning_rate": 5.5200000000000005e-06, - "loss": 2.3348, - "step": 4725 - }, - { - "epoch": 1.892, - "grad_norm": 1.7887202501296997, - "learning_rate": 5.42e-06, - "loss": 1.8174, - "step": 4730 - }, - { - "epoch": 1.8940000000000001, - "grad_norm": 2.4711050987243652, - "learning_rate": 5.32e-06, - "loss": 1.7915, - "step": 4735 - }, - { - "epoch": 1.896, - "grad_norm": 3.108246326446533, - "learning_rate": 5.220000000000001e-06, - "loss": 1.7375, - "step": 4740 - }, - { - "epoch": 1.8980000000000001, - "grad_norm": 2.101332664489746, - "learning_rate": 5.12e-06, - "loss": 2.0854, - "step": 4745 - }, - { - "epoch": 1.9, - "grad_norm": 2.0829579830169678, - "learning_rate": 5.02e-06, - "loss": 1.3063, - "step": 4750 - }, - { - "epoch": 1.9020000000000001, - "grad_norm": 2.5459961891174316, - "learning_rate": 4.92e-06, - "loss": 2.1524, - "step": 4755 - }, - { - "epoch": 1.904, - "grad_norm": 2.613213539123535, - "learning_rate": 4.8200000000000004e-06, - "loss": 1.5327, - "step": 4760 - }, - { - "epoch": 1.9060000000000001, - "grad_norm": 2.7435600757598877, - "learning_rate": 4.72e-06, - "loss": 1.5273, - "step": 4765 - }, - { - "epoch": 1.908, - "grad_norm": 2.016525983810425, - "learning_rate": 4.62e-06, - "loss": 1.9101, - "step": 4770 - }, - { - "epoch": 1.9100000000000001, - "grad_norm": 2.934013605117798, - "learning_rate": 4.52e-06, - "loss": 1.9333, - "step": 4775 - }, - { - "epoch": 1.912, - "grad_norm": 3.2015531063079834, - "learning_rate": 4.420000000000001e-06, - "loss": 2.0798, - "step": 4780 - }, - { - "epoch": 1.9140000000000001, - "grad_norm": 2.9536473751068115, - "learning_rate": 4.32e-06, - "loss": 1.4299, - "step": 4785 - }, - { - "epoch": 1.916, - "grad_norm": 1.9119105339050293, - "learning_rate": 4.22e-06, - "loss": 1.8954, - "step": 4790 - }, - { - "epoch": 1.9180000000000001, - "grad_norm": 3.10184645652771, - "learning_rate": 4.12e-06, - "loss": 1.5219, - "step": 4795 - }, - { - "epoch": 1.92, - "grad_norm": 1.8539482355117798, - "learning_rate": 4.0200000000000005e-06, - "loss": 2.0397, - "step": 4800 - }, - { - "epoch": 1.9220000000000002, - "grad_norm": 2.319058895111084, - "learning_rate": 3.92e-06, - "loss": 1.6486, - "step": 4805 - }, - { - "epoch": 1.924, - "grad_norm": 1.6333422660827637, - "learning_rate": 3.82e-06, - "loss": 1.6049, - "step": 4810 - }, - { - "epoch": 1.9260000000000002, - "grad_norm": 2.516272783279419, - "learning_rate": 3.72e-06, - "loss": 1.2497, - "step": 4815 - }, - { - "epoch": 1.928, - "grad_norm": 2.6449923515319824, - "learning_rate": 3.6200000000000005e-06, - "loss": 1.7566, - "step": 4820 - }, - { - "epoch": 1.9300000000000002, - "grad_norm": 3.312422037124634, - "learning_rate": 3.52e-06, - "loss": 1.7462, - "step": 4825 - }, - { - "epoch": 1.932, - "grad_norm": 2.3123393058776855, - "learning_rate": 3.4200000000000003e-06, - "loss": 1.5405, - "step": 4830 - }, - { - "epoch": 1.9340000000000002, - "grad_norm": 1.9016692638397217, - "learning_rate": 3.3200000000000004e-06, - "loss": 1.8866, - "step": 4835 - }, - { - "epoch": 1.936, - "grad_norm": 1.7640721797943115, - "learning_rate": 3.22e-06, - "loss": 2.0203, - "step": 4840 - }, - { - "epoch": 1.938, - "grad_norm": 2.3422842025756836, - "learning_rate": 3.12e-06, - "loss": 1.702, - "step": 4845 - }, - { - "epoch": 1.94, - "grad_norm": 2.2424137592315674, - "learning_rate": 3.0200000000000003e-06, - "loss": 1.9926, - "step": 4850 - }, - { - "epoch": 1.942, - "grad_norm": 2.6288299560546875, - "learning_rate": 2.92e-06, - "loss": 1.6564, - "step": 4855 - }, - { - "epoch": 1.944, - "grad_norm": 3.5492444038391113, - "learning_rate": 2.82e-06, - "loss": 1.5973, - "step": 4860 - }, - { - "epoch": 1.946, - "grad_norm": 6.756591320037842, - "learning_rate": 2.72e-06, - "loss": 2.0104, - "step": 4865 - }, - { - "epoch": 1.948, - "grad_norm": 3.185244560241699, - "learning_rate": 2.6200000000000003e-06, - "loss": 2.0237, - "step": 4870 - }, - { - "epoch": 1.95, - "grad_norm": 1.2352628707885742, - "learning_rate": 2.52e-06, - "loss": 2.0167, - "step": 4875 - }, - { - "epoch": 1.952, - "grad_norm": 2.9884321689605713, - "learning_rate": 2.42e-06, - "loss": 1.471, - "step": 4880 - }, - { - "epoch": 1.954, - "grad_norm": 1.4655405282974243, - "learning_rate": 2.32e-06, - "loss": 1.6469, - "step": 4885 - }, - { - "epoch": 1.956, - "grad_norm": 3.2296645641326904, - "learning_rate": 2.2200000000000003e-06, - "loss": 1.6163, - "step": 4890 - }, - { - "epoch": 1.958, - "grad_norm": 1.4382346868515015, - "learning_rate": 2.12e-06, - "loss": 1.4849, - "step": 4895 - }, - { - "epoch": 1.96, - "grad_norm": 1.3463542461395264, - "learning_rate": 2.02e-06, - "loss": 2.0737, - "step": 4900 - }, - { - "epoch": 1.962, - "grad_norm": 2.8371078968048096, - "learning_rate": 1.92e-06, - "loss": 1.3538, - "step": 4905 - }, - { - "epoch": 1.964, - "grad_norm": 4.151133060455322, - "learning_rate": 1.8200000000000002e-06, - "loss": 1.5621, - "step": 4910 - }, - { - "epoch": 1.966, - "grad_norm": 2.528104066848755, - "learning_rate": 1.72e-06, - "loss": 1.7611, - "step": 4915 - }, - { - "epoch": 1.968, - "grad_norm": 1.5124061107635498, - "learning_rate": 1.62e-06, - "loss": 1.8466, - "step": 4920 - }, - { - "epoch": 1.97, - "grad_norm": 1.403752326965332, - "learning_rate": 1.52e-06, - "loss": 1.8551, - "step": 4925 - }, - { - "epoch": 1.972, - "grad_norm": 3.7391104698181152, - "learning_rate": 1.4200000000000002e-06, - "loss": 1.3728, - "step": 4930 - }, - { - "epoch": 1.974, - "grad_norm": 1.4216769933700562, - "learning_rate": 1.32e-06, - "loss": 1.781, - "step": 4935 - }, - { - "epoch": 1.976, - "grad_norm": 1.8286603689193726, - "learning_rate": 1.2200000000000002e-06, - "loss": 1.8448, - "step": 4940 - }, - { - "epoch": 1.978, - "grad_norm": 2.8573355674743652, - "learning_rate": 1.12e-06, - "loss": 1.4593, - "step": 4945 - }, - { - "epoch": 1.98, - "grad_norm": 1.883048415184021, - "learning_rate": 1.0200000000000002e-06, - "loss": 1.8214, - "step": 4950 - }, - { - "epoch": 1.982, - "grad_norm": 2.809840202331543, - "learning_rate": 9.2e-07, - "loss": 1.8759, - "step": 4955 - }, - { - "epoch": 1.984, - "grad_norm": 2.3656716346740723, - "learning_rate": 8.200000000000001e-07, - "loss": 1.5094, - "step": 4960 - }, - { - "epoch": 1.986, - "grad_norm": 2.967402219772339, - "learning_rate": 7.2e-07, - "loss": 1.9461, - "step": 4965 - }, - { - "epoch": 1.988, - "grad_norm": 1.8330488204956055, - "learning_rate": 6.2e-07, - "loss": 1.8007, - "step": 4970 - }, - { - "epoch": 1.99, - "grad_norm": 4.4840216636657715, - "learning_rate": 5.2e-07, - "loss": 1.8761, - "step": 4975 - }, - { - "epoch": 1.992, - "grad_norm": 1.8227864503860474, - "learning_rate": 4.2e-07, - "loss": 1.661, - "step": 4980 - }, - { - "epoch": 1.994, - "grad_norm": 1.694053053855896, - "learning_rate": 3.2e-07, - "loss": 2.0821, - "step": 4985 - }, - { - "epoch": 1.996, - "grad_norm": 4.086534023284912, - "learning_rate": 2.2e-07, - "loss": 1.5419, - "step": 4990 - }, - { - "epoch": 1.998, - "grad_norm": 1.7249412536621094, - "learning_rate": 1.2e-07, - "loss": 1.4585, - "step": 4995 - }, - { - "epoch": 2.0, - "grad_norm": 2.045156955718994, - "learning_rate": 2e-08, - "loss": 1.6716, - "step": 5000 - } - ], - "logging_steps": 5, - "max_steps": 5000, - "num_input_tokens_seen": 0, - "num_train_epochs": 2, - "save_steps": 500, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 1310727810318336.0, - "train_batch_size": 2, - "trial_name": null, - "trial_params": null -}