diff --git "a/all_experiments_details.json" "b/all_experiments_details.json" --- "a/all_experiments_details.json" +++ "b/all_experiments_details.json" @@ -3,16 +3,17 @@ "config": { "BASE_MODEL_ID": "Qwen/Qwen3-4B-Instruct-2507", "DATASET_ID": [ - "daichira/structured-hard-sft-4k", "u-10bei/structured_data_with_cot_dataset_512", - "u-10bei/structured_data_with_cot_dataset_512_v2" + "u-10bei/structured_data_with_cot_dataset_512_v2u-10bei/structured_data_with_cot_dataset_512_v4", + "u-10bei/structured_data_with_cot_dataset_512_v5", + "u-10bei/structured_data_with_cot_dataset_v2" ], "BASE_OUT_DIR": "./lora_experiments", "SEED": 3407, "VAL_RATIO": 0.05, "MAX_SEQ_LEN": 512, - "LORA_R": 128, - "LORA_ALPHA": 24, + "LORA_R": 64, + "LORA_ALPHA": 128, "LORA_DROPOUT": 0.0, "LORA_TARGET_MODULES": [ "q_proj", @@ -23,11 +24,11 @@ "up_proj", "down_proj" ], - "EPOCHS": 1, + "EPOCHS": 5, "PER_DEVICE_TRAIN_BS": 2, "PER_DEVICE_EVAL_BS": 2, "GRAD_ACCUM": 8, - "LR": 0.0002, + "LR": 1e-06, "WARMUP_RATIO": 0.1, "WEIGHT_DECAY": 0.05, "MAX_STEPS": -1, @@ -47,492 +48,2576 @@ "OUTPUT_LEARN_MODE": "after_marker", "UPSAMPLE_ENABLE": false, "UPSAMPLE_RULES_JSON": "{\"xml_to_yaml\": 2.0}", - "Experiment_Name": "Method_RSLoRA_R256", - "USE_RSLORA": true + "Experiment_Name": "R_64_ALPHA_128" }, "history": [ { - "loss": 1.4503, - "grad_norm": 0.6611063480377197, - "learning_rate": 3.2142857142857144e-05, - "epoch": 0.018144704014515765, + "loss": 1.8907, + "grad_norm": 3.5059776306152344, + "learning_rate": 3.010033444816053e-08, + "epoch": 0.016760946993505135, "step": 10 }, { - "loss": 1.0865, - "grad_norm": 0.9616151452064514, - "learning_rate": 6.785714285714286e-05, - "epoch": 0.03628940802903153, + "loss": 2.1073, + "grad_norm": 5.140995502471924, + "learning_rate": 6.354515050167224e-08, + "epoch": 0.03352189398701027, "step": 20 }, { - "loss": 0.9333, - "grad_norm": 0.47518712282180786, - "learning_rate": 0.00010357142857142859, - "epoch": 0.05443411204354729, + "loss": 2.1196, + "grad_norm": 4.122618198394775, + "learning_rate": 9.698996655518395e-08, + "epoch": 0.0502828409805154, "step": 30 }, { - "loss": 0.8193, - "grad_norm": 0.46924060583114624, - "learning_rate": 0.0001392857142857143, - "epoch": 0.07257881605806306, + "loss": 1.9858, + "grad_norm": 4.268527984619141, + "learning_rate": 1.3043478260869563e-07, + "epoch": 0.06704378797402054, "step": 40 }, { - "loss": 0.7101, - "grad_norm": 0.3533737361431122, - "learning_rate": 0.000175, - "epoch": 0.09072352007257882, + "loss": 2.0532, + "grad_norm": 3.432629108428955, + "learning_rate": 1.6387959866220735e-07, + "epoch": 0.08380473496752566, "step": 50 }, { - "eval_loss": 0.6981692910194397, - "eval_runtime": 76.8039, - "eval_samples_per_second": 5.95, - "eval_steps_per_second": 2.982, - "epoch": 0.09072352007257882, + "eval_loss": 2.1637799739837646, + "eval_runtime": 79.031, + "eval_samples_per_second": 6.301, + "eval_steps_per_second": 3.151, + "epoch": 0.08380473496752566, "step": 50 }, { - "loss": 0.7362, - "grad_norm": 0.4759365916252136, - "learning_rate": 0.0001999819475629623, - "epoch": 0.10886822408709458, + "loss": 1.9455, + "grad_norm": 3.9416215419769287, + "learning_rate": 1.9732441471571906e-07, + "epoch": 0.1005656819610308, "step": 60 }, { - "loss": 0.8242, - "grad_norm": 0.3445497155189514, - "learning_rate": 0.00019966119663520412, - "epoch": 0.12701292810161036, + "loss": 2.0286, + "grad_norm": 3.982705593109131, + "learning_rate": 2.3076923076923078e-07, + "epoch": 0.11732662895453592, "step": 70 }, { - "loss": 0.816, - "grad_norm": 0.4535239338874817, - "learning_rate": 0.000198940761218769, - "epoch": 0.14515763211612612, + "loss": 2.02, + "grad_norm": 5.758102893829346, + "learning_rate": 2.6421404682274245e-07, + "epoch": 0.13408757594804108, "step": 80 }, { - "loss": 0.6256, - "grad_norm": 0.2236223965883255, - "learning_rate": 0.000197823530571169, - "epoch": 0.16330233613064188, + "loss": 1.964, + "grad_norm": 5.359775066375732, + "learning_rate": 2.976588628762542e-07, + "epoch": 0.1508485229415462, "step": 90 }, { - "loss": 0.6991, - "grad_norm": 0.41939008235931396, - "learning_rate": 0.00019631398527035422, - "epoch": 0.18144704014515764, + "loss": 2.0269, + "grad_norm": 4.892724990844727, + "learning_rate": 3.311036789297659e-07, + "epoch": 0.16760946993505133, "step": 100 }, { - "eval_loss": 0.6606893539428711, - "eval_runtime": 76.0198, - "eval_samples_per_second": 6.012, - "eval_steps_per_second": 3.012, - "epoch": 0.18144704014515764, + "eval_loss": 2.112635612487793, + "eval_runtime": 78.9151, + "eval_samples_per_second": 6.311, + "eval_steps_per_second": 3.155, + "epoch": 0.16760946993505133, "step": 100 }, { - "loss": 0.6879, - "grad_norm": 0.21272054314613342, - "learning_rate": 0.00019441817924565786, - "epoch": 0.1995917441596734, + "loss": 2.1211, + "grad_norm": 4.311831474304199, + "learning_rate": 3.6454849498327757e-07, + "epoch": 0.18437041692855646, "step": 110 }, { - "loss": 0.7107, - "grad_norm": 0.361630916595459, - "learning_rate": 0.0001921437154989221, - "epoch": 0.21773644817418916, + "loss": 1.7914, + "grad_norm": 3.819464683532715, + "learning_rate": 3.979933110367893e-07, + "epoch": 0.2011313639220616, "step": 120 }, { - "loss": 0.6812, - "grad_norm": 0.2023937702178955, - "learning_rate": 0.0001894997156131734, - "epoch": 0.23588115218870492, + "loss": 1.958, + "grad_norm": 3.3366334438323975, + "learning_rate": 4.3143812709030095e-07, + "epoch": 0.21789231091556674, "step": 130 }, { - "loss": 0.7045, - "grad_norm": 0.30442872643470764, - "learning_rate": 0.00018649678317113084, - "epoch": 0.2540258562032207, + "loss": 1.9575, + "grad_norm": 3.8945889472961426, + "learning_rate": 4.648829431438127e-07, + "epoch": 0.23465325790907185, "step": 140 }, { - "loss": 0.8145, - "grad_norm": 0.34262141585350037, - "learning_rate": 0.00018314696123025454, - "epoch": 0.27217056021773645, + "loss": 2.0176, + "grad_norm": 3.6403913497924805, + "learning_rate": 4.983277591973244e-07, + "epoch": 0.251414204902577, "step": 150 }, { - "eval_loss": 0.6401548385620117, - "eval_runtime": 75.3842, - "eval_samples_per_second": 6.062, - "eval_steps_per_second": 3.038, - "epoch": 0.27217056021773645, + "eval_loss": 1.9623165130615234, + "eval_runtime": 78.7041, + "eval_samples_per_second": 6.327, + "eval_steps_per_second": 3.164, + "epoch": 0.251414204902577, "step": 150 }, { - "loss": 0.6708, - "grad_norm": 0.307359904050827, - "learning_rate": 0.00017946368402487845, - "epoch": 0.29031526423225223, + "loss": 1.7846, + "grad_norm": 2.4720418453216553, + "learning_rate": 5.317725752508361e-07, + "epoch": 0.26817515189608215, "step": 160 }, { - "loss": 0.7885, - "grad_norm": 0.29199010133743286, - "learning_rate": 0.00017546172308912213, - "epoch": 0.30845996824676797, + "loss": 1.7023, + "grad_norm": 2.5866756439208984, + "learning_rate": 5.652173913043477e-07, + "epoch": 0.28493609888958726, "step": 170 }, { - "loss": 0.7305, - "grad_norm": 0.20208679139614105, - "learning_rate": 0.000171157128016652, - "epoch": 0.32660467226128376, + "loss": 1.6763, + "grad_norm": 2.9348649978637695, + "learning_rate": 5.986622073578596e-07, + "epoch": 0.3016970458830924, "step": 180 }, { - "loss": 0.6736, - "grad_norm": 0.2638019025325775, - "learning_rate": 0.00016656716209487174, - "epoch": 0.3447493762757995, + "loss": 1.5781, + "grad_norm": 2.411113739013672, + "learning_rate": 6.321070234113712e-07, + "epoch": 0.31845799287659754, "step": 190 }, { - "loss": 0.5909, - "grad_norm": 0.1571992188692093, - "learning_rate": 0.00016171023307167545, - "epoch": 0.3628940802903153, + "loss": 1.6026, + "grad_norm": 2.2124366760253906, + "learning_rate": 6.655518394648829e-07, + "epoch": 0.33521893987010265, "step": 200 }, { - "eval_loss": 0.6256077289581299, - "eval_runtime": 75.1757, - "eval_samples_per_second": 6.079, - "eval_steps_per_second": 3.046, - "epoch": 0.3628940802903153, + "eval_loss": 1.7386550903320312, + "eval_runtime": 78.883, + "eval_samples_per_second": 6.313, + "eval_steps_per_second": 3.157, + "epoch": 0.33521893987010265, "step": 200 }, { - "loss": 0.7633, - "grad_norm": 0.1657303422689438, - "learning_rate": 0.00015660581933241993, - "epoch": 0.381038784304831, + "loss": 1.4584, + "grad_norm": 1.65778386592865, + "learning_rate": 6.989966555183946e-07, + "epoch": 0.3519798868636078, "step": 210 }, { - "loss": 0.6618, - "grad_norm": 0.22835072875022888, - "learning_rate": 0.00015127439178317745, - "epoch": 0.3991834883193468, + "loss": 1.4641, + "grad_norm": 1.6079872846603394, + "learning_rate": 7.324414715719063e-07, + "epoch": 0.3687408338571129, "step": 220 }, { - "loss": 0.6575, - "grad_norm": 0.2166450172662735, - "learning_rate": 0.0001457373317535515, - "epoch": 0.41732819233386254, + "loss": 1.4958, + "grad_norm": 1.3282182216644287, + "learning_rate": 7.658862876254181e-07, + "epoch": 0.38550178085061804, "step": 230 }, { - "loss": 0.6041, - "grad_norm": 0.2187417596578598, - "learning_rate": 0.00014001684524830057, - "epoch": 0.4354728963483783, + "loss": 1.5744, + "grad_norm": 0.7879586815834045, + "learning_rate": 7.993311036789297e-07, + "epoch": 0.4022627278441232, "step": 240 }, { - "loss": 0.7904, - "grad_norm": 0.17704260349273682, - "learning_rate": 0.00013413587389165784, - "epoch": 0.45361760036289406, + "loss": 1.6045, + "grad_norm": 0.9567638635635376, + "learning_rate": 8.327759197324414e-07, + "epoch": 0.4190236748376283, "step": 250 }, { - "eval_loss": 0.6180712580680847, - "eval_runtime": 75.7849, - "eval_samples_per_second": 6.03, - "eval_steps_per_second": 3.022, - "epoch": 0.45361760036289406, + "eval_loss": 1.5930449962615967, + "eval_runtime": 80.1329, + "eval_samples_per_second": 6.215, + "eval_steps_per_second": 3.107, + "epoch": 0.4190236748376283, "step": 250 }, { - "loss": 0.7716, - "grad_norm": 0.22328545153141022, - "learning_rate": 0.0001281180029214988, - "epoch": 0.47176230437740985, + "loss": 1.5604, + "grad_norm": 0.7901080250740051, + "learning_rate": 8.662207357859531e-07, + "epoch": 0.4357846218311335, "step": 260 }, { - "loss": 0.647, - "grad_norm": 0.1940474510192871, - "learning_rate": 0.00012198736660234009, - "epoch": 0.4899070083919256, + "loss": 1.3883, + "grad_norm": 0.6865659952163696, + "learning_rate": 8.996655518394648e-07, + "epoch": 0.4525455688246386, "step": 270 }, { - "loss": 0.6896, - "grad_norm": 0.18775729835033417, - "learning_rate": 0.00011576855143650371, - "epoch": 0.5080517124064414, + "loss": 1.4236, + "grad_norm": 0.8144264221191406, + "learning_rate": 9.331103678929766e-07, + "epoch": 0.4693065158181437, "step": 280 }, { - "loss": 0.6765, - "grad_norm": 0.25463321805000305, - "learning_rate": 0.00010948649756161246, - "epoch": 0.5261964164209572, + "loss": 1.3641, + "grad_norm": 0.6162766218185425, + "learning_rate": 9.665551839464883e-07, + "epoch": 0.48606746281164886, "step": 290 }, { - "loss": 0.6583, - "grad_norm": 0.16848962008953094, - "learning_rate": 0.00010316639872985472, - "epoch": 0.5443411204354729, + "loss": 1.3853, + "grad_norm": 0.7314286231994629, + "learning_rate": 1e-06, + "epoch": 0.502828409805154, "step": 300 }, { - "eval_loss": 0.6103786826133728, - "eval_runtime": 73.9265, - "eval_samples_per_second": 6.182, - "eval_steps_per_second": 3.098, - "epoch": 0.5443411204354729, + "eval_loss": 1.5064737796783447, + "eval_runtime": 78.6727, + "eval_samples_per_second": 6.33, + "eval_steps_per_second": 3.165, + "epoch": 0.502828409805154, "step": 300 }, { - "loss": 0.6652, - "grad_norm": 0.27448564767837524, - "learning_rate": 9.683360127014529e-05, - "epoch": 0.5624858244499886, + "loss": 1.3806, + "grad_norm": 0.5293618440628052, + "learning_rate": 9.999658002614155e-07, + "epoch": 0.5195893567986591, "step": 310 }, { - "loss": 0.701, - "grad_norm": 0.2189791053533554, - "learning_rate": 9.051350243838756e-05, - "epoch": 0.5806305284645045, + "loss": 1.6231, + "grad_norm": 0.5351464748382568, + "learning_rate": 9.998632057241507e-07, + "epoch": 0.5363503037921643, "step": 320 }, { - "loss": 0.6166, - "grad_norm": 0.17203940451145172, - "learning_rate": 8.423144856349631e-05, - "epoch": 0.5987752324790202, + "loss": 1.3607, + "grad_norm": 0.5800453424453735, + "learning_rate": 9.99692230423031e-07, + "epoch": 0.5531112507856694, "step": 330 }, { - "loss": 0.5667, - "grad_norm": 0.220821350812912, - "learning_rate": 7.801263339765994e-05, - "epoch": 0.6169199364935359, + "loss": 1.4255, + "grad_norm": 0.5451828241348267, + "learning_rate": 9.994528977472987e-07, + "epoch": 0.5698721977791745, "step": 340 }, { - "loss": 0.6526, - "grad_norm": 0.2224995642900467, - "learning_rate": 7.188199707850122e-05, - "epoch": 0.6350646405080517, + "loss": 1.458, + "grad_norm": 0.6325808167457581, + "learning_rate": 9.991452404374136e-07, + "epoch": 0.5866331447726797, "step": 350 }, { - "eval_loss": 0.6001651287078857, - "eval_runtime": 75.1345, - "eval_samples_per_second": 6.082, - "eval_steps_per_second": 3.048, - "epoch": 0.6350646405080517, + "eval_loss": 1.4532791376113892, + "eval_runtime": 78.8426, + "eval_samples_per_second": 6.316, + "eval_steps_per_second": 3.158, + "epoch": 0.5866331447726797, "step": 350 }, { - "loss": 0.6697, - "grad_norm": 0.20313851535320282, - "learning_rate": 6.586412610834221e-05, - "epoch": 0.6532093445225675, + "loss": 1.2395, + "grad_norm": 0.582156777381897, + "learning_rate": 9.98769300580574e-07, + "epoch": 0.6033940917661847, "step": 360 }, { - "loss": 0.5862, - "grad_norm": 0.1557992547750473, - "learning_rate": 5.998315475169942e-05, - "epoch": 0.6713540485370832, + "loss": 1.2645, + "grad_norm": 0.6619167923927307, + "learning_rate": 9.983251296049592e-07, + "epoch": 0.6201550387596899, "step": 370 }, { - "loss": 0.6341, - "grad_norm": 0.29822641611099243, - "learning_rate": 5.4262668246448475e-05, - "epoch": 0.689498752551599, + "loss": 1.2875, + "grad_norm": 0.5032066106796265, + "learning_rate": 9.978127882726946e-07, + "epoch": 0.6369159857531951, "step": 380 }, { - "loss": 0.6841, - "grad_norm": 0.30022329092025757, - "learning_rate": 4.872560821682256e-05, - "epoch": 0.7076434565661147, + "loss": 1.3396, + "grad_norm": 0.739372968673706, + "learning_rate": 9.97232346671538e-07, + "epoch": 0.6536769327467001, "step": 390 }, { - "loss": 0.729, - "grad_norm": 0.26096341013908386, - "learning_rate": 4.339418066758008e-05, - "epoch": 0.7257881605806306, + "loss": 1.4678, + "grad_norm": 0.7378211617469788, + "learning_rate": 9.96583884205294e-07, + "epoch": 0.6704378797402053, "step": 400 }, { - "eval_loss": 0.5905945897102356, - "eval_runtime": 76.6602, - "eval_samples_per_second": 5.961, - "eval_steps_per_second": 2.987, - "epoch": 0.7257881605806306, + "eval_loss": 1.4118926525115967, + "eval_runtime": 80.1337, + "eval_samples_per_second": 6.215, + "eval_steps_per_second": 3.107, + "epoch": 0.6704378797402053, "step": 400 }, { - "loss": 0.6138, - "grad_norm": 0.2632121741771698, - "learning_rate": 3.828976692832458e-05, - "epoch": 0.7439328645951463, + "loss": 1.2475, + "grad_norm": 0.5222254991531372, + "learning_rate": 9.958674895829497e-07, + "epoch": 0.6871988267337105, "step": 410 }, { - "loss": 0.6976, - "grad_norm": 0.24841086566448212, - "learning_rate": 3.343283790512829e-05, - "epoch": 0.762077568609662, + "loss": 1.4585, + "grad_norm": 0.8027982115745544, + "learning_rate": 9.950832608065402e-07, + "epoch": 0.7039597737272156, "step": 420 }, { - "loss": 0.7324, - "grad_norm": 0.33077147603034973, - "learning_rate": 2.8842871983347998e-05, - "epoch": 0.7802222726241779, + "loss": 1.3031, + "grad_norm": 0.5209378600120544, + "learning_rate": 9.942313051577426e-07, + "epoch": 0.7207207207207207, "step": 430 }, { - "loss": 0.5743, - "grad_norm": 0.29825517535209656, - "learning_rate": 2.45382769108779e-05, - "epoch": 0.7983669766386936, + "loss": 1.4187, + "grad_norm": 0.6038073897361755, + "learning_rate": 9.933117391831984e-07, + "epoch": 0.7374816677142259, "step": 440 }, { - "loss": 0.5662, - "grad_norm": 0.18266697227954865, - "learning_rate": 2.0536315975121544e-05, - "epoch": 0.8165116806532093, + "loss": 1.335, + "grad_norm": 0.7092902660369873, + "learning_rate": 9.923246886785712e-07, + "epoch": 0.754242614707731, "step": 450 }, { - "eval_loss": 0.5785723924636841, - "eval_runtime": 74.3692, - "eval_samples_per_second": 6.145, - "eval_steps_per_second": 3.079, - "epoch": 0.8165116806532093, + "eval_loss": 1.37515389919281, + "eval_runtime": 78.7004, + "eval_samples_per_second": 6.328, + "eval_steps_per_second": 3.164, + "epoch": 0.754242614707731, "step": 450 }, { - "loss": 0.6846, - "grad_norm": 0.37334564328193665, - "learning_rate": 1.6853038769745467e-05, - "epoch": 0.8346563846677251, + "loss": 1.2147, + "grad_norm": 0.6610886454582214, + "learning_rate": 9.912702886713383e-07, + "epoch": 0.7710035617012361, "step": 460 }, { - "loss": 0.6121, - "grad_norm": 0.31827959418296814, - "learning_rate": 1.3503216828869192e-05, - "epoch": 0.8528010886822409, + "loss": 1.342, + "grad_norm": 0.6342706084251404, + "learning_rate": 9.901486834023181e-07, + "epoch": 0.7877645086947412, "step": 470 }, { - "loss": 0.719, - "grad_norm": 0.5225608348846436, - "learning_rate": 1.0500284386826597e-05, - "epoch": 0.8709457926967566, + "loss": 1.309, + "grad_norm": 0.8199939727783203, + "learning_rate": 9.889600263059384e-07, + "epoch": 0.8045254556882464, "step": 480 }, { - "loss": 0.6807, - "grad_norm": 0.2689962387084961, - "learning_rate": 7.856284501077926e-06, - "epoch": 0.8890904967112724, + "loss": 1.3816, + "grad_norm": 0.8439438939094543, + "learning_rate": 9.87704479989247e-07, + "epoch": 0.8212864026817516, "step": 490 }, { - "loss": 0.5893, - "grad_norm": 0.34263530373573303, - "learning_rate": 5.581820754342137e-06, - "epoch": 0.9072352007257881, + "loss": 1.3429, + "grad_norm": 0.7948578596115112, + "learning_rate": 9.863822162096672e-07, + "epoch": 0.8380473496752566, "step": 500 }, { - "eval_loss": 0.5692603588104248, - "eval_runtime": 74.4074, - "eval_samples_per_second": 6.142, - "eval_steps_per_second": 3.078, - "epoch": 0.9072352007257881, + "eval_loss": 1.341338038444519, + "eval_runtime": 78.8544, + "eval_samples_per_second": 6.315, + "eval_steps_per_second": 3.158, + "epoch": 0.8380473496752566, "step": 500 }, { - "loss": 0.5906, - "grad_norm": 0.28205356001853943, - "learning_rate": 3.6860147296457816e-06, - "epoch": 0.925379904740304, + "loss": 1.3624, + "grad_norm": 0.6835984587669373, + "learning_rate": 9.849934158515017e-07, + "epoch": 0.8548082966687618, "step": 510 }, { - "loss": 0.6412, - "grad_norm": 0.3020014464855194, - "learning_rate": 2.1764694288310184e-06, - "epoch": 0.9435246087548197, + "loss": 1.1831, + "grad_norm": 0.6039908528327942, + "learning_rate": 9.835382689011868e-07, + "epoch": 0.871569243662267, "step": 520 }, { - "loss": 0.7172, - "grad_norm": 0.3621278703212738, - "learning_rate": 1.0592387812310311e-06, - "epoch": 0.9616693127693354, + "loss": 1.243, + "grad_norm": 0.5669766664505005, + "learning_rate": 9.82016974421304e-07, + "epoch": 0.888330190655772, "step": 530 }, { - "loss": 0.5935, - "grad_norm": 0.31613534688949585, - "learning_rate": 3.3880336479590325e-07, - "epoch": 0.9798140167838512, + "loss": 1.2414, + "grad_norm": 0.724523663520813, + "learning_rate": 9.804297405233474e-07, + "epoch": 0.9050911376492772, "step": 540 }, { - "loss": 0.5792, - "grad_norm": 0.4579828679561615, - "learning_rate": 1.8052437037707758e-08, - "epoch": 0.997958720798367, + "loss": 1.2665, + "grad_norm": 0.5642393231391907, + "learning_rate": 9.787767843392544e-07, + "epoch": 0.9218520846427823, "step": 550 }, { - "eval_loss": 0.5670668482780457, - "eval_runtime": 72.4806, - "eval_samples_per_second": 6.305, - "eval_steps_per_second": 3.159, - "epoch": 0.997958720798367, + "eval_loss": 1.3112993240356445, + "eval_runtime": 80.5311, + "eval_samples_per_second": 6.184, + "eval_steps_per_second": 3.092, + "epoch": 0.9218520846427823, "step": 550 }, { - "train_runtime": 4669.6027, - "train_samples_per_second": 1.888, - "train_steps_per_second": 0.118, - "total_flos": 8.286147539211264e+16, - "train_loss": 0.7052618392567703, - "epoch": 1.0, - "step": 552, - "total_runtime_sec": 4670.917282342911 + "loss": 1.2605, + "grad_norm": 0.6140392422676086, + "learning_rate": 9.770583319917029e-07, + "epoch": 0.9386130316362874, + "step": 560 + }, + { + "loss": 1.2227, + "grad_norm": 0.6274238228797913, + "learning_rate": 9.752746185631766e-07, + "epoch": 0.9553739786297926, + "step": 570 + }, + { + "loss": 1.2693, + "grad_norm": 0.5612761974334717, + "learning_rate": 9.734258880638076e-07, + "epoch": 0.9721349256232977, + "step": 580 + }, + { + "loss": 1.3072, + "grad_norm": 0.7661623954772949, + "learning_rate": 9.715123933979953e-07, + "epoch": 0.9888958726168029, + "step": 590 + }, + { + "loss": 1.2428, + "grad_norm": 0.649665892124176, + "learning_rate": 9.695343963298086e-07, + "epoch": 1.0050282840980516, + "step": 600 + }, + { + "eval_loss": 1.2851722240447998, + "eval_runtime": 79.0258, + "eval_samples_per_second": 6.302, + "eval_steps_per_second": 3.151, + "epoch": 1.0050282840980516, + "step": 600 + }, + { + "loss": 1.263, + "grad_norm": 0.6536946892738342, + "learning_rate": 9.674921674471785e-07, + "epoch": 1.0217892310915566, + "step": 610 + }, + { + "loss": 1.2098, + "grad_norm": 0.5894418358802795, + "learning_rate": 9.653859861248805e-07, + "epoch": 1.038550178085062, + "step": 620 + }, + { + "loss": 1.2002, + "grad_norm": 0.5719261169433594, + "learning_rate": 9.632161404863174e-07, + "epoch": 1.055311125078567, + "step": 630 + }, + { + "loss": 1.3116, + "grad_norm": 0.7122625112533569, + "learning_rate": 9.609829273641032e-07, + "epoch": 1.072072072072072, + "step": 640 + }, + { + "loss": 1.0923, + "grad_norm": 0.8119747042655945, + "learning_rate": 9.586866522594584e-07, + "epoch": 1.0888330190655773, + "step": 650 + }, + { + "eval_loss": 1.2604608535766602, + "eval_runtime": 79.0547, + "eval_samples_per_second": 6.299, + "eval_steps_per_second": 3.15, + "epoch": 1.0888330190655773, + "step": 650 + }, + { + "loss": 1.2157, + "grad_norm": 0.5922772288322449, + "learning_rate": 9.563276293004155e-07, + "epoch": 1.1055939660590823, + "step": 660 + }, + { + "loss": 1.2892, + "grad_norm": 0.6105348467826843, + "learning_rate": 9.53906181198849e-07, + "epoch": 1.1223549130525874, + "step": 670 + }, + { + "loss": 1.1476, + "grad_norm": 0.7103086113929749, + "learning_rate": 9.51422639206327e-07, + "epoch": 1.1391158600460927, + "step": 680 + }, + { + "loss": 1.108, + "grad_norm": 0.716642439365387, + "learning_rate": 9.488773430687973e-07, + "epoch": 1.1558768070395977, + "step": 690 + }, + { + "loss": 1.3284, + "grad_norm": 0.6440051794052124, + "learning_rate": 9.4627064098011e-07, + "epoch": 1.1726377540331028, + "step": 700 + }, + { + "eval_loss": 1.2372387647628784, + "eval_runtime": 78.588, + "eval_samples_per_second": 6.337, + "eval_steps_per_second": 3.168, + "epoch": 1.1726377540331028, + "step": 700 + }, + { + "loss": 1.0961, + "grad_norm": 0.5385606288909912, + "learning_rate": 9.436028895343848e-07, + "epoch": 1.189398701026608, + "step": 710 + }, + { + "loss": 1.1597, + "grad_norm": 0.8156595230102539, + "learning_rate": 9.408744536772303e-07, + "epoch": 1.206159648020113, + "step": 720 + }, + { + "loss": 1.1478, + "grad_norm": 0.7858404517173767, + "learning_rate": 9.380857066558184e-07, + "epoch": 1.2229205950136182, + "step": 730 + }, + { + "loss": 1.2074, + "grad_norm": 0.7118250727653503, + "learning_rate": 9.352370299678258e-07, + "epoch": 1.2396815420071234, + "step": 740 + }, + { + "loss": 1.1792, + "grad_norm": 0.7732047438621521, + "learning_rate": 9.323288133092445e-07, + "epoch": 1.2564424890006285, + "step": 750 + }, + { + "eval_loss": 1.2164807319641113, + "eval_runtime": 78.9387, + "eval_samples_per_second": 6.309, + "eval_steps_per_second": 3.154, + "epoch": 1.2564424890006285, + "step": 750 + }, + { + "loss": 0.9987, + "grad_norm": 0.7803258895874023, + "learning_rate": 9.293614545210724e-07, + "epoch": 1.2732034359941338, + "step": 760 + }, + { + "loss": 1.2523, + "grad_norm": 0.69403475522995, + "learning_rate": 9.263353595348891e-07, + "epoch": 1.2899643829876388, + "step": 770 + }, + { + "loss": 1.1585, + "grad_norm": 0.7996134161949158, + "learning_rate": 9.23250942317324e-07, + "epoch": 1.3067253299811439, + "step": 780 + }, + { + "loss": 1.1139, + "grad_norm": 0.6396450400352478, + "learning_rate": 9.201086248134276e-07, + "epoch": 1.323486276974649, + "step": 790 + }, + { + "loss": 1.1116, + "grad_norm": 0.6725043058395386, + "learning_rate": 9.169088368889484e-07, + "epoch": 1.3402472239681542, + "step": 800 + }, + { + "eval_loss": 1.1973844766616821, + "eval_runtime": 80.6555, + "eval_samples_per_second": 6.174, + "eval_steps_per_second": 3.087, + "epoch": 1.3402472239681542, + "step": 800 + }, + { + "loss": 1.0994, + "grad_norm": 0.7814534306526184, + "learning_rate": 9.136520162715286e-07, + "epoch": 1.3570081709616593, + "step": 810 + }, + { + "loss": 1.0118, + "grad_norm": 0.7589584589004517, + "learning_rate": 9.103386084908233e-07, + "epoch": 1.3737691179551645, + "step": 820 + }, + { + "loss": 1.1388, + "grad_norm": 0.7946352958679199, + "learning_rate": 9.069690668175519e-07, + "epoch": 1.3905300649486696, + "step": 830 + }, + { + "loss": 1.1872, + "grad_norm": 1.228780746459961, + "learning_rate": 9.035438522014923e-07, + "epoch": 1.4072910119421747, + "step": 840 + }, + { + "loss": 1.1591, + "grad_norm": 0.6566089987754822, + "learning_rate": 9.000634332084219e-07, + "epoch": 1.42405195893568, + "step": 850 + }, + { + "eval_loss": 1.1793015003204346, + "eval_runtime": 80.6691, + "eval_samples_per_second": 6.173, + "eval_steps_per_second": 3.087, + "epoch": 1.42405195893568, + "step": 850 + }, + { + "loss": 1.1832, + "grad_norm": 0.8144505023956299, + "learning_rate": 8.9652828595602e-07, + "epoch": 1.440812905929185, + "step": 860 + }, + { + "loss": 0.9717, + "grad_norm": 0.659873366355896, + "learning_rate": 8.929388940487338e-07, + "epoch": 1.4575738529226903, + "step": 870 + }, + { + "loss": 1.0355, + "grad_norm": 0.703195333480835, + "learning_rate": 8.892957485116233e-07, + "epoch": 1.4743347999161953, + "step": 880 + }, + { + "loss": 0.9693, + "grad_norm": 0.6595771908760071, + "learning_rate": 8.855993477231883e-07, + "epoch": 1.4910957469097004, + "step": 890 + }, + { + "loss": 1.1721, + "grad_norm": 0.9032782912254333, + "learning_rate": 8.818501973471912e-07, + "epoch": 1.5078566939032054, + "step": 900 + }, + { + "eval_loss": 1.1632665395736694, + "eval_runtime": 78.9179, + "eval_samples_per_second": 6.31, + "eval_steps_per_second": 3.155, + "epoch": 1.5078566939032054, + "step": 900 + }, + { + "loss": 1.1077, + "grad_norm": 0.7769476175308228, + "learning_rate": 8.780488102634836e-07, + "epoch": 1.5246176408967107, + "step": 910 + }, + { + "loss": 1.1579, + "grad_norm": 0.8173115849494934, + "learning_rate": 8.741957064978433e-07, + "epoch": 1.5413785878902158, + "step": 920 + }, + { + "loss": 1.0498, + "grad_norm": 0.774395763874054, + "learning_rate": 8.702914131508365e-07, + "epoch": 1.558139534883721, + "step": 930 + }, + { + "loss": 1.0696, + "grad_norm": 0.7452784180641174, + "learning_rate": 8.663364643257104e-07, + "epoch": 1.574900481877226, + "step": 940 + }, + { + "loss": 1.214, + "grad_norm": 0.7853773236274719, + "learning_rate": 8.623314010553288e-07, + "epoch": 1.5916614288707311, + "step": 950 + }, + { + "eval_loss": 1.1473941802978516, + "eval_runtime": 80.6501, + "eval_samples_per_second": 6.175, + "eval_steps_per_second": 3.087, + "epoch": 1.5916614288707311, + "step": 950 + }, + { + "loss": 1.2111, + "grad_norm": 0.7786531448364258, + "learning_rate": 8.582767712281591e-07, + "epoch": 1.6084223758642362, + "step": 960 + }, + { + "loss": 1.0599, + "grad_norm": 0.78724604845047, + "learning_rate": 8.541731295133219e-07, + "epoch": 1.6251833228577415, + "step": 970 + }, + { + "loss": 1.0114, + "grad_norm": 0.9002764821052551, + "learning_rate": 8.500210372847126e-07, + "epoch": 1.6419442698512468, + "step": 980 + }, + { + "loss": 0.9775, + "grad_norm": 0.8179922103881836, + "learning_rate": 8.458210625442068e-07, + "epoch": 1.6587052168447518, + "step": 990 + }, + { + "loss": 1.1444, + "grad_norm": 0.6862105131149292, + "learning_rate": 8.415737798439568e-07, + "epoch": 1.6754661638382569, + "step": 1000 + }, + { + "eval_loss": 1.1338735818862915, + "eval_runtime": 78.5686, + "eval_samples_per_second": 6.338, + "eval_steps_per_second": 3.169, + "epoch": 1.6754661638382569, + "step": 1000 + }, + { + "loss": 1.0762, + "grad_norm": 0.7431554794311523, + "learning_rate": 8.372797702077952e-07, + "epoch": 1.692227110831762, + "step": 1010 + }, + { + "loss": 1.0976, + "grad_norm": 0.7127805948257446, + "learning_rate": 8.329396210517496e-07, + "epoch": 1.708988057825267, + "step": 1020 + }, + { + "loss": 1.1504, + "grad_norm": 0.9699788093566895, + "learning_rate": 8.285539261036868e-07, + "epoch": 1.7257490048187722, + "step": 1030 + }, + { + "loss": 1.0243, + "grad_norm": 0.6960281729698181, + "learning_rate": 8.241232853220894e-07, + "epoch": 1.7425099518122775, + "step": 1040 + }, + { + "loss": 1.1155, + "grad_norm": 0.9629572033882141, + "learning_rate": 8.196483048139834e-07, + "epoch": 1.7592708988057826, + "step": 1050 + }, + { + "eval_loss": 1.122306227684021, + "eval_runtime": 78.8404, + "eval_samples_per_second": 6.317, + "eval_steps_per_second": 3.158, + "epoch": 1.7592708988057826, + "step": 1050 + }, + { + "loss": 1.0379, + "grad_norm": 0.9436600208282471, + "learning_rate": 8.151295967520231e-07, + "epoch": 1.7760318457992876, + "step": 1060 + }, + { + "loss": 0.9271, + "grad_norm": 0.8760473132133484, + "learning_rate": 8.105677792907462e-07, + "epoch": 1.7927927927927927, + "step": 1070 + }, + { + "loss": 1.1017, + "grad_norm": 0.8457250595092773, + "learning_rate": 8.059634764820114e-07, + "epoch": 1.809553739786298, + "step": 1080 + }, + { + "loss": 1.0982, + "grad_norm": 0.7704038619995117, + "learning_rate": 8.013173181896282e-07, + "epoch": 1.826314686779803, + "step": 1090 + }, + { + "loss": 0.9232, + "grad_norm": 0.8354272246360779, + "learning_rate": 7.966299400031928e-07, + "epoch": 1.8430756337733083, + "step": 1100 + }, + { + "eval_loss": 1.1116451025009155, + "eval_runtime": 78.8427, + "eval_samples_per_second": 6.316, + "eval_steps_per_second": 3.158, + "epoch": 1.8430756337733083, + "step": 1100 + }, + { + "loss": 0.9978, + "grad_norm": 0.8250349164009094, + "learning_rate": 7.919019831511399e-07, + "epoch": 1.8598365807668134, + "step": 1110 + }, + { + "loss": 1.0313, + "grad_norm": 0.7708654999732971, + "learning_rate": 7.871340944130228e-07, + "epoch": 1.8765975277603184, + "step": 1120 + }, + { + "loss": 1.0518, + "grad_norm": 0.6994651556015015, + "learning_rate": 7.823269260310351e-07, + "epoch": 1.8933584747538235, + "step": 1130 + }, + { + "loss": 1.0081, + "grad_norm": 0.8944517374038696, + "learning_rate": 7.774811356207851e-07, + "epoch": 1.9101194217473287, + "step": 1140 + }, + { + "loss": 1.0257, + "grad_norm": 0.8253750801086426, + "learning_rate": 7.725973860813338e-07, + "epoch": 1.926880368740834, + "step": 1150 + }, + { + "eval_loss": 1.1017853021621704, + "eval_runtime": 78.7751, + "eval_samples_per_second": 6.322, + "eval_steps_per_second": 3.161, + "epoch": 1.926880368740834, + "step": 1150 + }, + { + "loss": 0.9687, + "grad_norm": 0.7468390464782715, + "learning_rate": 7.676763455045113e-07, + "epoch": 1.943641315734339, + "step": 1160 + }, + { + "loss": 0.9275, + "grad_norm": 0.8286957740783691, + "learning_rate": 7.627186870835228e-07, + "epoch": 1.9604022627278441, + "step": 1170 + }, + { + "loss": 0.9793, + "grad_norm": 0.7848206758499146, + "learning_rate": 7.577250890208564e-07, + "epoch": 1.9771632097213492, + "step": 1180 + }, + { + "loss": 0.9569, + "grad_norm": 0.8074430823326111, + "learning_rate": 7.526962344355055e-07, + "epoch": 1.9939241567148542, + "step": 1190 + }, + { + "loss": 0.9004, + "grad_norm": 0.8617448210716248, + "learning_rate": 7.476328112695185e-07, + "epoch": 2.010056568196103, + "step": 1200 + }, + { + "eval_loss": 1.0935511589050293, + "eval_runtime": 79.7429, + "eval_samples_per_second": 6.245, + "eval_steps_per_second": 3.123, + "epoch": 2.010056568196103, + "step": 1200 + }, + { + "loss": 0.929, + "grad_norm": 1.1476644277572632, + "learning_rate": 7.425355121938901e-07, + "epoch": 2.026817515189608, + "step": 1210 + }, + { + "loss": 1.014, + "grad_norm": 0.8628258109092712, + "learning_rate": 7.37405034513804e-07, + "epoch": 2.0435784621831132, + "step": 1220 + }, + { + "loss": 1.0312, + "grad_norm": 0.8636768460273743, + "learning_rate": 7.322420800732418e-07, + "epoch": 2.0603394091766183, + "step": 1230 + }, + { + "loss": 1.0864, + "grad_norm": 0.6343582272529602, + "learning_rate": 7.270473551589723e-07, + "epoch": 2.077100356170124, + "step": 1240 + }, + { + "loss": 1.1058, + "grad_norm": 0.9320090413093567, + "learning_rate": 7.218215704039321e-07, + "epoch": 2.093861303163629, + "step": 1250 + }, + { + "eval_loss": 1.0855053663253784, + "eval_runtime": 86.4087, + "eval_samples_per_second": 5.763, + "eval_steps_per_second": 2.882, + "epoch": 2.093861303163629, + "step": 1250 + }, + { + "loss": 1.0237, + "grad_norm": 0.8435965776443481, + "learning_rate": 7.16565440690011e-07, + "epoch": 2.110622250157134, + "step": 1260 + }, + { + "loss": 1.0289, + "grad_norm": 2.9359328746795654, + "learning_rate": 7.112796850502578e-07, + "epoch": 2.127383197150639, + "step": 1270 + }, + { + "loss": 1.1569, + "grad_norm": 0.8461847901344299, + "learning_rate": 7.05965026570517e-07, + "epoch": 2.144144144144144, + "step": 1280 + }, + { + "loss": 1.0697, + "grad_norm": 0.7913004755973816, + "learning_rate": 7.006221922905111e-07, + "epoch": 2.160905091137649, + "step": 1290 + }, + { + "loss": 1.0053, + "grad_norm": 0.9257745146751404, + "learning_rate": 6.95251913104383e-07, + "epoch": 2.1776660381311546, + "step": 1300 + }, + { + "eval_loss": 1.0785434246063232, + "eval_runtime": 91.5293, + "eval_samples_per_second": 5.441, + "eval_steps_per_second": 2.72, + "epoch": 2.1776660381311546, + "step": 1300 + }, + { + "loss": 0.9318, + "grad_norm": 1.099394679069519, + "learning_rate": 6.898549236607098e-07, + "epoch": 2.1944269851246596, + "step": 1310 + }, + { + "loss": 0.9994, + "grad_norm": 1.126979947090149, + "learning_rate": 6.844319622620039e-07, + "epoch": 2.2111879321181647, + "step": 1320 + }, + { + "loss": 1.0099, + "grad_norm": 0.7986570596694946, + "learning_rate": 6.789837707637142e-07, + "epoch": 2.2279488791116697, + "step": 1330 + }, + { + "loss": 0.9439, + "grad_norm": 0.7648953199386597, + "learning_rate": 6.735110944727404e-07, + "epoch": 2.2447098261051748, + "step": 1340 + }, + { + "loss": 1.034, + "grad_norm": 1.128675103187561, + "learning_rate": 6.68014682045477e-07, + "epoch": 2.2614707730986803, + "step": 1350 + }, + { + "eval_loss": 1.0722501277923584, + "eval_runtime": 87.4702, + "eval_samples_per_second": 5.693, + "eval_steps_per_second": 2.847, + "epoch": 2.2614707730986803, + "step": 1350 + }, + { + "loss": 0.9582, + "grad_norm": 1.0245682001113892, + "learning_rate": 6.624952853853962e-07, + "epoch": 2.2782317200921853, + "step": 1360 + }, + { + "loss": 0.999, + "grad_norm": 0.8576367497444153, + "learning_rate": 6.569536595401899e-07, + "epoch": 2.2949926670856904, + "step": 1370 + }, + { + "loss": 0.8678, + "grad_norm": 0.8585372567176819, + "learning_rate": 6.513905625984792e-07, + "epoch": 2.3117536140791954, + "step": 1380 + }, + { + "loss": 1.0609, + "grad_norm": 0.7375667095184326, + "learning_rate": 6.458067555861082e-07, + "epoch": 2.3285145610727005, + "step": 1390 + }, + { + "loss": 0.9888, + "grad_norm": 0.892026424407959, + "learning_rate": 6.402030023620378e-07, + "epoch": 2.3452755080662055, + "step": 1400 + }, + { + "eval_loss": 1.0668587684631348, + "eval_runtime": 88.1591, + "eval_samples_per_second": 5.649, + "eval_steps_per_second": 2.824, + "epoch": 2.3452755080662055, + "step": 1400 + }, + { + "loss": 0.9441, + "grad_norm": 0.9670091867446899, + "learning_rate": 6.345800695138491e-07, + "epoch": 2.362036455059711, + "step": 1410 + }, + { + "loss": 1.0477, + "grad_norm": 0.7848078608512878, + "learning_rate": 6.289387262528765e-07, + "epoch": 2.378797402053216, + "step": 1420 + }, + { + "loss": 1.0631, + "grad_norm": 0.8168392181396484, + "learning_rate": 6.232797443089786e-07, + "epoch": 2.395558349046721, + "step": 1430 + }, + { + "loss": 1.0002, + "grad_norm": 1.408933162689209, + "learning_rate": 6.176038978249682e-07, + "epoch": 2.412319296040226, + "step": 1440 + }, + { + "loss": 1.0355, + "grad_norm": 0.7367719411849976, + "learning_rate": 6.119119632507095e-07, + "epoch": 2.4290802430337313, + "step": 1450 + }, + { + "eval_loss": 1.0617327690124512, + "eval_runtime": 91.8946, + "eval_samples_per_second": 5.419, + "eval_steps_per_second": 2.71, + "epoch": 2.4290802430337313, + "step": 1450 + }, + { + "loss": 1.0361, + "grad_norm": 1.1367297172546387, + "learning_rate": 6.062047192369002e-07, + "epoch": 2.4458411900272363, + "step": 1460 + }, + { + "loss": 0.887, + "grad_norm": 0.9835084080696106, + "learning_rate": 6.004829465285534e-07, + "epoch": 2.462602137020742, + "step": 1470 + }, + { + "loss": 0.9896, + "grad_norm": 0.7928352355957031, + "learning_rate": 5.947474278581929e-07, + "epoch": 2.479363084014247, + "step": 1480 + }, + { + "loss": 0.9035, + "grad_norm": 1.2313053607940674, + "learning_rate": 5.889989478387752e-07, + "epoch": 2.496124031007752, + "step": 1490 + }, + { + "loss": 0.9664, + "grad_norm": 1.1574859619140625, + "learning_rate": 5.832382928563559e-07, + "epoch": 2.512884978001257, + "step": 1500 + }, + { + "eval_loss": 1.0569761991500854, + "eval_runtime": 89.9236, + "eval_samples_per_second": 5.538, + "eval_steps_per_second": 2.769, + "epoch": 2.512884978001257, + "step": 1500 + }, + { + "loss": 0.9718, + "grad_norm": 0.8173345923423767, + "learning_rate": 5.77466250962513e-07, + "epoch": 2.529645924994762, + "step": 1510 + }, + { + "loss": 0.9839, + "grad_norm": 0.9848146438598633, + "learning_rate": 5.71683611766542e-07, + "epoch": 2.5464068719882675, + "step": 1520 + }, + { + "loss": 0.9926, + "grad_norm": 1.1053595542907715, + "learning_rate": 5.658911663274381e-07, + "epoch": 2.5631678189817726, + "step": 1530 + }, + { + "loss": 1.0227, + "grad_norm": 1.3307827711105347, + "learning_rate": 5.600897070456806e-07, + "epoch": 2.5799287659752776, + "step": 1540 + }, + { + "loss": 1.0063, + "grad_norm": 0.9872063994407654, + "learning_rate": 5.542800275548328e-07, + "epoch": 2.5966897129687827, + "step": 1550 + }, + { + "eval_loss": 1.0520267486572266, + "eval_runtime": 91.1026, + "eval_samples_per_second": 5.466, + "eval_steps_per_second": 2.733, + "epoch": 2.5966897129687827, + "step": 1550 + }, + { + "loss": 1.0276, + "grad_norm": 0.9122388362884521, + "learning_rate": 5.484629226129741e-07, + "epoch": 2.6134506599622878, + "step": 1560 + }, + { + "loss": 1.0624, + "grad_norm": 0.9454676508903503, + "learning_rate": 5.426391879939778e-07, + "epoch": 2.6302116069557933, + "step": 1570 + }, + { + "loss": 0.9732, + "grad_norm": 1.2247437238693237, + "learning_rate": 5.368096203786499e-07, + "epoch": 2.646972553949298, + "step": 1580 + }, + { + "loss": 0.974, + "grad_norm": 0.9937558174133301, + "learning_rate": 5.30975017245745e-07, + "epoch": 2.6637335009428034, + "step": 1590 + }, + { + "loss": 0.8743, + "grad_norm": 1.4427547454833984, + "learning_rate": 5.251361767628701e-07, + "epoch": 2.6804944479363084, + "step": 1600 + }, + { + "eval_loss": 1.047125220298767, + "eval_runtime": 83.9836, + "eval_samples_per_second": 5.93, + "eval_steps_per_second": 2.965, + "epoch": 2.6804944479363084, + "step": 1600 + }, + { + "loss": 0.9842, + "grad_norm": 0.8265151381492615, + "learning_rate": 5.192938976772981e-07, + "epoch": 2.6972553949298135, + "step": 1610 + }, + { + "loss": 0.8733, + "grad_norm": 0.9592719078063965, + "learning_rate": 5.134489792066985e-07, + "epoch": 2.7140163419233185, + "step": 1620 + }, + { + "loss": 1.0066, + "grad_norm": 0.7584534287452698, + "learning_rate": 5.076022209298067e-07, + "epoch": 2.7307772889168236, + "step": 1630 + }, + { + "loss": 0.9494, + "grad_norm": 1.0090166330337524, + "learning_rate": 5.01754422677041e-07, + "epoch": 2.747538235910329, + "step": 1640 + }, + { + "loss": 1.0154, + "grad_norm": 0.8638355135917664, + "learning_rate": 4.959063844210877e-07, + "epoch": 2.764299182903834, + "step": 1650 + }, + { + "eval_loss": 1.043318510055542, + "eval_runtime": 83.5648, + "eval_samples_per_second": 5.959, + "eval_steps_per_second": 2.98, + "epoch": 2.764299182903834, + "step": 1650 + }, + { + "loss": 0.9535, + "grad_norm": 1.0650914907455444, + "learning_rate": 4.900589061674649e-07, + "epoch": 2.781060129897339, + "step": 1660 + }, + { + "loss": 1.0118, + "grad_norm": 0.974525511264801, + "learning_rate": 4.842127878450835e-07, + "epoch": 2.7978210768908442, + "step": 1670 + }, + { + "loss": 0.9973, + "grad_norm": 1.2957974672317505, + "learning_rate": 4.783688291968167e-07, + "epoch": 2.8145820238843493, + "step": 1680 + }, + { + "loss": 0.9681, + "grad_norm": 1.5214028358459473, + "learning_rate": 4.7252782967009695e-07, + "epoch": 2.831342970877855, + "step": 1690 + }, + { + "loss": 0.8866, + "grad_norm": 0.826191782951355, + "learning_rate": 4.666905883075516e-07, + "epoch": 2.84810391787136, + "step": 1700 + }, + { + "eval_loss": 1.0400274991989136, + "eval_runtime": 87.0299, + "eval_samples_per_second": 5.722, + "eval_steps_per_second": 2.861, + "epoch": 2.84810391787136, + "step": 1700 + }, + { + "loss": 0.8643, + "grad_norm": 1.1714636087417603, + "learning_rate": 4.608579036376955e-07, + "epoch": 2.864864864864865, + "step": 1710 + }, + { + "loss": 1.0582, + "grad_norm": 1.2290012836456299, + "learning_rate": 4.5503057356569236e-07, + "epoch": 2.88162581185837, + "step": 1720 + }, + { + "loss": 0.9494, + "grad_norm": 0.9297714829444885, + "learning_rate": 4.492093952642027e-07, + "epoch": 2.898386758851875, + "step": 1730 + }, + { + "loss": 0.9734, + "grad_norm": 0.9026303887367249, + "learning_rate": 4.433951650643307e-07, + "epoch": 2.9151477058453805, + "step": 1740 + }, + { + "loss": 1.0392, + "grad_norm": 0.9304074048995972, + "learning_rate": 4.375886783466887e-07, + "epoch": 2.931908652838885, + "step": 1750 + }, + { + "eval_loss": 1.0371840000152588, + "eval_runtime": 85.5465, + "eval_samples_per_second": 5.821, + "eval_steps_per_second": 2.911, + "epoch": 2.931908652838885, + "step": 1750 + }, + { + "loss": 1.0665, + "grad_norm": 1.2075411081314087, + "learning_rate": 4.3179072943258764e-07, + "epoch": 2.9486695998323906, + "step": 1760 + }, + { + "loss": 0.9014, + "grad_norm": 1.3210104703903198, + "learning_rate": 4.2600211147537634e-07, + "epoch": 2.9654305468258957, + "step": 1770 + }, + { + "loss": 0.8844, + "grad_norm": 0.9938521385192871, + "learning_rate": 4.2022361635193835e-07, + "epoch": 2.9821914938194007, + "step": 1780 + }, + { + "loss": 1.0234, + "grad_norm": 1.0057061910629272, + "learning_rate": 4.1445603455436425e-07, + "epoch": 2.998952440812906, + "step": 1790 + }, + { + "loss": 0.9891, + "grad_norm": 0.8251981735229492, + "learning_rate": 4.0870015508181304e-07, + "epoch": 3.0150848522941547, + "step": 1800 + }, + { + "eval_loss": 1.0345975160598755, + "eval_runtime": 87.5549, + "eval_samples_per_second": 5.688, + "eval_steps_per_second": 2.844, + "epoch": 3.0150848522941547, + "step": 1800 + }, + { + "loss": 0.9519, + "grad_norm": 1.1525200605392456, + "learning_rate": 4.029567653325778e-07, + "epoch": 3.0318457992876597, + "step": 1810 + }, + { + "loss": 1.0024, + "grad_norm": 0.8204362392425537, + "learning_rate": 3.972266509963707e-07, + "epoch": 3.048606746281165, + "step": 1820 + }, + { + "loss": 0.9529, + "grad_norm": 1.0004171133041382, + "learning_rate": 3.9151059594684093e-07, + "epoch": 3.06536769327467, + "step": 1830 + }, + { + "loss": 0.9776, + "grad_norm": 0.9302512407302856, + "learning_rate": 3.858093821343425e-07, + "epoch": 3.0821286402681753, + "step": 1840 + }, + { + "loss": 0.8896, + "grad_norm": 1.0334570407867432, + "learning_rate": 3.801237894789632e-07, + "epoch": 3.0988895872616804, + "step": 1850 + }, + { + "eval_loss": 1.0320886373519897, + "eval_runtime": 80.1434, + "eval_samples_per_second": 6.214, + "eval_steps_per_second": 3.107, + "epoch": 3.0988895872616804, + "step": 1850 + }, + { + "loss": 0.9555, + "grad_norm": 0.9561256766319275, + "learning_rate": 3.744545957638332e-07, + "epoch": 3.1156505342551855, + "step": 1860 + }, + { + "loss": 0.972, + "grad_norm": 1.024395227432251, + "learning_rate": 3.6880257652872447e-07, + "epoch": 3.1324114812486905, + "step": 1870 + }, + { + "loss": 1.0468, + "grad_norm": 1.217880129814148, + "learning_rate": 3.6316850496395855e-07, + "epoch": 3.1491724282421956, + "step": 1880 + }, + { + "loss": 0.817, + "grad_norm": 1.033187985420227, + "learning_rate": 3.57553151804634e-07, + "epoch": 3.1659333752357006, + "step": 1890 + }, + { + "loss": 1.0248, + "grad_norm": 1.149246335029602, + "learning_rate": 3.519572852251914e-07, + "epoch": 3.182694322229206, + "step": 1900 + }, + { + "eval_loss": 1.0297226905822754, + "eval_runtime": 82.8338, + "eval_samples_per_second": 6.012, + "eval_steps_per_second": 3.006, + "epoch": 3.182694322229206, + "step": 1900 + }, + { + "loss": 0.9628, + "grad_norm": 1.001043438911438, + "learning_rate": 3.4638167073432693e-07, + "epoch": 3.199455269222711, + "step": 1910 + }, + { + "loss": 0.9998, + "grad_norm": 1.1501474380493164, + "learning_rate": 3.4082707107027343e-07, + "epoch": 3.2162162162162162, + "step": 1920 + }, + { + "loss": 1.0307, + "grad_norm": 0.9195724725723267, + "learning_rate": 3.352942460964564e-07, + "epoch": 3.2329771632097213, + "step": 1930 + }, + { + "loss": 0.8789, + "grad_norm": 1.1816115379333496, + "learning_rate": 3.297839526975467e-07, + "epoch": 3.2497381102032263, + "step": 1940 + }, + { + "loss": 0.9866, + "grad_norm": 1.1459033489227295, + "learning_rate": 3.242969446759195e-07, + "epoch": 3.2664990571967314, + "step": 1950 + }, + { + "eval_loss": 1.0278631448745728, + "eval_runtime": 82.4435, + "eval_samples_per_second": 6.041, + "eval_steps_per_second": 3.02, + "epoch": 3.2664990571967314, + "step": 1950 + }, + { + "loss": 1.0124, + "grad_norm": 1.1944814920425415, + "learning_rate": 3.188339726485344e-07, + "epoch": 3.283260004190237, + "step": 1960 + }, + { + "loss": 0.964, + "grad_norm": 1.3220574855804443, + "learning_rate": 3.133957839442526e-07, + "epoch": 3.300020951183742, + "step": 1970 + }, + { + "loss": 1.0585, + "grad_norm": 0.9459046721458435, + "learning_rate": 3.079831225016023e-07, + "epoch": 3.316781898177247, + "step": 1980 + }, + { + "loss": 1.0015, + "grad_norm": 1.162500023841858, + "learning_rate": 3.02596728767009e-07, + "epoch": 3.333542845170752, + "step": 1990 + }, + { + "loss": 0.9869, + "grad_norm": 1.0170260667800903, + "learning_rate": 2.9723733959350303e-07, + "epoch": 3.350303792164257, + "step": 2000 + }, + { + "eval_loss": 1.0256966352462769, + "eval_runtime": 80.2113, + "eval_samples_per_second": 6.209, + "eval_steps_per_second": 3.104, + "epoch": 3.350303792164257, + "step": 2000 + }, + { + "loss": 1.0388, + "grad_norm": 0.9814099669456482, + "learning_rate": 2.9190568813991957e-07, + "epoch": 3.3670647391577626, + "step": 2010 + }, + { + "loss": 1.0851, + "grad_norm": 0.835394561290741, + "learning_rate": 2.8660250377060216e-07, + "epoch": 3.3838256861512677, + "step": 2020 + }, + { + "loss": 0.9552, + "grad_norm": 0.8727148771286011, + "learning_rate": 2.8132851195562717e-07, + "epoch": 3.4005866331447727, + "step": 2030 + }, + { + "loss": 0.9742, + "grad_norm": 0.9619453549385071, + "learning_rate": 2.7608443417155997e-07, + "epoch": 3.4173475801382778, + "step": 2040 + }, + { + "loss": 0.8432, + "grad_norm": 0.8250086903572083, + "learning_rate": 2.708709878027584e-07, + "epoch": 3.434108527131783, + "step": 2050 + }, + { + "eval_loss": 1.023971676826477, + "eval_runtime": 80.4758, + "eval_samples_per_second": 6.188, + "eval_steps_per_second": 3.094, + "epoch": 3.434108527131783, + "step": 2050 + }, + { + "loss": 0.9283, + "grad_norm": 1.4034401178359985, + "learning_rate": 2.656888860432337e-07, + "epoch": 3.4508694741252883, + "step": 2060 + }, + { + "loss": 1.0269, + "grad_norm": 1.1039758920669556, + "learning_rate": 2.605388377990879e-07, + "epoch": 3.4676304211187934, + "step": 2070 + }, + { + "loss": 0.9756, + "grad_norm": 1.1378097534179688, + "learning_rate": 2.554215475915358e-07, + "epoch": 3.4843913681122984, + "step": 2080 + }, + { + "loss": 0.9271, + "grad_norm": 1.0743006467819214, + "learning_rate": 2.503377154605264e-07, + "epoch": 3.5011523151058035, + "step": 2090 + }, + { + "loss": 0.9881, + "grad_norm": 1.0452345609664917, + "learning_rate": 2.452880368689798e-07, + "epoch": 3.5179132620993085, + "step": 2100 + }, + { + "eval_loss": 1.0225834846496582, + "eval_runtime": 81.0513, + "eval_samples_per_second": 6.144, + "eval_steps_per_second": 3.072, + "epoch": 3.5179132620993085, + "step": 2100 + }, + { + "loss": 0.9588, + "grad_norm": 0.8060430288314819, + "learning_rate": 2.402732026076468e-07, + "epoch": 3.5346742090928136, + "step": 2110 + }, + { + "loss": 0.9306, + "grad_norm": 1.1182574033737183, + "learning_rate": 2.352938987006106e-07, + "epoch": 3.5514351560863187, + "step": 2120 + }, + { + "loss": 0.7893, + "grad_norm": 1.048654317855835, + "learning_rate": 2.3035080631143893e-07, + "epoch": 3.568196103079824, + "step": 2130 + }, + { + "loss": 0.9771, + "grad_norm": 0.9173722863197327, + "learning_rate": 2.254446016500019e-07, + "epoch": 3.584957050073329, + "step": 2140 + }, + { + "loss": 0.8972, + "grad_norm": 0.6042903661727905, + "learning_rate": 2.205759558799669e-07, + "epoch": 3.6017179970668343, + "step": 2150 + }, + { + "eval_loss": 1.0213065147399902, + "eval_runtime": 78.4141, + "eval_samples_per_second": 6.351, + "eval_steps_per_second": 3.175, + "epoch": 3.6017179970668343, + "step": 2150 + }, + { + "loss": 0.9735, + "grad_norm": 1.3135992288589478, + "learning_rate": 2.1574553502698434e-07, + "epoch": 3.6184789440603393, + "step": 2160 + }, + { + "loss": 0.9689, + "grad_norm": 0.8670032620429993, + "learning_rate": 2.1095399988757572e-07, + "epoch": 3.6352398910538444, + "step": 2170 + }, + { + "loss": 0.9137, + "grad_norm": 0.9747626185417175, + "learning_rate": 2.0620200593873816e-07, + "epoch": 3.65200083804735, + "step": 2180 + }, + { + "loss": 1.0097, + "grad_norm": 1.069161057472229, + "learning_rate": 2.0149020324827487e-07, + "epoch": 3.668761785040855, + "step": 2190 + }, + { + "loss": 0.9414, + "grad_norm": 0.9334145784378052, + "learning_rate": 1.9681923638586657e-07, + "epoch": 3.68552273203436, + "step": 2200 + }, + { + "eval_loss": 1.0197768211364746, + "eval_runtime": 78.9262, + "eval_samples_per_second": 6.31, + "eval_steps_per_second": 3.155, + "epoch": 3.68552273203436, + "step": 2200 + }, + { + "loss": 0.9483, + "grad_norm": 0.9081120491027832, + "learning_rate": 1.921897443348958e-07, + "epoch": 3.702283679027865, + "step": 2210 + }, + { + "loss": 0.7712, + "grad_norm": 0.8620381355285645, + "learning_rate": 1.876023604050347e-07, + "epoch": 3.71904462602137, + "step": 2220 + }, + { + "loss": 0.9694, + "grad_norm": 0.9712274074554443, + "learning_rate": 1.8305771214560773e-07, + "epoch": 3.7358055730148756, + "step": 2230 + }, + { + "loss": 0.9673, + "grad_norm": 1.1614145040512085, + "learning_rate": 1.7855642125974458e-07, + "epoch": 3.75256652000838, + "step": 2240 + }, + { + "loss": 0.9736, + "grad_norm": 0.8890505433082581, + "learning_rate": 1.740991035193317e-07, + "epoch": 3.7693274670018857, + "step": 2250 + }, + { + "eval_loss": 1.019027829170227, + "eval_runtime": 78.5313, + "eval_samples_per_second": 6.341, + "eval_steps_per_second": 3.171, + "epoch": 3.7693274670018857, + "step": 2250 + }, + { + "loss": 0.9323, + "grad_norm": 0.9356293678283691, + "learning_rate": 1.6968636868077514e-07, + "epoch": 3.7860884139953908, + "step": 2260 + }, + { + "loss": 0.8985, + "grad_norm": 1.0536713600158691, + "learning_rate": 1.6531882040158645e-07, + "epoch": 3.802849360988896, + "step": 2270 + }, + { + "loss": 0.9493, + "grad_norm": 0.9968764185905457, + "learning_rate": 1.609970561578034e-07, + "epoch": 3.819610307982401, + "step": 2280 + }, + { + "loss": 0.9251, + "grad_norm": 1.304086685180664, + "learning_rate": 1.5672166716225533e-07, + "epoch": 3.836371254975906, + "step": 2290 + }, + { + "loss": 1.0068, + "grad_norm": 0.8861364722251892, + "learning_rate": 1.524932382836861e-07, + "epoch": 3.8531322019694114, + "step": 2300 + }, + { + "eval_loss": 1.0179270505905151, + "eval_runtime": 78.5978, + "eval_samples_per_second": 6.336, + "eval_steps_per_second": 3.168, + "epoch": 3.8531322019694114, + "step": 2300 + }, + { + "loss": 0.9483, + "grad_norm": 1.5795460939407349, + "learning_rate": 1.4831234796674515e-07, + "epoch": 3.8698931489629165, + "step": 2310 + }, + { + "loss": 0.9287, + "grad_norm": 0.9922671914100647, + "learning_rate": 1.4417956815285576e-07, + "epoch": 3.8866540959564215, + "step": 2320 + }, + { + "loss": 0.9983, + "grad_norm": 0.9773964881896973, + "learning_rate": 1.4009546420197522e-07, + "epoch": 3.9034150429499266, + "step": 2330 + }, + { + "loss": 0.8862, + "grad_norm": 0.863190233707428, + "learning_rate": 1.3606059481525296e-07, + "epoch": 3.9201759899434316, + "step": 2340 + }, + { + "loss": 0.901, + "grad_norm": 1.0745151042938232, + "learning_rate": 1.320755119586024e-07, + "epoch": 3.936936936936937, + "step": 2350 + }, + { + "eval_loss": 1.0171653032302856, + "eval_runtime": 78.7387, + "eval_samples_per_second": 6.325, + "eval_steps_per_second": 3.162, + "epoch": 3.936936936936937, + "step": 2350 + }, + { + "loss": 0.9247, + "grad_norm": 1.1230072975158691, + "learning_rate": 1.2814076078719111e-07, + "epoch": 3.953697883930442, + "step": 2360 + }, + { + "loss": 0.9028, + "grad_norm": 1.0916768312454224, + "learning_rate": 1.24256879570865e-07, + "epoch": 3.9704588309239472, + "step": 2370 + }, + { + "loss": 0.9669, + "grad_norm": 1.2703828811645508, + "learning_rate": 1.2042439962051316e-07, + "epoch": 3.9872197779174523, + "step": 2380 + }, + { + "loss": 0.9533, + "grad_norm": 0.9693753123283386, + "learning_rate": 1.1664384521538529e-07, + "epoch": 4.003352189398701, + "step": 2390 + }, + { + "loss": 0.9692, + "grad_norm": 0.9873965382575989, + "learning_rate": 1.129157335313709e-07, + "epoch": 4.020113136392206, + "step": 2400 + }, + { + "eval_loss": 1.0167144536972046, + "eval_runtime": 78.925, + "eval_samples_per_second": 6.31, + "eval_steps_per_second": 3.155, + "epoch": 4.020113136392206, + "step": 2400 + }, + { + "loss": 0.8831, + "grad_norm": 1.532692313194275, + "learning_rate": 1.0924057457025004e-07, + "epoch": 4.036874083385711, + "step": 2410 + }, + { + "loss": 0.9041, + "grad_norm": 1.508852481842041, + "learning_rate": 1.0561887108992557e-07, + "epoch": 4.053635030379216, + "step": 2420 + }, + { + "loss": 0.8563, + "grad_norm": 0.8709840178489685, + "learning_rate": 1.0205111853564635e-07, + "epoch": 4.070395977372722, + "step": 2430 + }, + { + "loss": 0.9356, + "grad_norm": 0.9158110022544861, + "learning_rate": 9.853780497223141e-08, + "epoch": 4.0871569243662265, + "step": 2440 + }, + { + "loss": 0.9407, + "grad_norm": 1.1595951318740845, + "learning_rate": 9.507941101730243e-08, + "epoch": 4.103917871359732, + "step": 2450 + }, + { + "eval_loss": 1.0161402225494385, + "eval_runtime": 85.8936, + "eval_samples_per_second": 5.798, + "eval_steps_per_second": 2.899, + "epoch": 4.103917871359732, + "step": 2450 + }, + { + "loss": 0.8546, + "grad_norm": 1.0961774587631226, + "learning_rate": 9.16764097755361e-08, + "epoch": 4.120678818353237, + "step": 2460 + }, + { + "loss": 0.8384, + "grad_norm": 1.2403303384780884, + "learning_rate": 8.832926677394387e-08, + "epoch": 4.137439765346742, + "step": 2470 + }, + { + "loss": 0.912, + "grad_norm": 0.8818101286888123, + "learning_rate": 8.503843989818843e-08, + "epoch": 4.154200712340248, + "step": 2480 + }, + { + "loss": 0.8917, + "grad_norm": 0.7812055349349976, + "learning_rate": 8.180437932994521e-08, + "epoch": 4.170961659333752, + "step": 2490 + }, + { + "loss": 0.9279, + "grad_norm": 1.0156097412109375, + "learning_rate": 7.862752748531831e-08, + "epoch": 4.187722606327258, + "step": 2500 + }, + { + "eval_loss": 1.015537977218628, + "eval_runtime": 83.5946, + "eval_samples_per_second": 5.957, + "eval_steps_per_second": 2.979, + "epoch": 4.187722606327258, + "step": 2500 + }, + { + "loss": 0.9586, + "grad_norm": 1.0586673021316528, + "learning_rate": 7.550831895431797e-08, + "epoch": 4.204483553320762, + "step": 2510 + }, + { + "loss": 0.9436, + "grad_norm": 1.20182466506958, + "learning_rate": 7.244718044140985e-08, + "epoch": 4.221244500314268, + "step": 2520 + }, + { + "loss": 0.9139, + "grad_norm": 0.9434934854507446, + "learning_rate": 6.944453070714162e-08, + "epoch": 4.238005447307773, + "step": 2530 + }, + { + "loss": 0.9247, + "grad_norm": 1.0741653442382812, + "learning_rate": 6.650078051085689e-08, + "epoch": 4.254766394301278, + "step": 2540 + }, + { + "loss": 1.0214, + "grad_norm": 1.047150731086731, + "learning_rate": 6.361633255450449e-08, + "epoch": 4.271527341294783, + "step": 2550 + }, + { + "eval_loss": 1.0152950286865234, + "eval_runtime": 87.3007, + "eval_samples_per_second": 5.704, + "eval_steps_per_second": 2.852, + "epoch": 4.271527341294783, + "step": 2550 + }, + { + "loss": 0.9456, + "grad_norm": 0.9247802495956421, + "learning_rate": 6.079158142754853e-08, + "epoch": 4.288288288288288, + "step": 2560 + }, + { + "loss": 0.9563, + "grad_norm": 0.912956714630127, + "learning_rate": 5.802691355298978e-08, + "epoch": 4.3050492352817935, + "step": 2570 + }, + { + "loss": 1.0284, + "grad_norm": 1.1536314487457275, + "learning_rate": 5.5322707134502374e-08, + "epoch": 4.321810182275298, + "step": 2580 + }, + { + "loss": 0.9638, + "grad_norm": 1.1163140535354614, + "learning_rate": 5.267933210469666e-08, + "epoch": 4.338571129268804, + "step": 2590 + }, + { + "loss": 0.9299, + "grad_norm": 0.9342344403266907, + "learning_rate": 5.009715007451265e-08, + "epoch": 4.355332076262309, + "step": 2600 + }, + { + "eval_loss": 1.0149102210998535, + "eval_runtime": 80.4709, + "eval_samples_per_second": 6.189, + "eval_steps_per_second": 3.094, + "epoch": 4.355332076262309, + "step": 2600 + }, + { + "loss": 0.949, + "grad_norm": 0.8422918319702148, + "learning_rate": 4.7576514283752034e-08, + "epoch": 4.372093023255814, + "step": 2610 + }, + { + "loss": 0.9393, + "grad_norm": 0.9308228492736816, + "learning_rate": 4.51177695527552e-08, + "epoch": 4.388853970249319, + "step": 2620 + }, + { + "loss": 0.9508, + "grad_norm": 1.0049588680267334, + "learning_rate": 4.272125223523038e-08, + "epoch": 4.405614917242824, + "step": 2630 + }, + { + "loss": 0.9882, + "grad_norm": 1.437691569328308, + "learning_rate": 4.038729017224052e-08, + "epoch": 4.422375864236329, + "step": 2640 + }, + { + "loss": 0.9925, + "grad_norm": 1.2916972637176514, + "learning_rate": 3.811620264735549e-08, + "epoch": 4.439136811229835, + "step": 2650 + }, + { + "eval_loss": 1.0145703554153442, + "eval_runtime": 80.4565, + "eval_samples_per_second": 6.19, + "eval_steps_per_second": 3.095, + "epoch": 4.439136811229835, + "step": 2650 + }, + { + "loss": 0.9384, + "grad_norm": 1.57413649559021, + "learning_rate": 3.590830034297382e-08, + "epoch": 4.4558977582233394, + "step": 2660 + }, + { + "loss": 0.9637, + "grad_norm": 0.9036434888839722, + "learning_rate": 3.376388529782215e-08, + "epoch": 4.472658705216845, + "step": 2670 + }, + { + "loss": 1.0354, + "grad_norm": 1.0106011629104614, + "learning_rate": 3.1683250865636114e-08, + "epoch": 4.4894196522103496, + "step": 2680 + }, + { + "loss": 1.0022, + "grad_norm": 1.2949162721633911, + "learning_rate": 2.9666681675030448e-08, + "epoch": 4.506180599203855, + "step": 2690 + }, + { + "loss": 0.9365, + "grad_norm": 1.0403434038162231, + "learning_rate": 2.7714453590561848e-08, + "epoch": 4.5229415461973606, + "step": 2700 + }, + { + "eval_loss": 1.0145213603973389, + "eval_runtime": 80.1701, + "eval_samples_per_second": 6.212, + "eval_steps_per_second": 3.106, + "epoch": 4.5229415461973606, + "step": 2700 + }, + { + "loss": 0.9626, + "grad_norm": 0.9351533651351929, + "learning_rate": 2.5826833674990888e-08, + "epoch": 4.539702493190865, + "step": 2710 + }, + { + "loss": 0.9194, + "grad_norm": 1.2137658596038818, + "learning_rate": 2.4004080152748184e-08, + "epoch": 4.556463440184371, + "step": 2720 + }, + { + "loss": 0.9633, + "grad_norm": 0.9525250792503357, + "learning_rate": 2.2246442374609597e-08, + "epoch": 4.573224387177875, + "step": 2730 + }, + { + "loss": 0.9341, + "grad_norm": 0.8369765281677246, + "learning_rate": 2.0554160783585294e-08, + "epoch": 4.589985334171381, + "step": 2740 + }, + { + "loss": 0.9747, + "grad_norm": 1.0690010786056519, + "learning_rate": 1.8927466882027344e-08, + "epoch": 4.606746281164886, + "step": 2750 + }, + { + "eval_loss": 1.0143725872039795, + "eval_runtime": 78.6989, + "eval_samples_per_second": 6.328, + "eval_steps_per_second": 3.164, + "epoch": 4.606746281164886, + "step": 2750 + }, + { + "loss": 1.0532, + "grad_norm": 1.0263952016830444, + "learning_rate": 1.736658319996054e-08, + "epoch": 4.623507228158391, + "step": 2760 + }, + { + "loss": 0.8263, + "grad_norm": 0.918011486530304, + "learning_rate": 1.5871723264640313e-08, + "epoch": 4.640268175151896, + "step": 2770 + }, + { + "loss": 0.8762, + "grad_norm": 0.8909319043159485, + "learning_rate": 1.444309157134288e-08, + "epoch": 4.657029122145401, + "step": 2780 + }, + { + "loss": 0.9734, + "grad_norm": 0.8371102213859558, + "learning_rate": 1.3080883555389944e-08, + "epoch": 4.6737900691389065, + "step": 2790 + }, + { + "loss": 0.8306, + "grad_norm": 1.1059422492980957, + "learning_rate": 1.1785285565413639e-08, + "epoch": 4.690551016132411, + "step": 2800 + }, + { + "eval_loss": 1.0143219232559204, + "eval_runtime": 83.9047, + "eval_samples_per_second": 5.935, + "eval_steps_per_second": 2.968, + "epoch": 4.690551016132411, + "step": 2800 + }, + { + "loss": 0.8965, + "grad_norm": 1.0232831239700317, + "learning_rate": 1.055647483786437e-08, + "epoch": 4.707311963125917, + "step": 2810 + }, + { + "loss": 0.8825, + "grad_norm": 1.0429071187973022, + "learning_rate": 9.394619472764486e-09, + "epoch": 4.724072910119422, + "step": 2820 + }, + { + "loss": 0.9159, + "grad_norm": 0.9357975721359253, + "learning_rate": 8.299878410713224e-09, + "epoch": 4.740833857112927, + "step": 2830 + }, + { + "loss": 1.0004, + "grad_norm": 1.033324956893921, + "learning_rate": 7.272401411143159e-09, + "epoch": 4.757594804106432, + "step": 2840 + }, + { + "loss": 0.9712, + "grad_norm": 1.018965721130371, + "learning_rate": 6.312329031833319e-09, + "epoch": 4.774355751099937, + "step": 2850 + }, + { + "eval_loss": 1.0143295526504517, + "eval_runtime": 86.6954, + "eval_samples_per_second": 5.744, + "eval_steps_per_second": 2.872, + "epoch": 4.774355751099937, + "step": 2850 + }, + { + "loss": 0.8875, + "grad_norm": 1.0147583484649658, + "learning_rate": 5.419792609681284e-09, + "epoch": 4.791116698093442, + "step": 2860 + }, + { + "loss": 0.9097, + "grad_norm": 1.2517958879470825, + "learning_rate": 4.594914242736503e-09, + "epoch": 4.807877645086947, + "step": 2870 + }, + { + "loss": 1.0227, + "grad_norm": 1.0632728338241577, + "learning_rate": 3.837806773496821e-09, + "epoch": 4.824638592080452, + "step": 2880 + }, + { + "loss": 0.9568, + "grad_norm": 0.9697940349578857, + "learning_rate": 3.1485737734724406e-09, + "epoch": 4.841399539073958, + "step": 2890 + }, + { + "loss": 0.9155, + "grad_norm": 0.8009698987007141, + "learning_rate": 2.5273095290169742e-09, + "epoch": 4.8581604860674625, + "step": 2900 + }, + { + "eval_loss": 1.014455795288086, + "eval_runtime": 87.3886, + "eval_samples_per_second": 5.699, + "eval_steps_per_second": 2.849, + "epoch": 4.8581604860674625, + "step": 2900 + }, + { + "loss": 0.9871, + "grad_norm": 1.121019959449768, + "learning_rate": 1.974099028429599e-09, + "epoch": 4.874921433060968, + "step": 2910 + }, + { + "loss": 0.9819, + "grad_norm": 1.091722846031189, + "learning_rate": 1.4890179503281862e-09, + "epoch": 4.891682380054473, + "step": 2920 + }, + { + "loss": 0.9754, + "grad_norm": 0.8344740271568298, + "learning_rate": 1.072132653297031e-09, + "epoch": 4.908443327047978, + "step": 2930 + }, + { + "loss": 0.9712, + "grad_norm": 0.9808083176612854, + "learning_rate": 7.235001668088325e-10, + "epoch": 4.925204274041484, + "step": 2940 + }, + { + "loss": 0.878, + "grad_norm": 0.7794699668884277, + "learning_rate": 4.4316818342321483e-10, + "epoch": 4.941965221034988, + "step": 2950 + }, + { + "eval_loss": 1.0143743753433228, + "eval_runtime": 85.6525, + "eval_samples_per_second": 5.814, + "eval_steps_per_second": 2.907, + "epoch": 4.941965221034988, + "step": 2950 + }, + { + "loss": 1.0162, + "grad_norm": 0.7946218252182007, + "learning_rate": 2.31175052262389e-10, + "epoch": 4.958726168028494, + "step": 2960 + }, + { + "loss": 1.0109, + "grad_norm": 0.9577650427818298, + "learning_rate": 8.754977376496108e-11, + "epoch": 4.975487115021998, + "step": 2970 + }, + { + "loss": 0.9705, + "grad_norm": 1.5243313312530518, + "learning_rate": 1.2311995718883306e-11, + "epoch": 4.992248062015504, + "step": 2980 + }, + { + "train_runtime": 24337.0096, + "train_samples_per_second": 1.961, + "train_steps_per_second": 0.123, + "total_flos": 4.190815953316639e+17, + "train_loss": 1.108071906442818, + "epoch": 5.0, + "step": 2985, + "total_runtime_sec": 24338.520318984985 } ] }