{
  "Experiment_1": {
    "config": {
      "BASE_MODEL_ID": "Qwen/Qwen3-4B-Instruct-2507",
      "DATASET_ID": [
        "u-10bei/structured_data_with_cot_dataset_512",
        "u-10bei/structured_data_with_cot_dataset_512_v2",
        "u-10bei/structured_data_with_cot_dataset_512_v4",
        "u-10bei/structured_data_with_cot_dataset_512_v5",
        "u-10bei/structured_data_with_cot_dataset_v2"
      ],
      "BASE_OUT_DIR": "./lora_experiments",
      "SEED": 3407,
      "VAL_RATIO": 0.05,
      "MAX_SEQ_LEN": 512,
      "LORA_R": 80,
      "LORA_ALPHA": 160,
      "LORA_DROPOUT": 0.0,
      "LORA_TARGET_MODULES": [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj"
      ],
      "EPOCHS": 1,
      "PER_DEVICE_TRAIN_BS": 2,
      "PER_DEVICE_EVAL_BS": 2,
      "GRAD_ACCUM": 8,
      "LR": 1e-06,
      "WARMUP_RATIO": 0.1,
      "WEIGHT_DECAY": 0.05,
      "MAX_STEPS": -1,
      "LOGGING_STEPS": 10,
      "EVAL_STEPS": 50,
      "SAVE_STEPS": 100,
      "SAVE_TOTAL_LIMIT": 2,
      "MASK_COT": true,
      "OUTPUT_MARKERS": [
        "Output:",
        "OUTPUT:",
        "Final:",
        "Answer:",
        "Result:",
        "Response:"
      ],
      "OUTPUT_LEARN_MODE": "after_marker",
      "UPSAMPLE_ENABLE": false,
      "UPSAMPLE_RULES_JSON": "{\"xml_to_yaml\": 2.0}",
      "Experiment_Name": "R_64_ALPHA_128"
    },
    "history": [
      {
        "loss": 1.9238,
        "grad_norm": 5.567883491516113,
        "learning_rate": 8.181818181818182e-08,
        "epoch": 0.009169054441260744,
        "step": 10
      },
      {
        "loss": 1.7942,
        "grad_norm": 3.81577467918396,
        "learning_rate": 1.7272727272727272e-07,
        "epoch": 0.01833810888252149,
        "step": 20
      },
      {
        "loss": 1.9151,
        "grad_norm": 4.943554878234863,
        "learning_rate": 2.636363636363636e-07,
        "epoch": 0.027507163323782235,
        "step": 30
      },
      {
        "loss": 1.7679,
        "grad_norm": 4.759264945983887,
        "learning_rate": 3.545454545454545e-07,
        "epoch": 0.03667621776504298,
        "step": 40
      },
      {
        "loss": 2.0977,
        "grad_norm": 6.200092315673828,
        "learning_rate": 4.4545454545454544e-07,
        "epoch": 0.045845272206303724,
        "step": 50
      },
      {
        "eval_loss": 1.8969855308532715,
        "eval_runtime": 165.3674,
        "eval_samples_per_second": 5.612,
        "eval_steps_per_second": 2.806,
        "epoch": 0.045845272206303724,
        "step": 50
      },
      {
        "loss": 1.8792,
        "grad_norm": 4.762349605560303,
        "learning_rate": 5.363636363636363e-07,
        "epoch": 0.05501432664756447,
        "step": 60
      },
      {
        "loss": 1.8231,
        "grad_norm": 4.353812217712402,
        "learning_rate": 6.272727272727273e-07,
        "epoch": 0.06418338108882521,
        "step": 70
      },
      {
        "loss": 1.8868,
        "grad_norm": 4.538625240325928,
        "learning_rate": 7.181818181818181e-07,
        "epoch": 0.07335243553008595,
        "step": 80
      },
      {
        "loss": 1.6794,
        "grad_norm": 3.11306095123291,
        "learning_rate": 8.09090909090909e-07,
        "epoch": 0.0825214899713467,
        "step": 90
      },
      {
        "loss": 1.7012,
        "grad_norm": 3.303393602371216,
        "learning_rate": 9e-07,
        "epoch": 0.09169054441260745,
        "step": 100
      },
      {
        "eval_loss": 1.6190643310546875,
        "eval_runtime": 163.7929,
        "eval_samples_per_second": 5.666,
        "eval_steps_per_second": 2.833,
        "epoch": 0.09169054441260745,
        "step": 100
      },
      {
        "loss": 1.6738,
        "grad_norm": 2.3432679176330566,
        "learning_rate": 9.909090909090909e-07,
        "epoch": 0.1008595988538682,
        "step": 110
      },
      {
        "loss": 1.5342,
        "grad_norm": 1.411333441734314,
        "learning_rate": 9.997923381619255e-07,
        "epoch": 0.11002865329512894,
        "step": 120
      },
      {
        "loss": 1.4696,
        "grad_norm": 1.3321274518966675,
        "learning_rate": 9.990747162241872e-07,
        "epoch": 0.11919770773638969,
        "step": 130
      },
      {
        "loss": 1.4552,
        "grad_norm": 1.0566920042037964,
        "learning_rate": 9.978453061876695e-07,
        "epoch": 0.12836676217765042,
        "step": 140
      },
      {
        "loss": 1.5192,
        "grad_norm": 0.9345868229866028,
        "learning_rate": 9.96105368780285e-07,
        "epoch": 0.13753581661891118,
        "step": 150
      },
      {
        "eval_loss": 1.3909412622451782,
        "eval_runtime": 170.2558,
        "eval_samples_per_second": 5.451,
        "eval_steps_per_second": 2.725,
        "epoch": 0.13753581661891118,
        "step": 150
      },
      {
        "loss": 1.4217,
        "grad_norm": 0.8553086519241333,
        "learning_rate": 9.938566882624436e-07,
        "epoch": 0.1467048710601719,
        "step": 160
      },
      {
        "loss": 1.4101,
        "grad_norm": 0.8422027230262756,
        "learning_rate": 9.911015705973398e-07,
        "epoch": 0.15587392550143267,
        "step": 170
      },
      {
        "loss": 1.4131,
        "grad_norm": 0.6763940453529358,
        "learning_rate": 9.878428410862482e-07,
        "epoch": 0.1650429799426934,
        "step": 180
      },
      {
        "loss": 1.4201,
        "grad_norm": 0.5860380530357361,
        "learning_rate": 9.8408384147125e-07,
        "epoch": 0.17421203438395416,
        "step": 190
      },
      {
        "loss": 1.2782,
        "grad_norm": 0.7507234215736389,
        "learning_rate": 9.79828426508364e-07,
        "epoch": 0.1833810888252149,
        "step": 200
      },
      {
        "eval_loss": 1.3075143098831177,
        "eval_runtime": 168.9052,
        "eval_samples_per_second": 5.494,
        "eval_steps_per_second": 2.747,
        "epoch": 0.1833810888252149,
        "step": 200
      },
      {
        "loss": 1.3912,
        "grad_norm": 0.6225572228431702,
        "learning_rate": 9.750809600145952e-07,
        "epoch": 0.19255014326647565,
        "step": 210
      },
      {
        "loss": 1.2743,
        "grad_norm": 0.5334329009056091,
        "learning_rate": 9.698463103929541e-07,
        "epoch": 0.2017191977077364,
        "step": 220
      },
      {
        "loss": 1.2105,
        "grad_norm": 0.730050265789032,
        "learning_rate": 9.641298456400363e-07,
        "epoch": 0.21088825214899715,
        "step": 230
      },
      {
        "loss": 1.3692,
        "grad_norm": 0.646460235118866,
        "learning_rate": 9.579374278412817e-07,
        "epoch": 0.22005730659025788,
        "step": 240
      },
      {
        "loss": 1.2971,
        "grad_norm": 0.5395148992538452,
        "learning_rate": 9.512754071595603e-07,
        "epoch": 0.22922636103151864,
        "step": 250
      },
      {
        "eval_loss": 1.2624306678771973,
        "eval_runtime": 157.9571,
        "eval_samples_per_second": 5.875,
        "eval_steps_per_second": 2.938,
        "epoch": 0.22922636103151864,
        "step": 250
      },
      {
        "loss": 1.3346,
        "grad_norm": 0.5858215093612671,
        "learning_rate": 9.441506153232442e-07,
        "epoch": 0.23839541547277937,
        "step": 260
      },
      {
        "loss": 1.2195,
        "grad_norm": 0.5456379055976868,
        "learning_rate": 9.365703586204494e-07,
        "epoch": 0.2475644699140401,
        "step": 270
      },
      {
        "loss": 1.3763,
        "grad_norm": 0.8651963472366333,
        "learning_rate": 9.285424104066275e-07,
        "epoch": 0.25673352435530084,
        "step": 280
      },
      {
        "loss": 1.2256,
        "grad_norm": 0.5938352942466736,
        "learning_rate": 9.20075003133193e-07,
        "epoch": 0.2659025787965616,
        "step": 290
      },
      {
        "loss": 1.3137,
        "grad_norm": 0.6355459690093994,
        "learning_rate": 9.111768199053586e-07,
        "epoch": 0.27507163323782235,
        "step": 300
      },
      {
        "eval_loss": 1.2268821001052856,
        "eval_runtime": 164.1638,
        "eval_samples_per_second": 5.653,
        "eval_steps_per_second": 2.826,
        "epoch": 0.27507163323782235,
        "step": 300
      },
      {
        "loss": 1.4409,
        "grad_norm": 0.9009450078010559,
        "learning_rate": 9.018569855778383e-07,
        "epoch": 0.2842406876790831,
        "step": 310
      },
      {
        "loss": 1.2566,
        "grad_norm": 0.5912006497383118,
        "learning_rate": 8.921250573975455e-07,
        "epoch": 0.2934097421203438,
        "step": 320
      },
      {
        "loss": 1.1593,
        "grad_norm": 0.6118663549423218,
        "learning_rate": 8.81991015202887e-07,
        "epoch": 0.3025787965616046,
        "step": 330
      },
      {
        "loss": 1.2555,
        "grad_norm": 0.581721305847168,
        "learning_rate": 8.714652511896993e-07,
        "epoch": 0.31174785100286534,
        "step": 340
      },
      {
        "loss": 1.1209,
        "grad_norm": 0.5230151414871216,
        "learning_rate": 8.605585592543211e-07,
        "epoch": 0.3209169054441261,
        "step": 350
      },
      {
        "eval_loss": 1.1941955089569092,
        "eval_runtime": 162.9119,
        "eval_samples_per_second": 5.696,
        "eval_steps_per_second": 2.848,
        "epoch": 0.3209169054441261,
        "step": 350
      },
      {
        "loss": 1.3564,
        "grad_norm": 0.565862238407135,
        "learning_rate": 8.492821239247363e-07,
        "epoch": 0.3300859598853868,
        "step": 360
      },
      {
        "loss": 1.3997,
        "grad_norm": 0.6057285666465759,
        "learning_rate": 8.376475088911317e-07,
        "epoch": 0.33925501432664756,
        "step": 370
      },
      {
        "loss": 1.0878,
        "grad_norm": 0.7517871260643005,
        "learning_rate": 8.256666451476336e-07,
        "epoch": 0.3484240687679083,
        "step": 380
      },
      {
        "loss": 1.3009,
        "grad_norm": 0.7168652415275574,
        "learning_rate": 8.133518187573862e-07,
        "epoch": 0.35759312320916903,
        "step": 390
      },
      {
        "loss": 1.1606,
        "grad_norm": 0.7250906825065613,
        "learning_rate": 8.007156582535131e-07,
        "epoch": 0.3667621776504298,
        "step": 400
      },
      {
        "eval_loss": 1.166169285774231,
        "eval_runtime": 158.6155,
        "eval_samples_per_second": 5.851,
        "eval_steps_per_second": 2.925,
        "epoch": 0.3667621776504298,
        "step": 400
      },
      {
        "loss": 1.2593,
        "grad_norm": 0.6665163636207581,
        "learning_rate": 7.877711216888867e-07,
        "epoch": 0.37593123209169055,
        "step": 410
      },
      {
        "loss": 1.1365,
        "grad_norm": 0.6199079751968384,
        "learning_rate": 7.745314833479833e-07,
        "epoch": 0.3851002865329513,
        "step": 420
      },
      {
        "loss": 1.19,
        "grad_norm": 0.6242042779922485,
        "learning_rate": 7.6101032013445e-07,
        "epoch": 0.394269340974212,
        "step": 430
      },
      {
        "loss": 1.1405,
        "grad_norm": 0.657778263092041,
        "learning_rate": 7.472214976483451e-07,
        "epoch": 0.4034383954154728,
        "step": 440
      },
      {
        "loss": 1.0791,
        "grad_norm": 0.5634785890579224,
        "learning_rate": 7.331791559673269e-07,
        "epoch": 0.41260744985673353,
        "step": 450
      },
      {
        "eval_loss": 1.1434489488601685,
        "eval_runtime": 153.2068,
        "eval_samples_per_second": 6.057,
        "eval_steps_per_second": 3.029,
        "epoch": 0.41260744985673353,
        "step": 450
      },
      {
        "loss": 1.1352,
        "grad_norm": 0.7913809418678284,
        "learning_rate": 7.188976951463723e-07,
        "epoch": 0.4217765042979943,
        "step": 460
      },
      {
        "loss": 1.3036,
        "grad_norm": 0.6191056966781616,
        "learning_rate": 7.043917604508971e-07,
        "epoch": 0.430945558739255,
        "step": 470
      },
      {
        "loss": 1.121,
        "grad_norm": 0.6955880522727966,
        "learning_rate": 6.896762273384178e-07,
        "epoch": 0.44011461318051576,
        "step": 480
      },
      {
        "loss": 1.0997,
        "grad_norm": 0.8450888395309448,
        "learning_rate": 6.747661862041585e-07,
        "epoch": 0.4492836676217765,
        "step": 490
      },
      {
        "loss": 1.3604,
        "grad_norm": 0.8735764622688293,
        "learning_rate": 6.596769269062443e-07,
        "epoch": 0.4584527220630373,
        "step": 500
      },
      {
        "eval_loss": 1.1247467994689941,
        "eval_runtime": 152.4716,
        "eval_samples_per_second": 6.086,
        "eval_steps_per_second": 3.043,
        "epoch": 0.4584527220630373,
        "step": 500
      },
      {
        "loss": 1.171,
        "grad_norm": 0.7228217124938965,
        "learning_rate": 6.444239230863504e-07,
        "epoch": 0.467621776504298,
        "step": 510
      },
      {
        "loss": 1.03,
        "grad_norm": 0.6075210571289062,
        "learning_rate": 6.290228163018867e-07,
        "epoch": 0.47679083094555874,
        "step": 520
      },
      {
        "loss": 1.0345,
        "grad_norm": 0.7692680954933167,
        "learning_rate": 6.134893999859886e-07,
        "epoch": 0.4859598853868195,
        "step": 530
      },
      {
        "loss": 1.2594,
        "grad_norm": 0.7600648403167725,
        "learning_rate": 5.978396032517639e-07,
        "epoch": 0.4951289398280802,
        "step": 540
      },
      {
        "loss": 1.0254,
        "grad_norm": 0.6178115010261536,
        "learning_rate": 5.820894745574025e-07,
        "epoch": 0.504297994269341,
        "step": 550
      },
      {
        "eval_loss": 1.1089264154434204,
        "eval_runtime": 151.8982,
        "eval_samples_per_second": 6.109,
        "eval_steps_per_second": 3.055,
        "epoch": 0.504297994269341,
        "step": 550
      },
      {
        "loss": 1.1049,
        "grad_norm": 0.5585054159164429,
        "learning_rate": 5.662551652489008e-07,
        "epoch": 0.5134670487106017,
        "step": 560
      },
      {
        "loss": 1.0898,
        "grad_norm": 0.6844518780708313,
        "learning_rate": 5.503529129972792e-07,
        "epoch": 0.5226361031518625,
        "step": 570
      },
      {
        "loss": 1.1037,
        "grad_norm": 0.8425552845001221,
        "learning_rate": 5.34399025147273e-07,
        "epoch": 0.5318051575931232,
        "step": 580
      },
      {
        "loss": 1.1019,
        "grad_norm": 0.648064136505127,
        "learning_rate": 5.18409861994576e-07,
        "epoch": 0.540974212034384,
        "step": 590
      },
      {
        "loss": 1.1863,
        "grad_norm": 0.5788621306419373,
        "learning_rate": 5.024018200087854e-07,
        "epoch": 0.5501432664756447,
        "step": 600
      },
      {
        "eval_loss": 1.095629096031189,
        "eval_runtime": 150.8171,
        "eval_samples_per_second": 6.153,
        "eval_steps_per_second": 3.077,
        "epoch": 0.5501432664756447,
        "step": 600
      },
      {
        "loss": 1.1025,
        "grad_norm": 0.6422027349472046,
        "learning_rate": 4.86391315019248e-07,
        "epoch": 0.5593123209169054,
        "step": 610
      },
      {
        "loss": 1.0666,
        "grad_norm": 0.6005454063415527,
        "learning_rate": 4.703947653810575e-07,
        "epoch": 0.5684813753581662,
        "step": 620
      },
      {
        "loss": 1.215,
        "grad_norm": 0.6145904064178467,
        "learning_rate": 4.544285751384584e-07,
        "epoch": 0.5776504297994269,
        "step": 630
      },
      {
        "loss": 1.1613,
        "grad_norm": 0.8756449818611145,
        "learning_rate": 4.385091172029275e-07,
        "epoch": 0.5868194842406876,
        "step": 640
      },
      {
        "loss": 1.1092,
        "grad_norm": 0.7930067181587219,
        "learning_rate": 4.2265271656318e-07,
        "epoch": 0.5959885386819485,
        "step": 650
      },
      {
        "eval_loss": 1.0845845937728882,
        "eval_runtime": 151.0566,
        "eval_samples_per_second": 6.143,
        "eval_steps_per_second": 3.072,
        "epoch": 0.5959885386819485,
        "step": 650
      },
      {
        "loss": 1.214,
        "grad_norm": 1.1016592979431152,
        "learning_rate": 4.068756335443198e-07,
        "epoch": 0.6051575931232092,
        "step": 660
      },
      {
        "loss": 1.3335,
        "grad_norm": 0.7920063138008118,
        "learning_rate": 3.9119404713330013e-07,
        "epoch": 0.6143266475644699,
        "step": 670
      },
      {
        "loss": 1.123,
        "grad_norm": 0.792630136013031,
        "learning_rate": 3.7562403838779467e-07,
        "epoch": 0.6234957020057307,
        "step": 680
      },
      {
        "loss": 1.2098,
        "grad_norm": 0.8105105757713318,
        "learning_rate": 3.601815739454928e-07,
        "epoch": 0.6326647564469914,
        "step": 690
      },
      {
        "loss": 1.0302,
        "grad_norm": 0.6204477548599243,
        "learning_rate": 3.448824896507292e-07,
        "epoch": 0.6418338108882522,
        "step": 700
      },
      {
        "eval_loss": 1.0753319263458252,
        "eval_runtime": 159.1632,
        "eval_samples_per_second": 5.83,
        "eval_steps_per_second": 2.915,
        "epoch": 0.6418338108882522,
        "step": 700
      },
      {
        "loss": 1.109,
        "grad_norm": 0.6277522444725037,
        "learning_rate": 3.297424743152381e-07,
        "epoch": 0.6510028653295129,
        "step": 710
      },
      {
        "loss": 1.0808,
        "grad_norm": 0.7879471182823181,
        "learning_rate": 3.1477705362968696e-07,
        "epoch": 0.6601719197707736,
        "step": 720
      },
      {
        "loss": 1.0842,
        "grad_norm": 0.8374884128570557,
        "learning_rate": 3.000015742424857e-07,
        "epoch": 0.6693409742120344,
        "step": 730
      },
      {
        "loss": 1.2445,
        "grad_norm": 0.7892112731933594,
        "learning_rate": 2.85431188022199e-07,
        "epoch": 0.6785100286532951,
        "step": 740
      },
      {
        "loss": 1.1506,
        "grad_norm": 0.5540062785148621,
        "learning_rate": 2.710808365197e-07,
        "epoch": 0.6876790830945558,
        "step": 750
      },
      {
        "eval_loss": 1.0686043500900269,
        "eval_runtime": 154.055,
        "eval_samples_per_second": 6.024,
        "eval_steps_per_second": 3.012,
        "epoch": 0.6876790830945558,
        "step": 750
      },
      {
        "loss": 1.1023,
        "grad_norm": 0.5218796133995056,
        "learning_rate": 2.569652356460007e-07,
        "epoch": 0.6968481375358166,
        "step": 760
      },
      {
        "loss": 1.1801,
        "grad_norm": 0.7433627247810364,
        "learning_rate": 2.430988605814691e-07,
        "epoch": 0.7060171919770774,
        "step": 770
      },
      {
        "loss": 1.0276,
        "grad_norm": 0.629487931728363,
        "learning_rate": 2.294959309319086e-07,
        "epoch": 0.7151862464183381,
        "step": 780
      },
      {
        "loss": 1.1164,
        "grad_norm": 0.6667075157165527,
        "learning_rate": 2.1617039614672378e-07,
        "epoch": 0.7243553008595989,
        "step": 790
      },
      {
        "loss": 1.0869,
        "grad_norm": 0.6930222511291504,
        "learning_rate": 2.0313592121412464e-07,
        "epoch": 0.7335243553008596,
        "step": 800
      },
      {
        "eval_loss": 1.0634101629257202,
        "eval_runtime": 154.1494,
        "eval_samples_per_second": 6.02,
        "eval_steps_per_second": 3.01,
        "epoch": 0.7335243553008596,
        "step": 800
      },
      {
        "loss": 1.0937,
        "grad_norm": 0.7793363332748413,
        "learning_rate": 1.904058726480367e-07,
        "epoch": 0.7426934097421204,
        "step": 810
      },
      {
        "loss": 0.9728,
        "grad_norm": 0.5570642948150635,
        "learning_rate": 1.7799330478109026e-07,
        "epoch": 0.7518624641833811,
        "step": 820
      },
      {
        "loss": 1.2568,
        "grad_norm": 0.9017201662063599,
        "learning_rate": 1.65910946377743e-07,
        "epoch": 0.7610315186246418,
        "step": 830
      },
      {
        "loss": 1.0049,
        "grad_norm": 0.7178328633308411,
        "learning_rate": 1.5417118758126408e-07,
        "epoch": 0.7702005730659026,
        "step": 840
      },
      {
        "loss": 1.0576,
        "grad_norm": 1.031610369682312,
        "learning_rate": 1.4278606720796543e-07,
        "epoch": 0.7793696275071633,
        "step": 850
      },
      {
        "eval_loss": 1.0599370002746582,
        "eval_runtime": 154.7364,
        "eval_samples_per_second": 5.997,
        "eval_steps_per_second": 2.999,
        "epoch": 0.7793696275071633,
        "step": 850
      },
      {
        "loss": 1.1797,
        "grad_norm": 1.0518614053726196,
        "learning_rate": 1.3176726040171e-07,
        "epoch": 0.788538681948424,
        "step": 860
      },
      {
        "loss": 1.2085,
        "grad_norm": 0.7290861010551453,
        "learning_rate": 1.21126066661356e-07,
        "epoch": 0.7977077363896848,
        "step": 870
      },
      {
        "loss": 1.1165,
        "grad_norm": 0.6315222382545471,
        "learning_rate": 1.108733982534159e-07,
        "epoch": 0.8068767908309455,
        "step": 880
      },
      {
        "loss": 1.158,
        "grad_norm": 0.685243546962738,
        "learning_rate": 1.0101976902181225e-07,
        "epoch": 0.8160458452722062,
        "step": 890
      },
      {
        "loss": 1.0584,
        "grad_norm": 0.7780338525772095,
        "learning_rate": 9.157528360620415e-08,
        "epoch": 0.8252148997134671,
        "step": 900
      },
      {
        "eval_loss": 1.057593584060669,
        "eval_runtime": 153.5904,
        "eval_samples_per_second": 6.042,
        "eval_steps_per_second": 3.021,
        "epoch": 0.8252148997134671,
        "step": 900
      },
      {
        "loss": 1.1489,
        "grad_norm": 0.6839588284492493,
        "learning_rate": 8.254962707994373e-08,
        "epoch": 0.8343839541547278,
        "step": 910
      },
      {
        "loss": 1.1096,
        "grad_norm": 0.9299020171165466,
        "learning_rate": 7.395205501828577e-08,
        "epoch": 0.8435530085959886,
        "step": 920
      },
      {
        "loss": 1.1224,
        "grad_norm": 0.791289746761322,
        "learning_rate": 6.579138400703715e-08,
        "epoch": 0.8527220630372493,
        "step": 930
      },
      {
        "loss": 1.038,
        "grad_norm": 0.6159808039665222,
        "learning_rate": 5.807598260137758e-08,
        "epoch": 0.86189111747851,
        "step": 940
      },
      {
        "loss": 1.0708,
        "grad_norm": 0.7773894667625427,
        "learning_rate": 5.08137627441253e-08,
        "epoch": 0.8710601719197708,
        "step": 950
      },
      {
        "eval_loss": 1.0561405420303345,
        "eval_runtime": 154.3615,
        "eval_samples_per_second": 6.012,
        "eval_steps_per_second": 3.006,
        "epoch": 0.8710601719197708,
        "step": 950
      },
      {
        "loss": 1.0656,
        "grad_norm": 0.707645833492279,
        "learning_rate": 4.401217165224563e-08,
        "epoch": 0.8802292263610315,
        "step": 960
      },
      {
        "loss": 1.0164,
        "grad_norm": 0.6336905360221863,
        "learning_rate": 3.767818417992446e-08,
        "epoch": 0.8893982808022922,
        "step": 970
      },
      {
        "loss": 1.0766,
        "grad_norm": 0.8207520842552185,
        "learning_rate": 3.181829566603772e-08,
        "epoch": 0.898567335243553,
        "step": 980
      },
      {
        "loss": 1.0929,
        "grad_norm": 0.6286782026290894,
        "learning_rate": 2.643851527335006e-08,
        "epoch": 0.9077363896848137,
        "step": 990
      },
      {
        "loss": 1.2541,
        "grad_norm": 0.817637026309967,
        "learning_rate": 2.1544359826275726e-08,
        "epoch": 0.9169054441260746,
        "step": 1000
      },
      {
        "eval_loss": 1.0553829669952393,
        "eval_runtime": 155.0837,
        "eval_samples_per_second": 5.984,
        "eval_steps_per_second": 2.992,
        "epoch": 0.9169054441260746,
        "step": 1000
      },
      {
        "loss": 1.0413,
        "grad_norm": 0.9485034942626953,
        "learning_rate": 1.714084815351913e-08,
        "epoch": 0.9260744985673353,
        "step": 1010
      },
      {
        "loss": 1.0516,
        "grad_norm": 0.6737267971038818,
        "learning_rate": 1.3232495941396637e-08,
        "epoch": 0.935243553008596,
        "step": 1020
      },
      {
        "loss": 1.0517,
        "grad_norm": 0.9414446353912354,
        "learning_rate": 9.82331110311857e-09,
        "epoch": 0.9444126074498568,
        "step": 1030
      },
      {
        "loss": 1.1016,
        "grad_norm": 0.8654493689537048,
        "learning_rate": 6.916789668778122e-09,
        "epoch": 0.9535816618911175,
        "step": 1040
      },
      {
        "loss": 1.1055,
        "grad_norm": 0.8262504935264587,
        "learning_rate": 4.515912200264427e-09,
        "epoch": 0.9627507163323782,
        "step": 1050
      },
      {
        "eval_loss": 1.0550979375839233,
        "eval_runtime": 156.3052,
        "eval_samples_per_second": 5.937,
        "eval_steps_per_second": 2.969,
        "epoch": 0.9627507163323782,
        "step": 1050
      },
      {
        "loss": 1.1321,
        "grad_norm": 0.7707592844963074,
        "learning_rate": 2.6231407347736546e-09,
        "epoch": 0.971919770773639,
        "step": 1060
      },
      {
        "loss": 1.1039,
        "grad_norm": 0.7415518760681152,
        "learning_rate": 1.2404162600541113e-09,
        "epoch": 0.9810888252148997,
        "step": 1070
      },
      {
        "loss": 1.0673,
        "grad_norm": 0.6835209727287292,
        "learning_rate": 3.6915672397436204e-10,
        "epoch": 0.9902578796561604,
        "step": 1080
      },
      {
        "loss": 1.1213,
        "grad_norm": 0.6404680013656616,
        "learning_rate": 1.0255580454254786e-11,
        "epoch": 0.9994269340974212,
        "step": 1090
      },
      {
        "train_runtime": 10741.972,
        "train_samples_per_second": 1.624,
        "train_steps_per_second": 0.102,
        "total_flos": 1.5816368624117146e+17,
        "train_loss": 1.2491859223840436,
        "epoch": 1.0,
        "step": 1091,
        "total_runtime_sec": 10743.41768693924
      }
    ]
  }
}