{
"Experiment_1": {
"config": {
"BASE_MODEL_ID": "Qwen/Qwen3-4B-Instruct-2507",
"DATASET_ID": [
"u-10bei/structured_data_with_cot_dataset_512",
"u-10bei/structured_data_with_cot_dataset_512_v2",
"u-10bei/structured_data_with_cot_dataset_512_v4",
"u-10bei/structured_data_with_cot_dataset_512_v5",
"u-10bei/structured_data_with_cot_dataset_v2"
],
"BASE_OUT_DIR": "./lora_experiments",
"SEED": 3407,
"VAL_RATIO": 0.05,
"MAX_SEQ_LEN": 512,
"LORA_R": 80,
"LORA_ALPHA": 160,
"LORA_DROPOUT": 0.0,
"LORA_TARGET_MODULES": [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj"
],
"EPOCHS": 1,
"PER_DEVICE_TRAIN_BS": 2,
"PER_DEVICE_EVAL_BS": 2,
"GRAD_ACCUM": 8,
"LR": 1e-06,
"WARMUP_RATIO": 0.1,
"WEIGHT_DECAY": 0.05,
"MAX_STEPS": -1,
"LOGGING_STEPS": 10,
"EVAL_STEPS": 50,
"SAVE_STEPS": 100,
"SAVE_TOTAL_LIMIT": 2,
"MASK_COT": true,
"OUTPUT_MARKERS": [
"Output:",
"OUTPUT:",
"Final:",
"Answer:",
"Result:",
"Response:"
],
"OUTPUT_LEARN_MODE": "after_marker",
"UPSAMPLE_ENABLE": false,
"UPSAMPLE_RULES_JSON": "{\"xml_to_yaml\": 2.0}",
"Experiment_Name": "R_64_ALPHA_128"
},
"history": [
{
"loss": 1.9238,
"grad_norm": 5.567883491516113,
"learning_rate": 8.181818181818182e-08,
"epoch": 0.009169054441260744,
"step": 10
},
{
"loss": 1.7942,
"grad_norm": 3.81577467918396,
"learning_rate": 1.7272727272727272e-07,
"epoch": 0.01833810888252149,
"step": 20
},
{
"loss": 1.9151,
"grad_norm": 4.943554878234863,
"learning_rate": 2.636363636363636e-07,
"epoch": 0.027507163323782235,
"step": 30
},
{
"loss": 1.7679,
"grad_norm": 4.759264945983887,
"learning_rate": 3.545454545454545e-07,
"epoch": 0.03667621776504298,
"step": 40
},
{
"loss": 2.0977,
"grad_norm": 6.200092315673828,
"learning_rate": 4.4545454545454544e-07,
"epoch": 0.045845272206303724,
"step": 50
},
{
"eval_loss": 1.8969855308532715,
"eval_runtime": 165.3674,
"eval_samples_per_second": 5.612,
"eval_steps_per_second": 2.806,
"epoch": 0.045845272206303724,
"step": 50
},
{
"loss": 1.8792,
"grad_norm": 4.762349605560303,
"learning_rate": 5.363636363636363e-07,
"epoch": 0.05501432664756447,
"step": 60
},
{
"loss": 1.8231,
"grad_norm": 4.353812217712402,
"learning_rate": 6.272727272727273e-07,
"epoch": 0.06418338108882521,
"step": 70
},
{
"loss": 1.8868,
"grad_norm": 4.538625240325928,
"learning_rate": 7.181818181818181e-07,
"epoch": 0.07335243553008595,
"step": 80
},
{
"loss": 1.6794,
"grad_norm": 3.11306095123291,
"learning_rate": 8.09090909090909e-07,
"epoch": 0.0825214899713467,
"step": 90
},
{
"loss": 1.7012,
"grad_norm": 3.303393602371216,
"learning_rate": 9e-07,
"epoch": 0.09169054441260745,
"step": 100
},
{
"eval_loss": 1.6190643310546875,
"eval_runtime": 163.7929,
"eval_samples_per_second": 5.666,
"eval_steps_per_second": 2.833,
"epoch": 0.09169054441260745,
"step": 100
},
{
"loss": 1.6738,
"grad_norm": 2.3432679176330566,
"learning_rate": 9.909090909090909e-07,
"epoch": 0.1008595988538682,
"step": 110
},
{
"loss": 1.5342,
"grad_norm": 1.411333441734314,
"learning_rate": 9.997923381619255e-07,
"epoch": 0.11002865329512894,
"step": 120
},
{
"loss": 1.4696,
"grad_norm": 1.3321274518966675,
"learning_rate": 9.990747162241872e-07,
"epoch": 0.11919770773638969,
"step": 130
},
{
"loss": 1.4552,
"grad_norm": 1.0566920042037964,
"learning_rate": 9.978453061876695e-07,
"epoch": 0.12836676217765042,
"step": 140
},
{
"loss": 1.5192,
"grad_norm": 0.9345868229866028,
"learning_rate": 9.96105368780285e-07,
"epoch": 0.13753581661891118,
"step": 150
},
{
"eval_loss": 1.3909412622451782,
"eval_runtime": 170.2558,
"eval_samples_per_second": 5.451,
"eval_steps_per_second": 2.725,
"epoch": 0.13753581661891118,
"step": 150
},
{
"loss": 1.4217,
"grad_norm": 0.8553086519241333,
"learning_rate": 9.938566882624436e-07,
"epoch": 0.1467048710601719,
"step": 160
},
{
"loss": 1.4101,
"grad_norm": 0.8422027230262756,
"learning_rate": 9.911015705973398e-07,
"epoch": 0.15587392550143267,
"step": 170
},
{
"loss": 1.4131,
"grad_norm": 0.6763940453529358,
"learning_rate": 9.878428410862482e-07,
"epoch": 0.1650429799426934,
"step": 180
},
{
"loss": 1.4201,
"grad_norm": 0.5860380530357361,
"learning_rate": 9.8408384147125e-07,
"epoch": 0.17421203438395416,
"step": 190
},
{
"loss": 1.2782,
"grad_norm": 0.7507234215736389,
"learning_rate": 9.79828426508364e-07,
"epoch": 0.1833810888252149,
"step": 200
},
{
"eval_loss": 1.3075143098831177,
"eval_runtime": 168.9052,
"eval_samples_per_second": 5.494,
"eval_steps_per_second": 2.747,
"epoch": 0.1833810888252149,
"step": 200
},
{
"loss": 1.3912,
"grad_norm": 0.6225572228431702,
"learning_rate": 9.750809600145952e-07,
"epoch": 0.19255014326647565,
"step": 210
},
{
"loss": 1.2743,
"grad_norm": 0.5334329009056091,
"learning_rate": 9.698463103929541e-07,
"epoch": 0.2017191977077364,
"step": 220
},
{
"loss": 1.2105,
"grad_norm": 0.730050265789032,
"learning_rate": 9.641298456400363e-07,
"epoch": 0.21088825214899715,
"step": 230
},
{
"loss": 1.3692,
"grad_norm": 0.646460235118866,
"learning_rate": 9.579374278412817e-07,
"epoch": 0.22005730659025788,
"step": 240
},
{
"loss": 1.2971,
"grad_norm": 0.5395148992538452,
"learning_rate": 9.512754071595603e-07,
"epoch": 0.22922636103151864,
"step": 250
},
{
"eval_loss": 1.2624306678771973,
"eval_runtime": 157.9571,
"eval_samples_per_second": 5.875,
"eval_steps_per_second": 2.938,
"epoch": 0.22922636103151864,
"step": 250
},
{
"loss": 1.3346,
"grad_norm": 0.5858215093612671,
"learning_rate": 9.441506153232442e-07,
"epoch": 0.23839541547277937,
"step": 260
},
{
"loss": 1.2195,
"grad_norm": 0.5456379055976868,
"learning_rate": 9.365703586204494e-07,
"epoch": 0.2475644699140401,
"step": 270
},
{
"loss": 1.3763,
"grad_norm": 0.8651963472366333,
"learning_rate": 9.285424104066275e-07,
"epoch": 0.25673352435530084,
"step": 280
},
{
"loss": 1.2256,
"grad_norm": 0.5938352942466736,
"learning_rate": 9.20075003133193e-07,
"epoch": 0.2659025787965616,
"step": 290
},
{
"loss": 1.3137,
"grad_norm": 0.6355459690093994,
"learning_rate": 9.111768199053586e-07,
"epoch": 0.27507163323782235,
"step": 300
},
{
"eval_loss": 1.2268821001052856,
"eval_runtime": 164.1638,
"eval_samples_per_second": 5.653,
"eval_steps_per_second": 2.826,
"epoch": 0.27507163323782235,
"step": 300
},
{
"loss": 1.4409,
"grad_norm": 0.9009450078010559,
"learning_rate": 9.018569855778383e-07,
"epoch": 0.2842406876790831,
"step": 310
},
{
"loss": 1.2566,
"grad_norm": 0.5912006497383118,
"learning_rate": 8.921250573975455e-07,
"epoch": 0.2934097421203438,
"step": 320
},
{
"loss": 1.1593,
"grad_norm": 0.6118663549423218,
"learning_rate": 8.81991015202887e-07,
"epoch": 0.3025787965616046,
"step": 330
},
{
"loss": 1.2555,
"grad_norm": 0.581721305847168,
"learning_rate": 8.714652511896993e-07,
"epoch": 0.31174785100286534,
"step": 340
},
{
"loss": 1.1209,
"grad_norm": 0.5230151414871216,
"learning_rate": 8.605585592543211e-07,
"epoch": 0.3209169054441261,
"step": 350
},
{
"eval_loss": 1.1941955089569092,
"eval_runtime": 162.9119,
"eval_samples_per_second": 5.696,
"eval_steps_per_second": 2.848,
"epoch": 0.3209169054441261,
"step": 350
},
{
"loss": 1.3564,
"grad_norm": 0.565862238407135,
"learning_rate": 8.492821239247363e-07,
"epoch": 0.3300859598853868,
"step": 360
},
{
"loss": 1.3997,
"grad_norm": 0.6057285666465759,
"learning_rate": 8.376475088911317e-07,
"epoch": 0.33925501432664756,
"step": 370
},
{
"loss": 1.0878,
"grad_norm": 0.7517871260643005,
"learning_rate": 8.256666451476336e-07,
"epoch": 0.3484240687679083,
"step": 380
},
{
"loss": 1.3009,
"grad_norm": 0.7168652415275574,
"learning_rate": 8.133518187573862e-07,
"epoch": 0.35759312320916903,
"step": 390
},
{
"loss": 1.1606,
"grad_norm": 0.7250906825065613,
"learning_rate": 8.007156582535131e-07,
"epoch": 0.3667621776504298,
"step": 400
},
{
"eval_loss": 1.166169285774231,
"eval_runtime": 158.6155,
"eval_samples_per_second": 5.851,
"eval_steps_per_second": 2.925,
"epoch": 0.3667621776504298,
"step": 400
},
{
"loss": 1.2593,
"grad_norm": 0.6665163636207581,
"learning_rate": 7.877711216888867e-07,
"epoch": 0.37593123209169055,
"step": 410
},
{
"loss": 1.1365,
"grad_norm": 0.6199079751968384,
"learning_rate": 7.745314833479833e-07,
"epoch": 0.3851002865329513,
"step": 420
},
{
"loss": 1.19,
"grad_norm": 0.6242042779922485,
"learning_rate": 7.6101032013445e-07,
"epoch": 0.394269340974212,
"step": 430
},
{
"loss": 1.1405,
"grad_norm": 0.657778263092041,
"learning_rate": 7.472214976483451e-07,
"epoch": 0.4034383954154728,
"step": 440
},
{
"loss": 1.0791,
"grad_norm": 0.5634785890579224,
"learning_rate": 7.331791559673269e-07,
"epoch": 0.41260744985673353,
"step": 450
},
{
"eval_loss": 1.1434489488601685,
"eval_runtime": 153.2068,
"eval_samples_per_second": 6.057,
"eval_steps_per_second": 3.029,
"epoch": 0.41260744985673353,
"step": 450
},
{
"loss": 1.1352,
"grad_norm": 0.7913809418678284,
"learning_rate": 7.188976951463723e-07,
"epoch": 0.4217765042979943,
"step": 460
},
{
"loss": 1.3036,
"grad_norm": 0.6191056966781616,
"learning_rate": 7.043917604508971e-07,
"epoch": 0.430945558739255,
"step": 470
},
{
"loss": 1.121,
"grad_norm": 0.6955880522727966,
"learning_rate": 6.896762273384178e-07,
"epoch": 0.44011461318051576,
"step": 480
},
{
"loss": 1.0997,
"grad_norm": 0.8450888395309448,
"learning_rate": 6.747661862041585e-07,
"epoch": 0.4492836676217765,
"step": 490
},
{
"loss": 1.3604,
"grad_norm": 0.8735764622688293,
"learning_rate": 6.596769269062443e-07,
"epoch": 0.4584527220630373,
"step": 500
},
{
"eval_loss": 1.1247467994689941,
"eval_runtime": 152.4716,
"eval_samples_per_second": 6.086,
"eval_steps_per_second": 3.043,
"epoch": 0.4584527220630373,
"step": 500
},
{
"loss": 1.171,
"grad_norm": 0.7228217124938965,
"learning_rate": 6.444239230863504e-07,
"epoch": 0.467621776504298,
"step": 510
},
{
"loss": 1.03,
"grad_norm": 0.6075210571289062,
"learning_rate": 6.290228163018867e-07,
"epoch": 0.47679083094555874,
"step": 520
},
{
"loss": 1.0345,
"grad_norm": 0.7692680954933167,
"learning_rate": 6.134893999859886e-07,
"epoch": 0.4859598853868195,
"step": 530
},
{
"loss": 1.2594,
"grad_norm": 0.7600648403167725,
"learning_rate": 5.978396032517639e-07,
"epoch": 0.4951289398280802,
"step": 540
},
{
"loss": 1.0254,
"grad_norm": 0.6178115010261536,
"learning_rate": 5.820894745574025e-07,
"epoch": 0.504297994269341,
"step": 550
},
{
"eval_loss": 1.1089264154434204,
"eval_runtime": 151.8982,
"eval_samples_per_second": 6.109,
"eval_steps_per_second": 3.055,
"epoch": 0.504297994269341,
"step": 550
},
{
"loss": 1.1049,
"grad_norm": 0.5585054159164429,
"learning_rate": 5.662551652489008e-07,
"epoch": 0.5134670487106017,
"step": 560
},
{
"loss": 1.0898,
"grad_norm": 0.6844518780708313,
"learning_rate": 5.503529129972792e-07,
"epoch": 0.5226361031518625,
"step": 570
},
{
"loss": 1.1037,
"grad_norm": 0.8425552845001221,
"learning_rate": 5.34399025147273e-07,
"epoch": 0.5318051575931232,
"step": 580
},
{
"loss": 1.1019,
"grad_norm": 0.648064136505127,
"learning_rate": 5.18409861994576e-07,
"epoch": 0.540974212034384,
"step": 590
},
{
"loss": 1.1863,
"grad_norm": 0.5788621306419373,
"learning_rate": 5.024018200087854e-07,
"epoch": 0.5501432664756447,
"step": 600
},
{
"eval_loss": 1.095629096031189,
"eval_runtime": 150.8171,
"eval_samples_per_second": 6.153,
"eval_steps_per_second": 3.077,
"epoch": 0.5501432664756447,
"step": 600
},
{
"loss": 1.1025,
"grad_norm": 0.6422027349472046,
"learning_rate": 4.86391315019248e-07,
"epoch": 0.5593123209169054,
"step": 610
},
{
"loss": 1.0666,
"grad_norm": 0.6005454063415527,
"learning_rate": 4.703947653810575e-07,
"epoch": 0.5684813753581662,
"step": 620
},
{
"loss": 1.215,
"grad_norm": 0.6145904064178467,
"learning_rate": 4.544285751384584e-07,
"epoch": 0.5776504297994269,
"step": 630
},
{
"loss": 1.1613,
"grad_norm": 0.8756449818611145,
"learning_rate": 4.385091172029275e-07,
"epoch": 0.5868194842406876,
"step": 640
},
{
"loss": 1.1092,
"grad_norm": 0.7930067181587219,
"learning_rate": 4.2265271656318e-07,
"epoch": 0.5959885386819485,
"step": 650
},
{
"eval_loss": 1.0845845937728882,
"eval_runtime": 151.0566,
"eval_samples_per_second": 6.143,
"eval_steps_per_second": 3.072,
"epoch": 0.5959885386819485,
"step": 650
},
{
"loss": 1.214,
"grad_norm": 1.1016592979431152,
"learning_rate": 4.068756335443198e-07,
"epoch": 0.6051575931232092,
"step": 660
},
{
"loss": 1.3335,
"grad_norm": 0.7920063138008118,
"learning_rate": 3.9119404713330013e-07,
"epoch": 0.6143266475644699,
"step": 670
},
{
"loss": 1.123,
"grad_norm": 0.792630136013031,
"learning_rate": 3.7562403838779467e-07,
"epoch": 0.6234957020057307,
"step": 680
},
{
"loss": 1.2098,
"grad_norm": 0.8105105757713318,
"learning_rate": 3.601815739454928e-07,
"epoch": 0.6326647564469914,
"step": 690
},
{
"loss": 1.0302,
"grad_norm": 0.6204477548599243,
"learning_rate": 3.448824896507292e-07,
"epoch": 0.6418338108882522,
"step": 700
},
{
"eval_loss": 1.0753319263458252,
"eval_runtime": 159.1632,
"eval_samples_per_second": 5.83,
"eval_steps_per_second": 2.915,
"epoch": 0.6418338108882522,
"step": 700
},
{
"loss": 1.109,
"grad_norm": 0.6277522444725037,
"learning_rate": 3.297424743152381e-07,
"epoch": 0.6510028653295129,
"step": 710
},
{
"loss": 1.0808,
"grad_norm": 0.7879471182823181,
"learning_rate": 3.1477705362968696e-07,
"epoch": 0.6601719197707736,
"step": 720
},
{
"loss": 1.0842,
"grad_norm": 0.8374884128570557,
"learning_rate": 3.000015742424857e-07,
"epoch": 0.6693409742120344,
"step": 730
},
{
"loss": 1.2445,
"grad_norm": 0.7892112731933594,
"learning_rate": 2.85431188022199e-07,
"epoch": 0.6785100286532951,
"step": 740
},
{
"loss": 1.1506,
"grad_norm": 0.5540062785148621,
"learning_rate": 2.710808365197e-07,
"epoch": 0.6876790830945558,
"step": 750
},
{
"eval_loss": 1.0686043500900269,
"eval_runtime": 154.055,
"eval_samples_per_second": 6.024,
"eval_steps_per_second": 3.012,
"epoch": 0.6876790830945558,
"step": 750
},
{
"loss": 1.1023,
"grad_norm": 0.5218796133995056,
"learning_rate": 2.569652356460007e-07,
"epoch": 0.6968481375358166,
"step": 760
},
{
"loss": 1.1801,
"grad_norm": 0.7433627247810364,
"learning_rate": 2.430988605814691e-07,
"epoch": 0.7060171919770774,
"step": 770
},
{
"loss": 1.0276,
"grad_norm": 0.629487931728363,
"learning_rate": 2.294959309319086e-07,
"epoch": 0.7151862464183381,
"step": 780
},
{
"loss": 1.1164,
"grad_norm": 0.6667075157165527,
"learning_rate": 2.1617039614672378e-07,
"epoch": 0.7243553008595989,
"step": 790
},
{
"loss": 1.0869,
"grad_norm": 0.6930222511291504,
"learning_rate": 2.0313592121412464e-07,
"epoch": 0.7335243553008596,
"step": 800
},
{
"eval_loss": 1.0634101629257202,
"eval_runtime": 154.1494,
"eval_samples_per_second": 6.02,
"eval_steps_per_second": 3.01,
"epoch": 0.7335243553008596,
"step": 800
},
{
"loss": 1.0937,
"grad_norm": 0.7793363332748413,
"learning_rate": 1.904058726480367e-07,
"epoch": 0.7426934097421204,
"step": 810
},
{
"loss": 0.9728,
"grad_norm": 0.5570642948150635,
"learning_rate": 1.7799330478109026e-07,
"epoch": 0.7518624641833811,
"step": 820
},
{
"loss": 1.2568,
"grad_norm": 0.9017201662063599,
"learning_rate": 1.65910946377743e-07,
"epoch": 0.7610315186246418,
"step": 830
},
{
"loss": 1.0049,
"grad_norm": 0.7178328633308411,
"learning_rate": 1.5417118758126408e-07,
"epoch": 0.7702005730659026,
"step": 840
},
{
"loss": 1.0576,
"grad_norm": 1.031610369682312,
"learning_rate": 1.4278606720796543e-07,
"epoch": 0.7793696275071633,
"step": 850
},
{
"eval_loss": 1.0599370002746582,
"eval_runtime": 154.7364,
"eval_samples_per_second": 5.997,
"eval_steps_per_second": 2.999,
"epoch": 0.7793696275071633,
"step": 850
},
{
"loss": 1.1797,
"grad_norm": 1.0518614053726196,
"learning_rate": 1.3176726040171e-07,
"epoch": 0.788538681948424,
"step": 860
},
{
"loss": 1.2085,
"grad_norm": 0.7290861010551453,
"learning_rate": 1.21126066661356e-07,
"epoch": 0.7977077363896848,
"step": 870
},
{
"loss": 1.1165,
"grad_norm": 0.6315222382545471,
"learning_rate": 1.108733982534159e-07,
"epoch": 0.8068767908309455,
"step": 880
},
{
"loss": 1.158,
"grad_norm": 0.685243546962738,
"learning_rate": 1.0101976902181225e-07,
"epoch": 0.8160458452722062,
"step": 890
},
{
"loss": 1.0584,
"grad_norm": 0.7780338525772095,
"learning_rate": 9.157528360620415e-08,
"epoch": 0.8252148997134671,
"step": 900
},
{
"eval_loss": 1.057593584060669,
"eval_runtime": 153.5904,
"eval_samples_per_second": 6.042,
"eval_steps_per_second": 3.021,
"epoch": 0.8252148997134671,
"step": 900
},
{
"loss": 1.1489,
"grad_norm": 0.6839588284492493,
"learning_rate": 8.254962707994373e-08,
"epoch": 0.8343839541547278,
"step": 910
},
{
"loss": 1.1096,
"grad_norm": 0.9299020171165466,
"learning_rate": 7.395205501828577e-08,
"epoch": 0.8435530085959886,
"step": 920
},
{
"loss": 1.1224,
"grad_norm": 0.791289746761322,
"learning_rate": 6.579138400703715e-08,
"epoch": 0.8527220630372493,
"step": 930
},
{
"loss": 1.038,
"grad_norm": 0.6159808039665222,
"learning_rate": 5.807598260137758e-08,
"epoch": 0.86189111747851,
"step": 940
},
{
"loss": 1.0708,
"grad_norm": 0.7773894667625427,
"learning_rate": 5.08137627441253e-08,
"epoch": 0.8710601719197708,
"step": 950
},
{
"eval_loss": 1.0561405420303345,
"eval_runtime": 154.3615,
"eval_samples_per_second": 6.012,
"eval_steps_per_second": 3.006,
"epoch": 0.8710601719197708,
"step": 950
},
{
"loss": 1.0656,
"grad_norm": 0.707645833492279,
"learning_rate": 4.401217165224563e-08,
"epoch": 0.8802292263610315,
"step": 960
},
{
"loss": 1.0164,
"grad_norm": 0.6336905360221863,
"learning_rate": 3.767818417992446e-08,
"epoch": 0.8893982808022922,
"step": 970
},
{
"loss": 1.0766,
"grad_norm": 0.8207520842552185,
"learning_rate": 3.181829566603772e-08,
"epoch": 0.898567335243553,
"step": 980
},
{
"loss": 1.0929,
"grad_norm": 0.6286782026290894,
"learning_rate": 2.643851527335006e-08,
"epoch": 0.9077363896848137,
"step": 990
},
{
"loss": 1.2541,
"grad_norm": 0.817637026309967,
"learning_rate": 2.1544359826275726e-08,
"epoch": 0.9169054441260746,
"step": 1000
},
{
"eval_loss": 1.0553829669952393,
"eval_runtime": 155.0837,
"eval_samples_per_second": 5.984,
"eval_steps_per_second": 2.992,
"epoch": 0.9169054441260746,
"step": 1000
},
{
"loss": 1.0413,
"grad_norm": 0.9485034942626953,
"learning_rate": 1.714084815351913e-08,
"epoch": 0.9260744985673353,
"step": 1010
},
{
"loss": 1.0516,
"grad_norm": 0.6737267971038818,
"learning_rate": 1.3232495941396637e-08,
"epoch": 0.935243553008596,
"step": 1020
},
{
"loss": 1.0517,
"grad_norm": 0.9414446353912354,
"learning_rate": 9.82331110311857e-09,
"epoch": 0.9444126074498568,
"step": 1030
},
{
"loss": 1.1016,
"grad_norm": 0.8654493689537048,
"learning_rate": 6.916789668778122e-09,
"epoch": 0.9535816618911175,
"step": 1040
},
{
"loss": 1.1055,
"grad_norm": 0.8262504935264587,
"learning_rate": 4.515912200264427e-09,
"epoch": 0.9627507163323782,
"step": 1050
},
{
"eval_loss": 1.0550979375839233,
"eval_runtime": 156.3052,
"eval_samples_per_second": 5.937,
"eval_steps_per_second": 2.969,
"epoch": 0.9627507163323782,
"step": 1050
},
{
"loss": 1.1321,
"grad_norm": 0.7707592844963074,
"learning_rate": 2.6231407347736546e-09,
"epoch": 0.971919770773639,
"step": 1060
},
{
"loss": 1.1039,
"grad_norm": 0.7415518760681152,
"learning_rate": 1.2404162600541113e-09,
"epoch": 0.9810888252148997,
"step": 1070
},
{
"loss": 1.0673,
"grad_norm": 0.6835209727287292,
"learning_rate": 3.6915672397436204e-10,
"epoch": 0.9902578796561604,
"step": 1080
},
{
"loss": 1.1213,
"grad_norm": 0.6404680013656616,
"learning_rate": 1.0255580454254786e-11,
"epoch": 0.9994269340974212,
"step": 1090
},
{
"train_runtime": 10741.972,
"train_samples_per_second": 1.624,
"train_steps_per_second": 0.102,
"total_flos": 1.5816368624117146e+17,
"train_loss": 1.2491859223840436,
"epoch": 1.0,
"step": 1091,
"total_runtime_sec": 10743.41768693924
}
]
}
}