| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0977734753146176, |
| "eval_steps": 500, |
| "global_step": 1000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.030977734753146177, |
| "grad_norm": 99.0, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 1.9661, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.061955469506292354, |
| "grad_norm": 374.0, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 2.2124, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.09293320425943853, |
| "grad_norm": 1392640.0, |
| "learning_rate": 3e-06, |
| "loss": 2.2122, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.12391093901258471, |
| "grad_norm": 78.5, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 1.9339, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.15488867376573087, |
| "grad_norm": 13500416.0, |
| "learning_rate": 5e-06, |
| "loss": 1.8448, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.18586640851887706, |
| "grad_norm": 98.0, |
| "learning_rate": 6e-06, |
| "loss": 1.9577, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.21684414327202323, |
| "grad_norm": 11337728.0, |
| "learning_rate": 7.000000000000001e-06, |
| "loss": 2.089, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.24782187802516942, |
| "grad_norm": 20736.0, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 1.8571, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.2787996127783156, |
| "grad_norm": 8512.0, |
| "learning_rate": 9e-06, |
| "loss": 2.0639, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.30977734753146174, |
| "grad_norm": 1004.0, |
| "learning_rate": 1e-05, |
| "loss": 1.978, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.34075508228460794, |
| "grad_norm": 470.0, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": 1.9845, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.3717328170377541, |
| "grad_norm": 268435456.0, |
| "learning_rate": 1.2e-05, |
| "loss": 1.9825, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.4027105517909003, |
| "grad_norm": 50593792.0, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": 2.0309, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.43368828654404645, |
| "grad_norm": 942080.0, |
| "learning_rate": 1.4000000000000001e-05, |
| "loss": 1.804, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.46466602129719264, |
| "grad_norm": 93.5, |
| "learning_rate": 1.5e-05, |
| "loss": 1.9188, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.49564375605033884, |
| "grad_norm": 138.0, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 1.7755, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.526621490803485, |
| "grad_norm": 2310144.0, |
| "learning_rate": 1.7000000000000003e-05, |
| "loss": 1.9404, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.5575992255566312, |
| "grad_norm": 252928.0, |
| "learning_rate": 1.8e-05, |
| "loss": 1.9115, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.5885769603097774, |
| "grad_norm": 36096.0, |
| "learning_rate": 1.9e-05, |
| "loss": 2.0781, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.6195546950629235, |
| "grad_norm": 143.0, |
| "learning_rate": 2e-05, |
| "loss": 1.8338, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6505324298160697, |
| "grad_norm": 222208.0, |
| "learning_rate": 2.1e-05, |
| "loss": 1.9065, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.6815101645692159, |
| "grad_norm": 11599872.0, |
| "learning_rate": 2.2000000000000003e-05, |
| "loss": 1.8142, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.712487899322362, |
| "grad_norm": 3568.0, |
| "learning_rate": 2.3000000000000003e-05, |
| "loss": 1.7811, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.7434656340755083, |
| "grad_norm": 81.5, |
| "learning_rate": 2.4e-05, |
| "loss": 1.7512, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.7744433688286544, |
| "grad_norm": 3696.0, |
| "learning_rate": 2.5e-05, |
| "loss": 1.8697, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.8054211035818006, |
| "grad_norm": 63744.0, |
| "learning_rate": 2.6000000000000002e-05, |
| "loss": 1.7677, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.8363988383349468, |
| "grad_norm": 55.5, |
| "learning_rate": 2.7000000000000002e-05, |
| "loss": 1.7307, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.8673765730880929, |
| "grad_norm": 202.0, |
| "learning_rate": 2.8000000000000003e-05, |
| "loss": 1.9074, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.8983543078412392, |
| "grad_norm": 348.0, |
| "learning_rate": 2.9e-05, |
| "loss": 1.7037, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.9293320425943853, |
| "grad_norm": 342.0, |
| "learning_rate": 3e-05, |
| "loss": 1.5886, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.9603097773475314, |
| "grad_norm": 62.5, |
| "learning_rate": 3.1e-05, |
| "loss": 1.6942, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.9912875121006777, |
| "grad_norm": 91.5, |
| "learning_rate": 3.2000000000000005e-05, |
| "loss": 1.5326, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.0222652468538238, |
| "grad_norm": 6560.0, |
| "learning_rate": 3.3e-05, |
| "loss": 1.6642, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.05324298160697, |
| "grad_norm": 51.5, |
| "learning_rate": 3.4000000000000007e-05, |
| "loss": 1.5624, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.084220716360116, |
| "grad_norm": 1752.0, |
| "learning_rate": 3.5e-05, |
| "loss": 1.447, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.1151984511132624, |
| "grad_norm": 48.25, |
| "learning_rate": 3.6e-05, |
| "loss": 1.6133, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.1461761858664086, |
| "grad_norm": 52.25, |
| "learning_rate": 3.7e-05, |
| "loss": 1.4596, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.1771539206195547, |
| "grad_norm": 12032.0, |
| "learning_rate": 3.8e-05, |
| "loss": 1.5244, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.2081316553727008, |
| "grad_norm": 49.0, |
| "learning_rate": 3.9000000000000006e-05, |
| "loss": 1.5972, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.239109390125847, |
| "grad_norm": 39.25, |
| "learning_rate": 4e-05, |
| "loss": 1.2712, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.2700871248789931, |
| "grad_norm": 1120.0, |
| "learning_rate": 4.1e-05, |
| "loss": 1.4318, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.3010648596321395, |
| "grad_norm": 28160.0, |
| "learning_rate": 4.2e-05, |
| "loss": 1.3211, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.3320425943852856, |
| "grad_norm": 12845056.0, |
| "learning_rate": 4.3e-05, |
| "loss": 1.4051, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.3630203291384317, |
| "grad_norm": 6979584.0, |
| "learning_rate": 4.4000000000000006e-05, |
| "loss": 1.2505, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.3939980638915779, |
| "grad_norm": 27.75, |
| "learning_rate": 4.5e-05, |
| "loss": 1.1342, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.424975798644724, |
| "grad_norm": 73.0, |
| "learning_rate": 4.600000000000001e-05, |
| "loss": 1.2342, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.4559535333978704, |
| "grad_norm": 19.75, |
| "learning_rate": 4.7e-05, |
| "loss": 1.0688, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.4869312681510165, |
| "grad_norm": 604.0, |
| "learning_rate": 4.8e-05, |
| "loss": 1.0641, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.5179090029041626, |
| "grad_norm": 79.5, |
| "learning_rate": 4.9e-05, |
| "loss": 1.0869, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.5488867376573088, |
| "grad_norm": 2144.0, |
| "learning_rate": 5e-05, |
| "loss": 1.0356, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.5488867376573088, |
| "eval_loss": 0.21533620357513428, |
| "eval_runtime": 145.9664, |
| "eval_samples_per_second": 10.276, |
| "eval_steps_per_second": 2.569, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.579864472410455, |
| "grad_norm": 125.0, |
| "learning_rate": 5.1000000000000006e-05, |
| "loss": 1.0417, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.6108422071636013, |
| "grad_norm": 17.875, |
| "learning_rate": 5.2000000000000004e-05, |
| "loss": 1.0518, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.6418199419167472, |
| "grad_norm": 68.0, |
| "learning_rate": 5.300000000000001e-05, |
| "loss": 1.1404, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.6727976766698935, |
| "grad_norm": 19.5, |
| "learning_rate": 5.4000000000000005e-05, |
| "loss": 0.9938, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.7037754114230397, |
| "grad_norm": 290.0, |
| "learning_rate": 5.500000000000001e-05, |
| "loss": 0.9374, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.7347531461761858, |
| "grad_norm": 58.5, |
| "learning_rate": 5.6000000000000006e-05, |
| "loss": 1.0777, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.7657308809293322, |
| "grad_norm": 422.0, |
| "learning_rate": 5.6999999999999996e-05, |
| "loss": 1.059, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.796708615682478, |
| "grad_norm": 28.625, |
| "learning_rate": 5.8e-05, |
| "loss": 0.965, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.8276863504356244, |
| "grad_norm": 43.75, |
| "learning_rate": 5.9e-05, |
| "loss": 0.9527, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.8586640851887706, |
| "grad_norm": 12.75, |
| "learning_rate": 6e-05, |
| "loss": 0.8296, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.8896418199419167, |
| "grad_norm": 16.0, |
| "learning_rate": 6.1e-05, |
| "loss": 0.933, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.920619554695063, |
| "grad_norm": 11.875, |
| "learning_rate": 6.2e-05, |
| "loss": 0.8117, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.951597289448209, |
| "grad_norm": 50.0, |
| "learning_rate": 6.3e-05, |
| "loss": 0.9475, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.9825750242013553, |
| "grad_norm": 143.0, |
| "learning_rate": 6.400000000000001e-05, |
| "loss": 0.8241, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.0135527589545013, |
| "grad_norm": 108.5, |
| "learning_rate": 6.500000000000001e-05, |
| "loss": 0.7861, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.0445304937076476, |
| "grad_norm": 13184.0, |
| "learning_rate": 6.6e-05, |
| "loss": 0.8384, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.075508228460794, |
| "grad_norm": 1736704.0, |
| "learning_rate": 6.7e-05, |
| "loss": 0.8896, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.10648596321394, |
| "grad_norm": 48.75, |
| "learning_rate": 6.800000000000001e-05, |
| "loss": 0.9377, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.1374636979670862, |
| "grad_norm": 2816.0, |
| "learning_rate": 6.9e-05, |
| "loss": 0.8322, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.168441432720232, |
| "grad_norm": 3.125, |
| "learning_rate": 7e-05, |
| "loss": 0.8397, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.1994191674733785, |
| "grad_norm": 8.6875, |
| "learning_rate": 7.1e-05, |
| "loss": 0.9265, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.230396902226525, |
| "grad_norm": 9.875, |
| "learning_rate": 7.2e-05, |
| "loss": 0.8141, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.261374636979671, |
| "grad_norm": 14.8125, |
| "learning_rate": 7.3e-05, |
| "loss": 0.7629, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.292352371732817, |
| "grad_norm": 6.4375, |
| "learning_rate": 7.4e-05, |
| "loss": 0.85, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.323330106485963, |
| "grad_norm": 3.796875, |
| "learning_rate": 7.500000000000001e-05, |
| "loss": 0.9116, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.3543078412391094, |
| "grad_norm": 544768.0, |
| "learning_rate": 7.6e-05, |
| "loss": 0.8437, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.3852855759922553, |
| "grad_norm": 117760.0, |
| "learning_rate": 7.7e-05, |
| "loss": 0.9072, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.4162633107454017, |
| "grad_norm": 22.75, |
| "learning_rate": 7.800000000000001e-05, |
| "loss": 1.0168, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.447241045498548, |
| "grad_norm": 2211840.0, |
| "learning_rate": 7.900000000000001e-05, |
| "loss": 1.1433, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.478218780251694, |
| "grad_norm": 41.5, |
| "learning_rate": 8e-05, |
| "loss": 0.7485, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.5091965150048403, |
| "grad_norm": 1810432.0, |
| "learning_rate": 8.1e-05, |
| "loss": 0.9516, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.5401742497579862, |
| "grad_norm": 1081344.0, |
| "learning_rate": 8.2e-05, |
| "loss": 1.0742, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.5711519845111326, |
| "grad_norm": 337641472.0, |
| "learning_rate": 8.3e-05, |
| "loss": 1.023, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.602129719264279, |
| "grad_norm": 14548992.0, |
| "learning_rate": 8.4e-05, |
| "loss": 1.185, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.633107454017425, |
| "grad_norm": 4.71875, |
| "learning_rate": 8.5e-05, |
| "loss": 1.3584, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.664085188770571, |
| "grad_norm": 618496.0, |
| "learning_rate": 8.6e-05, |
| "loss": 0.9947, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.695062923523717, |
| "grad_norm": 22151168.0, |
| "learning_rate": 8.7e-05, |
| "loss": 1.0296, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.7260406582768635, |
| "grad_norm": 77824.0, |
| "learning_rate": 8.800000000000001e-05, |
| "loss": 0.8889, |
| "step": 880 |
| }, |
| { |
| "epoch": 2.75701839303001, |
| "grad_norm": 2.640625, |
| "learning_rate": 8.900000000000001e-05, |
| "loss": 0.733, |
| "step": 890 |
| }, |
| { |
| "epoch": 2.7879961277831558, |
| "grad_norm": 18.625, |
| "learning_rate": 9e-05, |
| "loss": 0.7786, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.818973862536302, |
| "grad_norm": 9.0625, |
| "learning_rate": 9.1e-05, |
| "loss": 0.7221, |
| "step": 910 |
| }, |
| { |
| "epoch": 2.849951597289448, |
| "grad_norm": 10.4375, |
| "learning_rate": 9.200000000000001e-05, |
| "loss": 0.6316, |
| "step": 920 |
| }, |
| { |
| "epoch": 2.8809293320425944, |
| "grad_norm": 4.84375, |
| "learning_rate": 9.300000000000001e-05, |
| "loss": 0.7015, |
| "step": 930 |
| }, |
| { |
| "epoch": 2.9119070667957407, |
| "grad_norm": 4.1875, |
| "learning_rate": 9.4e-05, |
| "loss": 0.7161, |
| "step": 940 |
| }, |
| { |
| "epoch": 2.9428848015488867, |
| "grad_norm": 2.8125, |
| "learning_rate": 9.5e-05, |
| "loss": 0.7325, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.973862536302033, |
| "grad_norm": 5.90625, |
| "learning_rate": 9.6e-05, |
| "loss": 0.6447, |
| "step": 960 |
| }, |
| { |
| "epoch": 3.004840271055179, |
| "grad_norm": 1.6015625, |
| "learning_rate": 9.7e-05, |
| "loss": 0.7079, |
| "step": 970 |
| }, |
| { |
| "epoch": 3.0358180058083253, |
| "grad_norm": 3.8125, |
| "learning_rate": 9.8e-05, |
| "loss": 0.6075, |
| "step": 980 |
| }, |
| { |
| "epoch": 3.0667957405614716, |
| "grad_norm": 1.9140625, |
| "learning_rate": 9.900000000000001e-05, |
| "loss": 0.697, |
| "step": 990 |
| }, |
| { |
| "epoch": 3.0977734753146176, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.0001, |
| "loss": 0.7175, |
| "step": 1000 |
| }, |
| { |
| "epoch": 3.0977734753146176, |
| "eval_loss": 0.1426076889038086, |
| "eval_runtime": 146.0018, |
| "eval_samples_per_second": 10.274, |
| "eval_steps_per_second": 2.568, |
| "step": 1000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 9660, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 30, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.0946451186709955e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|