| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.33288948069241014, |
| "eval_steps": 500, |
| "global_step": 1000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.003328894806924101, |
| "grad_norm": 37.75, |
| "learning_rate": 1.9933422103861518e-05, |
| "loss": 1.4084, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.006657789613848202, |
| "grad_norm": 12.125, |
| "learning_rate": 1.9866844207723038e-05, |
| "loss": 1.0474, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.009986684420772303, |
| "grad_norm": 45.25, |
| "learning_rate": 1.9800266311584554e-05, |
| "loss": 1.2455, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.013315579227696404, |
| "grad_norm": 13.9375, |
| "learning_rate": 1.9733688415446073e-05, |
| "loss": 1.1969, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.016644474034620507, |
| "grad_norm": 32.0, |
| "learning_rate": 1.966711051930759e-05, |
| "loss": 1.1659, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.019973368841544607, |
| "grad_norm": 19.0, |
| "learning_rate": 1.960053262316911e-05, |
| "loss": 1.1159, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.02330226364846871, |
| "grad_norm": 27.5, |
| "learning_rate": 1.953395472703063e-05, |
| "loss": 1.1861, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.02663115845539281, |
| "grad_norm": 17.5, |
| "learning_rate": 1.9467376830892145e-05, |
| "loss": 1.1073, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.02996005326231691, |
| "grad_norm": 33.75, |
| "learning_rate": 1.9400798934753665e-05, |
| "loss": 1.1506, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.033288948069241014, |
| "grad_norm": 9.5, |
| "learning_rate": 1.933422103861518e-05, |
| "loss": 1.246, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.03661784287616511, |
| "grad_norm": 11.3125, |
| "learning_rate": 1.92676431424767e-05, |
| "loss": 1.1708, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.03994673768308921, |
| "grad_norm": 14.375, |
| "learning_rate": 1.9201065246338217e-05, |
| "loss": 1.1302, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.043275632490013316, |
| "grad_norm": 20.625, |
| "learning_rate": 1.9134487350199737e-05, |
| "loss": 1.0537, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.04660452729693742, |
| "grad_norm": 21.75, |
| "learning_rate": 1.9067909454061253e-05, |
| "loss": 1.0669, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.049933422103861515, |
| "grad_norm": 14.125, |
| "learning_rate": 1.900133155792277e-05, |
| "loss": 1.0482, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.05326231691078562, |
| "grad_norm": 27.625, |
| "learning_rate": 1.893475366178429e-05, |
| "loss": 1.1468, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.05659121171770972, |
| "grad_norm": 4.53125, |
| "learning_rate": 1.8868175765645805e-05, |
| "loss": 1.092, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.05992010652463382, |
| "grad_norm": 32.5, |
| "learning_rate": 1.8801597869507325e-05, |
| "loss": 1.2208, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.06324900133155792, |
| "grad_norm": 56.0, |
| "learning_rate": 1.873501997336884e-05, |
| "loss": 1.2359, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.06657789613848203, |
| "grad_norm": 8.375, |
| "learning_rate": 1.866844207723036e-05, |
| "loss": 1.0712, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.06990679094540612, |
| "grad_norm": 26.375, |
| "learning_rate": 1.860186418109188e-05, |
| "loss": 1.0156, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.07323568575233022, |
| "grad_norm": 12.0, |
| "learning_rate": 1.8535286284953397e-05, |
| "loss": 1.0407, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.07656458055925433, |
| "grad_norm": 13.75, |
| "learning_rate": 1.8468708388814916e-05, |
| "loss": 1.0161, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.07989347536617843, |
| "grad_norm": 27.125, |
| "learning_rate": 1.8402130492676432e-05, |
| "loss": 1.1466, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.08322237017310254, |
| "grad_norm": 26.125, |
| "learning_rate": 1.8335552596537952e-05, |
| "loss": 1.3192, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.08655126498002663, |
| "grad_norm": 25.75, |
| "learning_rate": 1.826897470039947e-05, |
| "loss": 1.0628, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.08988015978695073, |
| "grad_norm": 29.25, |
| "learning_rate": 1.8202396804260988e-05, |
| "loss": 0.967, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.09320905459387484, |
| "grad_norm": 29.75, |
| "learning_rate": 1.8135818908122504e-05, |
| "loss": 1.0356, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.09653794940079893, |
| "grad_norm": 11.6875, |
| "learning_rate": 1.806924101198402e-05, |
| "loss": 0.9853, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.09986684420772303, |
| "grad_norm": 27.5, |
| "learning_rate": 1.800266311584554e-05, |
| "loss": 1.0279, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.10319573901464714, |
| "grad_norm": 5.65625, |
| "learning_rate": 1.7936085219707056e-05, |
| "loss": 1.0115, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.10652463382157124, |
| "grad_norm": 8.1875, |
| "learning_rate": 1.7869507323568576e-05, |
| "loss": 0.942, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.10985352862849534, |
| "grad_norm": 65.0, |
| "learning_rate": 1.7802929427430096e-05, |
| "loss": 1.0208, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.11318242343541944, |
| "grad_norm": 9.625, |
| "learning_rate": 1.7736351531291612e-05, |
| "loss": 1.1729, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.11651131824234354, |
| "grad_norm": 17.5, |
| "learning_rate": 1.766977363515313e-05, |
| "loss": 0.9322, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.11984021304926765, |
| "grad_norm": 28.125, |
| "learning_rate": 1.7603195739014648e-05, |
| "loss": 1.0417, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.12316910785619174, |
| "grad_norm": 18.25, |
| "learning_rate": 1.7536617842876168e-05, |
| "loss": 1.2491, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.12649800266311584, |
| "grad_norm": 27.375, |
| "learning_rate": 1.7470039946737684e-05, |
| "loss": 0.9388, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.12982689747003995, |
| "grad_norm": 22.125, |
| "learning_rate": 1.7403462050599203e-05, |
| "loss": 1.0488, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.13315579227696406, |
| "grad_norm": 21.25, |
| "learning_rate": 1.733688415446072e-05, |
| "loss": 0.9951, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.13648468708388814, |
| "grad_norm": 11.75, |
| "learning_rate": 1.727030625832224e-05, |
| "loss": 0.9212, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.13981358189081225, |
| "grad_norm": 16.25, |
| "learning_rate": 1.7203728362183756e-05, |
| "loss": 1.0548, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.14314247669773636, |
| "grad_norm": 19.875, |
| "learning_rate": 1.7137150466045275e-05, |
| "loss": 1.0238, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.14647137150466044, |
| "grad_norm": 18.875, |
| "learning_rate": 1.707057256990679e-05, |
| "loss": 1.0327, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.14980026631158455, |
| "grad_norm": 7.84375, |
| "learning_rate": 1.7003994673768308e-05, |
| "loss": 1.1155, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.15312916111850866, |
| "grad_norm": 14.125, |
| "learning_rate": 1.693741677762983e-05, |
| "loss": 0.9627, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.15645805592543274, |
| "grad_norm": 7.4375, |
| "learning_rate": 1.6870838881491347e-05, |
| "loss": 1.0216, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.15978695073235685, |
| "grad_norm": 9.375, |
| "learning_rate": 1.6804260985352863e-05, |
| "loss": 1.0489, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.16311584553928096, |
| "grad_norm": 19.5, |
| "learning_rate": 1.6737683089214383e-05, |
| "loss": 1.1254, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.16644474034620507, |
| "grad_norm": 26.125, |
| "learning_rate": 1.66711051930759e-05, |
| "loss": 1.0134, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.16644474034620507, |
| "eval_accuracy": 0.5015762402521985, |
| "eval_loss": 0.9942083358764648, |
| "eval_runtime": 211.8123, |
| "eval_samples_per_second": 113.818, |
| "eval_steps_per_second": 28.454, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.16977363515312915, |
| "grad_norm": 12.3125, |
| "learning_rate": 1.660452729693742e-05, |
| "loss": 1.0164, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.17310252996005326, |
| "grad_norm": 38.75, |
| "learning_rate": 1.6537949400798935e-05, |
| "loss": 1.0042, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.17643142476697737, |
| "grad_norm": 20.125, |
| "learning_rate": 1.6471371504660455e-05, |
| "loss": 0.9786, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.17976031957390146, |
| "grad_norm": 11.4375, |
| "learning_rate": 1.640479360852197e-05, |
| "loss": 0.9678, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.18308921438082557, |
| "grad_norm": 19.375, |
| "learning_rate": 1.633821571238349e-05, |
| "loss": 1.0477, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.18641810918774968, |
| "grad_norm": 16.75, |
| "learning_rate": 1.6271637816245007e-05, |
| "loss": 1.02, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.18974700399467376, |
| "grad_norm": 15.5, |
| "learning_rate": 1.6205059920106527e-05, |
| "loss": 0.9864, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.19307589880159787, |
| "grad_norm": 9.75, |
| "learning_rate": 1.6138482023968043e-05, |
| "loss": 1.0019, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.19640479360852198, |
| "grad_norm": 12.125, |
| "learning_rate": 1.6071904127829563e-05, |
| "loss": 0.9483, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.19973368841544606, |
| "grad_norm": 30.875, |
| "learning_rate": 1.6005326231691082e-05, |
| "loss": 1.0692, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.20306258322237017, |
| "grad_norm": 10.125, |
| "learning_rate": 1.59387483355526e-05, |
| "loss": 0.9279, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.20639147802929428, |
| "grad_norm": 11.6875, |
| "learning_rate": 1.5872170439414115e-05, |
| "loss": 0.941, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.2097203728362184, |
| "grad_norm": 3.03125, |
| "learning_rate": 1.5805592543275634e-05, |
| "loss": 1.0259, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.21304926764314247, |
| "grad_norm": 10.4375, |
| "learning_rate": 1.573901464713715e-05, |
| "loss": 0.9687, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.21637816245006658, |
| "grad_norm": 12.8125, |
| "learning_rate": 1.567243675099867e-05, |
| "loss": 0.9512, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.2197070572569907, |
| "grad_norm": 9.75, |
| "learning_rate": 1.5605858854860187e-05, |
| "loss": 0.9569, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.22303595206391477, |
| "grad_norm": 17.375, |
| "learning_rate": 1.5539280958721706e-05, |
| "loss": 0.9694, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.22636484687083888, |
| "grad_norm": 9.625, |
| "learning_rate": 1.5472703062583222e-05, |
| "loss": 0.97, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.229693741677763, |
| "grad_norm": 21.0, |
| "learning_rate": 1.5406125166444742e-05, |
| "loss": 1.0506, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.23302263648468707, |
| "grad_norm": 10.3125, |
| "learning_rate": 1.533954727030626e-05, |
| "loss": 0.9184, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.23635153129161118, |
| "grad_norm": 21.5, |
| "learning_rate": 1.5272969374167778e-05, |
| "loss": 1.005, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.2396804260985353, |
| "grad_norm": 4.40625, |
| "learning_rate": 1.5206391478029296e-05, |
| "loss": 0.8624, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.24300932090545938, |
| "grad_norm": 13.9375, |
| "learning_rate": 1.5139813581890814e-05, |
| "loss": 1.0051, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.24633821571238348, |
| "grad_norm": 16.625, |
| "learning_rate": 1.5073235685752332e-05, |
| "loss": 1.0378, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.2496671105193076, |
| "grad_norm": 6.4375, |
| "learning_rate": 1.500665778961385e-05, |
| "loss": 0.9191, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.2529960053262317, |
| "grad_norm": 5.5625, |
| "learning_rate": 1.4940079893475368e-05, |
| "loss": 0.9722, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.2563249001331558, |
| "grad_norm": 22.625, |
| "learning_rate": 1.4873501997336886e-05, |
| "loss": 1.014, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.2596537949400799, |
| "grad_norm": 21.0, |
| "learning_rate": 1.4806924101198404e-05, |
| "loss": 1.0986, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.262982689747004, |
| "grad_norm": 10.4375, |
| "learning_rate": 1.4740346205059922e-05, |
| "loss": 0.9672, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.2663115845539281, |
| "grad_norm": 17.0, |
| "learning_rate": 1.467376830892144e-05, |
| "loss": 0.9605, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.26964047936085217, |
| "grad_norm": 31.75, |
| "learning_rate": 1.4607190412782957e-05, |
| "loss": 1.0308, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.2729693741677763, |
| "grad_norm": 12.4375, |
| "learning_rate": 1.4540612516644474e-05, |
| "loss": 0.9268, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.2762982689747004, |
| "grad_norm": 17.125, |
| "learning_rate": 1.4474034620505992e-05, |
| "loss": 0.9656, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.2796271637816245, |
| "grad_norm": 4.53125, |
| "learning_rate": 1.440745672436751e-05, |
| "loss": 0.9865, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.2829560585885486, |
| "grad_norm": 22.625, |
| "learning_rate": 1.434087882822903e-05, |
| "loss": 1.1118, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.2862849533954727, |
| "grad_norm": 13.0, |
| "learning_rate": 1.4274300932090547e-05, |
| "loss": 1.0043, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.28961384820239683, |
| "grad_norm": 9.0625, |
| "learning_rate": 1.4207723035952065e-05, |
| "loss": 0.9768, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.2929427430093209, |
| "grad_norm": 6.75, |
| "learning_rate": 1.4141145139813583e-05, |
| "loss": 1.0237, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.296271637816245, |
| "grad_norm": 10.875, |
| "learning_rate": 1.4074567243675101e-05, |
| "loss": 1.1597, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.2996005326231691, |
| "grad_norm": 5.75, |
| "learning_rate": 1.4007989347536619e-05, |
| "loss": 0.7993, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.3029294274300932, |
| "grad_norm": 31.5, |
| "learning_rate": 1.3941411451398137e-05, |
| "loss": 0.9802, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.3062583222370173, |
| "grad_norm": 12.25, |
| "learning_rate": 1.3874833555259655e-05, |
| "loss": 0.9565, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.30958721704394143, |
| "grad_norm": 11.8125, |
| "learning_rate": 1.3808255659121173e-05, |
| "loss": 1.0024, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.3129161118508655, |
| "grad_norm": 16.625, |
| "learning_rate": 1.3741677762982691e-05, |
| "loss": 1.0209, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.3162450066577896, |
| "grad_norm": 13.375, |
| "learning_rate": 1.3675099866844209e-05, |
| "loss": 0.8767, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.3195739014647137, |
| "grad_norm": 20.875, |
| "learning_rate": 1.3608521970705725e-05, |
| "loss": 0.9642, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.3229027962716378, |
| "grad_norm": 17.25, |
| "learning_rate": 1.3541944074567243e-05, |
| "loss": 0.9508, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.3262316910785619, |
| "grad_norm": 22.25, |
| "learning_rate": 1.3475366178428764e-05, |
| "loss": 1.1324, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.32956058588548603, |
| "grad_norm": 11.5, |
| "learning_rate": 1.3408788282290282e-05, |
| "loss": 1.0655, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.33288948069241014, |
| "grad_norm": 15.625, |
| "learning_rate": 1.3342210386151799e-05, |
| "loss": 0.9608, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.33288948069241014, |
| "eval_accuracy": 0.507300481168077, |
| "eval_loss": 0.9467151165008545, |
| "eval_runtime": 211.5004, |
| "eval_samples_per_second": 113.986, |
| "eval_steps_per_second": 28.496, |
| "step": 1000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 3004, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|