| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 248, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.004032258064516129, | |
| "grad_norm": 23.183565139770508, | |
| "learning_rate": 0.0, | |
| "loss": 1.0275, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.008064516129032258, | |
| "grad_norm": 23.37139320373535, | |
| "learning_rate": 6.666666666666668e-06, | |
| "loss": 1.0547, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.012096774193548387, | |
| "grad_norm": 22.907167434692383, | |
| "learning_rate": 1.0566416671474378e-05, | |
| "loss": 1.0348, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.016129032258064516, | |
| "grad_norm": 15.111400604248047, | |
| "learning_rate": 1.3333333333333337e-05, | |
| "loss": 0.6538, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.020161290322580645, | |
| "grad_norm": 6.891965389251709, | |
| "learning_rate": 1.5479520632582417e-05, | |
| "loss": 0.253, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.024193548387096774, | |
| "grad_norm": 5.94376802444458, | |
| "learning_rate": 1.7233083338141044e-05, | |
| "loss": 0.3932, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.028225806451612902, | |
| "grad_norm": 2.6044232845306396, | |
| "learning_rate": 1.8715699480384028e-05, | |
| "loss": 0.2468, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.03225806451612903, | |
| "grad_norm": 5.8222126960754395, | |
| "learning_rate": 2e-05, | |
| "loss": 0.3847, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.036290322580645164, | |
| "grad_norm": 2.5366413593292236, | |
| "learning_rate": 2e-05, | |
| "loss": 0.306, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.04032258064516129, | |
| "grad_norm": 1.6535553932189941, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2243, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.04435483870967742, | |
| "grad_norm": 1.9359959363937378, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2214, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.04838709677419355, | |
| "grad_norm": 2.255415201187134, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2351, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.05241935483870968, | |
| "grad_norm": 1.2205442190170288, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1992, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.056451612903225805, | |
| "grad_norm": 2.05025053024292, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2232, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.06048387096774194, | |
| "grad_norm": 1.5063331127166748, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2107, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.06451612903225806, | |
| "grad_norm": 1.1306530237197876, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2015, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.06854838709677419, | |
| "grad_norm": 0.8307346701622009, | |
| "learning_rate": 2e-05, | |
| "loss": 0.191, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.07258064516129033, | |
| "grad_norm": 0.6416309475898743, | |
| "learning_rate": 2e-05, | |
| "loss": 0.184, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.07661290322580645, | |
| "grad_norm": 1.3595116138458252, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.08064516129032258, | |
| "grad_norm": 0.6382694840431213, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1793, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0846774193548387, | |
| "grad_norm": 1.1086512804031372, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1925, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.08870967741935484, | |
| "grad_norm": 0.6290364861488342, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1828, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.09274193548387097, | |
| "grad_norm": 1.621184229850769, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1849, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.0967741935483871, | |
| "grad_norm": 1.1934734582901, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1978, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.10080645161290322, | |
| "grad_norm": 0.5095123648643494, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1823, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.10483870967741936, | |
| "grad_norm": 0.9218118786811829, | |
| "learning_rate": 2e-05, | |
| "loss": 0.191, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.10887096774193548, | |
| "grad_norm": 0.7978827357292175, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1766, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.11290322580645161, | |
| "grad_norm": 0.8179630637168884, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1868, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.11693548387096774, | |
| "grad_norm": 0.5699785351753235, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1869, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.12096774193548387, | |
| "grad_norm": 0.7835913300514221, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1973, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.125, | |
| "grad_norm": 0.5692117810249329, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1759, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.12903225806451613, | |
| "grad_norm": 0.803184449672699, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1907, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.13306451612903225, | |
| "grad_norm": 0.5545604228973389, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1676, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.13709677419354838, | |
| "grad_norm": 0.3685874342918396, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1766, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.14112903225806453, | |
| "grad_norm": 0.6002175807952881, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1836, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.14516129032258066, | |
| "grad_norm": 0.5726589560508728, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1757, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.14919354838709678, | |
| "grad_norm": 0.5990753769874573, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1814, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.1532258064516129, | |
| "grad_norm": 0.5180577039718628, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1725, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.15725806451612903, | |
| "grad_norm": 0.6645565629005432, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1856, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.16129032258064516, | |
| "grad_norm": 0.7106342911720276, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1679, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.16532258064516128, | |
| "grad_norm": 0.8131007552146912, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1806, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.1693548387096774, | |
| "grad_norm": 0.8143223524093628, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1689, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.17338709677419356, | |
| "grad_norm": 0.6998667120933533, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1681, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.1774193548387097, | |
| "grad_norm": 0.9026826620101929, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1597, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.1814516129032258, | |
| "grad_norm": 1.212770700454712, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1707, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.18548387096774194, | |
| "grad_norm": 0.8376269936561584, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1552, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.18951612903225806, | |
| "grad_norm": 0.9025837182998657, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1633, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.1935483870967742, | |
| "grad_norm": 0.9542744159698486, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1749, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.1975806451612903, | |
| "grad_norm": 0.7638697624206543, | |
| "learning_rate": 2e-05, | |
| "loss": 0.158, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.20161290322580644, | |
| "grad_norm": 0.5949487090110779, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1487, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.2056451612903226, | |
| "grad_norm": 1.0373241901397705, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1519, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.20967741935483872, | |
| "grad_norm": 0.5316594243049622, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1364, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.21370967741935484, | |
| "grad_norm": 0.624768078327179, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1449, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.21774193548387097, | |
| "grad_norm": 0.7126561403274536, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1426, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.2217741935483871, | |
| "grad_norm": 1.0926051139831543, | |
| "learning_rate": 2e-05, | |
| "loss": 0.142, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.22580645161290322, | |
| "grad_norm": 1.873496651649475, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1578, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.22983870967741934, | |
| "grad_norm": 0.8202502727508545, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1462, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.23387096774193547, | |
| "grad_norm": 0.6349180936813354, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1326, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.23790322580645162, | |
| "grad_norm": 1.0204631090164185, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1385, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.24193548387096775, | |
| "grad_norm": 0.8092764616012573, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1341, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.24596774193548387, | |
| "grad_norm": 1.0302892923355103, | |
| "learning_rate": 2e-05, | |
| "loss": 0.144, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 1.2825901508331299, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1391, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.2540322580645161, | |
| "grad_norm": 0.873502790927887, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1331, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.25806451612903225, | |
| "grad_norm": 0.8886832594871521, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1248, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.2620967741935484, | |
| "grad_norm": 0.7013624906539917, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1164, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.2661290322580645, | |
| "grad_norm": 0.7485561966896057, | |
| "learning_rate": 2e-05, | |
| "loss": 0.118, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.2701612903225806, | |
| "grad_norm": 0.7874916791915894, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1116, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.27419354838709675, | |
| "grad_norm": 0.8042868375778198, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1215, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.2782258064516129, | |
| "grad_norm": 0.604430615901947, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1109, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.28225806451612906, | |
| "grad_norm": 0.976264476776123, | |
| "learning_rate": 2e-05, | |
| "loss": 0.124, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.2862903225806452, | |
| "grad_norm": 1.0005311965942383, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1112, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.2903225806451613, | |
| "grad_norm": 0.6228518486022949, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1049, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.29435483870967744, | |
| "grad_norm": 0.7674490809440613, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1091, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.29838709677419356, | |
| "grad_norm": 1.027273416519165, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1147, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.3024193548387097, | |
| "grad_norm": 0.6840062737464905, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0962, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.3064516129032258, | |
| "grad_norm": 0.5666499137878418, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0956, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.31048387096774194, | |
| "grad_norm": 0.594052791595459, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0968, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.31451612903225806, | |
| "grad_norm": 0.7595533132553101, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0927, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.3185483870967742, | |
| "grad_norm": 0.7487107515335083, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1027, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.3225806451612903, | |
| "grad_norm": 0.5936404466629028, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0884, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.32661290322580644, | |
| "grad_norm": 0.4667339622974396, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0946, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.33064516129032256, | |
| "grad_norm": 0.8685793280601501, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1011, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.3346774193548387, | |
| "grad_norm": 0.7018740177154541, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0872, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.3387096774193548, | |
| "grad_norm": 1.0336928367614746, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1042, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.34274193548387094, | |
| "grad_norm": 0.6793813705444336, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0924, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.3467741935483871, | |
| "grad_norm": 1.0464022159576416, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0955, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.35080645161290325, | |
| "grad_norm": 0.7015179991722107, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0928, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.3548387096774194, | |
| "grad_norm": 0.7370674014091492, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0895, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.3588709677419355, | |
| "grad_norm": 0.5556283593177795, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0895, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.3629032258064516, | |
| "grad_norm": 0.6646509766578674, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0871, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.36693548387096775, | |
| "grad_norm": 0.7947157025337219, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0939, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.3709677419354839, | |
| "grad_norm": 0.6044544577598572, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0857, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.375, | |
| "grad_norm": 0.7086596488952637, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0919, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.3790322580645161, | |
| "grad_norm": 0.6559664011001587, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0856, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.38306451612903225, | |
| "grad_norm": 0.784209132194519, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0836, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.3870967741935484, | |
| "grad_norm": 0.5902017951011658, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0747, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.3911290322580645, | |
| "grad_norm": 0.697828471660614, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0776, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.3951612903225806, | |
| "grad_norm": 0.5101798176765442, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0777, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.39919354838709675, | |
| "grad_norm": 0.8497079610824585, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0832, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.4032258064516129, | |
| "grad_norm": 0.5198425054550171, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0786, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.40725806451612906, | |
| "grad_norm": 0.6342234015464783, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0801, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.4112903225806452, | |
| "grad_norm": 0.4612491726875305, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0805, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.4153225806451613, | |
| "grad_norm": 0.8742281198501587, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0882, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.41935483870967744, | |
| "grad_norm": 0.3069051206111908, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0762, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.42338709677419356, | |
| "grad_norm": 0.7006452083587646, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0744, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.4274193548387097, | |
| "grad_norm": 0.514578640460968, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0827, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.4314516129032258, | |
| "grad_norm": 1.0400453805923462, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0819, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.43548387096774194, | |
| "grad_norm": 0.7127644419670105, | |
| "learning_rate": 2e-05, | |
| "loss": 0.083, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.43951612903225806, | |
| "grad_norm": 0.617011308670044, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0705, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.4435483870967742, | |
| "grad_norm": 0.5836071968078613, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0777, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.4475806451612903, | |
| "grad_norm": 0.6622437238693237, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0747, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.45161290322580644, | |
| "grad_norm": 0.7056003212928772, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0715, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.45564516129032256, | |
| "grad_norm": 0.6626383662223816, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0776, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.4596774193548387, | |
| "grad_norm": 0.7465190291404724, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0856, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.4637096774193548, | |
| "grad_norm": 0.5531803369522095, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0679, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.46774193548387094, | |
| "grad_norm": 0.4788318872451782, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0648, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.4717741935483871, | |
| "grad_norm": 0.6184081435203552, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0801, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.47580645161290325, | |
| "grad_norm": 0.6424548029899597, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0789, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.4798387096774194, | |
| "grad_norm": 0.7118510007858276, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0735, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.4838709677419355, | |
| "grad_norm": 0.4841958284378052, | |
| "learning_rate": 2e-05, | |
| "loss": 0.061, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.4879032258064516, | |
| "grad_norm": 0.8846139311790466, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0747, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.49193548387096775, | |
| "grad_norm": 0.5449007153511047, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0711, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.4959677419354839, | |
| "grad_norm": 0.767926037311554, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0736, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.5696377158164978, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0671, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.5040322580645161, | |
| "grad_norm": 0.6430863738059998, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0679, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.5080645161290323, | |
| "grad_norm": 0.7779257893562317, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0713, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.5120967741935484, | |
| "grad_norm": 0.7092922329902649, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0765, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.5161290322580645, | |
| "grad_norm": 0.5975173711776733, | |
| "learning_rate": 2e-05, | |
| "loss": 0.066, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.5201612903225806, | |
| "grad_norm": 0.5376009941101074, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0642, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.5241935483870968, | |
| "grad_norm": 0.4406221807003021, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0594, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.5282258064516129, | |
| "grad_norm": 0.530074954032898, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0731, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.532258064516129, | |
| "grad_norm": 0.5786536335945129, | |
| "learning_rate": 2e-05, | |
| "loss": 0.06, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.5362903225806451, | |
| "grad_norm": 0.5356053113937378, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0659, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.5403225806451613, | |
| "grad_norm": 0.3962647318840027, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0618, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.5443548387096774, | |
| "grad_norm": 0.3608771860599518, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0643, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.5483870967741935, | |
| "grad_norm": 0.5634734034538269, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0615, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.5524193548387096, | |
| "grad_norm": 0.5571008324623108, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0663, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.5564516129032258, | |
| "grad_norm": 0.5018740296363831, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0613, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.5604838709677419, | |
| "grad_norm": 0.664557933807373, | |
| "learning_rate": 2e-05, | |
| "loss": 0.067, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.5645161290322581, | |
| "grad_norm": 0.6537980437278748, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0593, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.5685483870967742, | |
| "grad_norm": 0.8715218901634216, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0693, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.5725806451612904, | |
| "grad_norm": 0.5582900047302246, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0605, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.5766129032258065, | |
| "grad_norm": 0.4657461941242218, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0594, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.5806451612903226, | |
| "grad_norm": 0.5373775959014893, | |
| "learning_rate": 2e-05, | |
| "loss": 0.07, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.5846774193548387, | |
| "grad_norm": 0.4283169209957123, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0536, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.5887096774193549, | |
| "grad_norm": 0.6403968930244446, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0667, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.592741935483871, | |
| "grad_norm": 0.32464203238487244, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0548, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.5967741935483871, | |
| "grad_norm": 0.648133397102356, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0612, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.6008064516129032, | |
| "grad_norm": 0.47770267724990845, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0544, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.6048387096774194, | |
| "grad_norm": 0.9105427861213684, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0684, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.6088709677419355, | |
| "grad_norm": 0.6342010498046875, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0601, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.6129032258064516, | |
| "grad_norm": 0.8317110538482666, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0584, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.6169354838709677, | |
| "grad_norm": 0.57545405626297, | |
| "learning_rate": 2e-05, | |
| "loss": 0.059, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.6209677419354839, | |
| "grad_norm": 0.46788084506988525, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0552, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.625, | |
| "grad_norm": 0.5528416633605957, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0579, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.6290322580645161, | |
| "grad_norm": 0.45801204442977905, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0539, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.6330645161290323, | |
| "grad_norm": 0.47493261098861694, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0585, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.6370967741935484, | |
| "grad_norm": 0.46749451756477356, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0537, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.6411290322580645, | |
| "grad_norm": 0.5712094306945801, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0636, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.6451612903225806, | |
| "grad_norm": 0.474437952041626, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0539, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.6491935483870968, | |
| "grad_norm": 0.5955020785331726, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0686, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.6532258064516129, | |
| "grad_norm": 0.5444841980934143, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0514, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.657258064516129, | |
| "grad_norm": 0.585702657699585, | |
| "learning_rate": 2e-05, | |
| "loss": 0.057, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.6612903225806451, | |
| "grad_norm": 0.6098143458366394, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0624, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.6653225806451613, | |
| "grad_norm": 0.5105492472648621, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0524, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.6693548387096774, | |
| "grad_norm": 0.3543269634246826, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0427, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.6733870967741935, | |
| "grad_norm": 0.40186411142349243, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0513, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.6774193548387096, | |
| "grad_norm": 0.4863409101963043, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0615, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.6814516129032258, | |
| "grad_norm": 0.35418546199798584, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0534, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.6854838709677419, | |
| "grad_norm": 0.4265013039112091, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0424, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.6895161290322581, | |
| "grad_norm": 0.4792309105396271, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0534, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.6935483870967742, | |
| "grad_norm": 0.9275990724563599, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0605, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.6975806451612904, | |
| "grad_norm": 0.5802022218704224, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0541, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.7016129032258065, | |
| "grad_norm": 0.8620706796646118, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0617, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.7056451612903226, | |
| "grad_norm": 0.6036432981491089, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0626, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.7096774193548387, | |
| "grad_norm": 0.5247609615325928, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0553, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.7137096774193549, | |
| "grad_norm": 0.5166157484054565, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0549, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.717741935483871, | |
| "grad_norm": 0.4395121932029724, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0526, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.7217741935483871, | |
| "grad_norm": 0.47025758028030396, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0558, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.7258064516129032, | |
| "grad_norm": 0.5386791229248047, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0539, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.7298387096774194, | |
| "grad_norm": 0.5612148642539978, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0591, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.7338709677419355, | |
| "grad_norm": 0.4585655927658081, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0546, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.7379032258064516, | |
| "grad_norm": 0.5998373627662659, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0583, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.7419354838709677, | |
| "grad_norm": 0.38647782802581787, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0494, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.7459677419354839, | |
| "grad_norm": 0.567383348941803, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0487, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.5236309766769409, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0552, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.7540322580645161, | |
| "grad_norm": 0.3990425765514374, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0512, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.7580645161290323, | |
| "grad_norm": 0.5519928336143494, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0449, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.7620967741935484, | |
| "grad_norm": 0.43356701731681824, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0413, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.7661290322580645, | |
| "grad_norm": 0.46121910214424133, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0441, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.7701612903225806, | |
| "grad_norm": 0.5286686420440674, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0506, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.7741935483870968, | |
| "grad_norm": 0.6215876340866089, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0538, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.7782258064516129, | |
| "grad_norm": 0.7031762003898621, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0507, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.782258064516129, | |
| "grad_norm": 0.4998103678226471, | |
| "learning_rate": 2e-05, | |
| "loss": 0.055, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.7862903225806451, | |
| "grad_norm": 0.4593054950237274, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0468, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.7903225806451613, | |
| "grad_norm": 0.6475517749786377, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0559, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.7943548387096774, | |
| "grad_norm": 0.523537278175354, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0464, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.7983870967741935, | |
| "grad_norm": 0.6223071813583374, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0464, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.8024193548387096, | |
| "grad_norm": 0.40836507081985474, | |
| "learning_rate": 2e-05, | |
| "loss": 0.049, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.8064516129032258, | |
| "grad_norm": 0.6119136810302734, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0536, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.8104838709677419, | |
| "grad_norm": 0.4265545904636383, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0502, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.8145161290322581, | |
| "grad_norm": 0.44581177830696106, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0471, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.8185483870967742, | |
| "grad_norm": 0.4306443929672241, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0444, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.8225806451612904, | |
| "grad_norm": 0.402327299118042, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0415, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.8266129032258065, | |
| "grad_norm": 0.4216252863407135, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0465, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.8306451612903226, | |
| "grad_norm": 0.3738255202770233, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0415, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.8346774193548387, | |
| "grad_norm": 0.5387892723083496, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0543, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.8387096774193549, | |
| "grad_norm": 0.5584475994110107, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0457, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.842741935483871, | |
| "grad_norm": 0.5456405878067017, | |
| "learning_rate": 2e-05, | |
| "loss": 0.048, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.8467741935483871, | |
| "grad_norm": 0.5054622888565063, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0476, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.8508064516129032, | |
| "grad_norm": 0.41379377245903015, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0436, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.8548387096774194, | |
| "grad_norm": 0.3779892921447754, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0478, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.8588709677419355, | |
| "grad_norm": 0.4135122001171112, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0422, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.8629032258064516, | |
| "grad_norm": 0.5435640215873718, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0484, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.8669354838709677, | |
| "grad_norm": 0.5836952924728394, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0493, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.8709677419354839, | |
| "grad_norm": 0.4919867515563965, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0503, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.875, | |
| "grad_norm": 0.4889490008354187, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0475, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.8790322580645161, | |
| "grad_norm": 0.4471587538719177, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0381, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.8830645161290323, | |
| "grad_norm": 0.40294429659843445, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0438, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.8870967741935484, | |
| "grad_norm": 0.46678218245506287, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0442, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.8911290322580645, | |
| "grad_norm": 0.622652530670166, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0412, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.8951612903225806, | |
| "grad_norm": 0.41154831647872925, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0451, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.8991935483870968, | |
| "grad_norm": 0.36561766266822815, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0453, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.9032258064516129, | |
| "grad_norm": 0.619911789894104, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0481, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.907258064516129, | |
| "grad_norm": 0.543843686580658, | |
| "learning_rate": 2e-05, | |
| "loss": 0.043, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.9112903225806451, | |
| "grad_norm": 0.546393871307373, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0435, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.9153225806451613, | |
| "grad_norm": 0.3940606713294983, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0406, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.9193548387096774, | |
| "grad_norm": 0.31918397545814514, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0384, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.9233870967741935, | |
| "grad_norm": 0.35918116569519043, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0366, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.9274193548387096, | |
| "grad_norm": 0.39295467734336853, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0395, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.9314516129032258, | |
| "grad_norm": 0.34643733501434326, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0405, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.9354838709677419, | |
| "grad_norm": 0.3488601744174957, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0435, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.9395161290322581, | |
| "grad_norm": 0.4448557496070862, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0459, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.9435483870967742, | |
| "grad_norm": 0.4407562017440796, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0447, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.9475806451612904, | |
| "grad_norm": 0.5757035613059998, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0458, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.9516129032258065, | |
| "grad_norm": 0.29268836975097656, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0441, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.9556451612903226, | |
| "grad_norm": 0.39647752046585083, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0382, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.9596774193548387, | |
| "grad_norm": 0.4112660884857178, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0382, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.9637096774193549, | |
| "grad_norm": 0.4475345313549042, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0458, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.967741935483871, | |
| "grad_norm": 0.26978054642677307, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0419, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.9717741935483871, | |
| "grad_norm": 0.4117030203342438, | |
| "learning_rate": 2e-05, | |
| "loss": 0.043, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.9758064516129032, | |
| "grad_norm": 0.28733769059181213, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0387, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.9798387096774194, | |
| "grad_norm": 0.32847997546195984, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0407, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.9838709677419355, | |
| "grad_norm": 0.4303770661354065, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0417, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.9879032258064516, | |
| "grad_norm": 0.36009445786476135, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0391, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.9919354838709677, | |
| "grad_norm": 0.46317991614341736, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0477, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.9959677419354839, | |
| "grad_norm": 0.48081448674201965, | |
| "learning_rate": 2e-05, | |
| "loss": 0.035, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.3577556908130646, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0435, | |
| "step": 248 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 248, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 284755222462464.0, | |
| "train_batch_size": 48, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |