| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.2857142857142857, | |
| "eval_steps": 500, | |
| "global_step": 1000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 2.379179924726486, | |
| "epoch": 0.002857142857142857, | |
| "grad_norm": 0.248466357588768, | |
| "learning_rate": 5.142857142857143e-06, | |
| "loss": 2.1112184524536133, | |
| "mean_token_accuracy": 0.521992451697588, | |
| "num_tokens": 490221.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 2.275108999013901, | |
| "epoch": 0.005714285714285714, | |
| "grad_norm": 0.16482312977313995, | |
| "learning_rate": 1.0857142857142858e-05, | |
| "loss": 2.0270994186401365, | |
| "mean_token_accuracy": 0.5319361314177513, | |
| "num_tokens": 980654.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 2.1243143171072005, | |
| "epoch": 0.008571428571428572, | |
| "grad_norm": 0.14482936263084412, | |
| "learning_rate": 1.657142857142857e-05, | |
| "loss": 1.947333526611328, | |
| "mean_token_accuracy": 0.5449633926153183, | |
| "num_tokens": 1471155.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 2.0759591698646545, | |
| "epoch": 0.011428571428571429, | |
| "grad_norm": 0.13348551094532013, | |
| "learning_rate": 2.2285714285714287e-05, | |
| "loss": 1.8653924942016602, | |
| "mean_token_accuracy": 0.5575317889451981, | |
| "num_tokens": 1961803.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 2.002236345410347, | |
| "epoch": 0.014285714285714285, | |
| "grad_norm": 0.12573261559009552, | |
| "learning_rate": 2.8000000000000003e-05, | |
| "loss": 1.8171972274780273, | |
| "mean_token_accuracy": 0.5663816690444946, | |
| "num_tokens": 2451485.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.9370298832654953, | |
| "epoch": 0.017142857142857144, | |
| "grad_norm": 0.12988947331905365, | |
| "learning_rate": 3.3714285714285716e-05, | |
| "loss": 1.7517566680908203, | |
| "mean_token_accuracy": 0.5784050762653351, | |
| "num_tokens": 2940490.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 1.9337659150362014, | |
| "epoch": 0.02, | |
| "grad_norm": 0.1357634961605072, | |
| "learning_rate": 3.942857142857143e-05, | |
| "loss": 1.7462808609008789, | |
| "mean_token_accuracy": 0.578274418413639, | |
| "num_tokens": 3430891.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 1.853649941086769, | |
| "epoch": 0.022857142857142857, | |
| "grad_norm": 0.14244548976421356, | |
| "learning_rate": 4.514285714285714e-05, | |
| "loss": 1.675507164001465, | |
| "mean_token_accuracy": 0.5902435079216957, | |
| "num_tokens": 3921132.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 1.8723271518945694, | |
| "epoch": 0.025714285714285714, | |
| "grad_norm": 0.1481512039899826, | |
| "learning_rate": 5.085714285714286e-05, | |
| "loss": 1.7015415191650392, | |
| "mean_token_accuracy": 0.5883241996169091, | |
| "num_tokens": 4412262.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 1.8528674066066741, | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 0.15897610783576965, | |
| "learning_rate": 5.6571428571428574e-05, | |
| "loss": 1.6771810531616211, | |
| "mean_token_accuracy": 0.5901058956980705, | |
| "num_tokens": 4902976.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.8283027499914168, | |
| "epoch": 0.03142857142857143, | |
| "grad_norm": 0.15304645895957947, | |
| "learning_rate": 6.22857142857143e-05, | |
| "loss": 1.6524412155151367, | |
| "mean_token_accuracy": 0.5927876815199852, | |
| "num_tokens": 5393172.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.8105649381875992, | |
| "epoch": 0.03428571428571429, | |
| "grad_norm": 0.1467505544424057, | |
| "learning_rate": 6.800000000000001e-05, | |
| "loss": 1.6439619064331055, | |
| "mean_token_accuracy": 0.5953322932124138, | |
| "num_tokens": 5883816.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 1.8044609040021897, | |
| "epoch": 0.037142857142857144, | |
| "grad_norm": 0.14967863261699677, | |
| "learning_rate": 7.371428571428572e-05, | |
| "loss": 1.6340011596679687, | |
| "mean_token_accuracy": 0.5963180974125862, | |
| "num_tokens": 6373950.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 1.7858079195022583, | |
| "epoch": 0.04, | |
| "grad_norm": 0.16043871641159058, | |
| "learning_rate": 7.942857142857143e-05, | |
| "loss": 1.6211162567138673, | |
| "mean_token_accuracy": 0.5996491178870201, | |
| "num_tokens": 6864677.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 1.7554930090904235, | |
| "epoch": 0.04285714285714286, | |
| "grad_norm": 0.17714525759220123, | |
| "learning_rate": 8.514285714285714e-05, | |
| "loss": 1.5993239402770996, | |
| "mean_token_accuracy": 0.6028685554862022, | |
| "num_tokens": 7354704.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.7631021201610566, | |
| "epoch": 0.045714285714285714, | |
| "grad_norm": 0.1533629149198532, | |
| "learning_rate": 9.085714285714286e-05, | |
| "loss": 1.6020626068115233, | |
| "mean_token_accuracy": 0.6036920040845871, | |
| "num_tokens": 7844516.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.7572406560182572, | |
| "epoch": 0.04857142857142857, | |
| "grad_norm": 0.13726936280727386, | |
| "learning_rate": 9.657142857142858e-05, | |
| "loss": 1.6002904891967773, | |
| "mean_token_accuracy": 0.6049441516399383, | |
| "num_tokens": 8335216.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 1.7278330773115158, | |
| "epoch": 0.05142857142857143, | |
| "grad_norm": 0.1435551792383194, | |
| "learning_rate": 9.999964291145356e-05, | |
| "loss": 1.5778950691223144, | |
| "mean_token_accuracy": 0.6091306760907174, | |
| "num_tokens": 8825337.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 1.7155232012271882, | |
| "epoch": 0.054285714285714284, | |
| "grad_norm": 0.12991751730442047, | |
| "learning_rate": 9.99956257238817e-05, | |
| "loss": 1.5530893325805664, | |
| "mean_token_accuracy": 0.6109549820423126, | |
| "num_tokens": 9315436.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 1.7033455431461335, | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.12225023657083511, | |
| "learning_rate": 9.998714534787104e-05, | |
| "loss": 1.5497390747070312, | |
| "mean_token_accuracy": 0.6124480590224266, | |
| "num_tokens": 9806074.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.704897478222847, | |
| "epoch": 0.06, | |
| "grad_norm": 0.13096989691257477, | |
| "learning_rate": 9.997420254047814e-05, | |
| "loss": 1.5562244415283204, | |
| "mean_token_accuracy": 0.6124653443694115, | |
| "num_tokens": 10296321.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.687455916404724, | |
| "epoch": 0.06285714285714286, | |
| "grad_norm": 0.12014620006084442, | |
| "learning_rate": 9.995679845712782e-05, | |
| "loss": 1.5376283645629882, | |
| "mean_token_accuracy": 0.612839862704277, | |
| "num_tokens": 10786325.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.6887607038021089, | |
| "epoch": 0.06571428571428571, | |
| "grad_norm": 0.1178581714630127, | |
| "learning_rate": 9.99349346515101e-05, | |
| "loss": 1.5371188163757323, | |
| "mean_token_accuracy": 0.6150592401623726, | |
| "num_tokens": 11276505.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.6583371013402939, | |
| "epoch": 0.06857142857142857, | |
| "grad_norm": 0.12618421018123627, | |
| "learning_rate": 9.990861307544141e-05, | |
| "loss": 1.512394905090332, | |
| "mean_token_accuracy": 0.6184149146080017, | |
| "num_tokens": 11767090.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.6633832812309266, | |
| "epoch": 0.07142857142857142, | |
| "grad_norm": 0.1347818523645401, | |
| "learning_rate": 9.987783607869043e-05, | |
| "loss": 1.5205227851867675, | |
| "mean_token_accuracy": 0.6188683018088341, | |
| "num_tokens": 12256334.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.6662334859371186, | |
| "epoch": 0.07428571428571429, | |
| "grad_norm": 0.12107276916503906, | |
| "learning_rate": 9.984260640876821e-05, | |
| "loss": 1.5210847854614258, | |
| "mean_token_accuracy": 0.6181401565670968, | |
| "num_tokens": 12746242.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.669426190853119, | |
| "epoch": 0.07714285714285714, | |
| "grad_norm": 0.13238908350467682, | |
| "learning_rate": 9.980292721068303e-05, | |
| "loss": 1.5255352020263673, | |
| "mean_token_accuracy": 0.6176877498626709, | |
| "num_tokens": 13236893.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 1.6480093061923982, | |
| "epoch": 0.08, | |
| "grad_norm": 0.119101881980896, | |
| "learning_rate": 9.975880202665955e-05, | |
| "loss": 1.5022310256958007, | |
| "mean_token_accuracy": 0.6195305570960045, | |
| "num_tokens": 13727057.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 1.6660549819469452, | |
| "epoch": 0.08285714285714285, | |
| "grad_norm": 0.12110637873411179, | |
| "learning_rate": 9.971023479582257e-05, | |
| "loss": 1.519071388244629, | |
| "mean_token_accuracy": 0.6170884057879448, | |
| "num_tokens": 14217233.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 1.65420058965683, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.1198863536119461, | |
| "learning_rate": 9.965722985384551e-05, | |
| "loss": 1.5074204444885253, | |
| "mean_token_accuracy": 0.6207710400223732, | |
| "num_tokens": 14706536.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.653561717271805, | |
| "epoch": 0.08857142857142856, | |
| "grad_norm": 0.11553779989480972, | |
| "learning_rate": 9.959979193256321e-05, | |
| "loss": 1.503816509246826, | |
| "mean_token_accuracy": 0.620441184937954, | |
| "num_tokens": 15196956.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 1.6494231253862381, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.12458748370409012, | |
| "learning_rate": 9.953792615954956e-05, | |
| "loss": 1.5046157836914062, | |
| "mean_token_accuracy": 0.6211020454764367, | |
| "num_tokens": 15687053.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 1.6357726126909256, | |
| "epoch": 0.09428571428571429, | |
| "grad_norm": 0.1191626638174057, | |
| "learning_rate": 9.94716380576598e-05, | |
| "loss": 1.4901004791259767, | |
| "mean_token_accuracy": 0.623300464451313, | |
| "num_tokens": 16177605.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 1.6310274928808213, | |
| "epoch": 0.09714285714285714, | |
| "grad_norm": 0.1165379136800766, | |
| "learning_rate": 9.940093354453745e-05, | |
| "loss": 1.4859237670898438, | |
| "mean_token_accuracy": 0.6221878513693809, | |
| "num_tokens": 16668078.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 1.6244249641895294, | |
| "epoch": 0.1, | |
| "grad_norm": 0.11486439406871796, | |
| "learning_rate": 9.932581893208602e-05, | |
| "loss": 1.4822301864624023, | |
| "mean_token_accuracy": 0.6249096423387528, | |
| "num_tokens": 17158239.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.6261575251817704, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.1269659847021103, | |
| "learning_rate": 9.924630092590552e-05, | |
| "loss": 1.4802236557006836, | |
| "mean_token_accuracy": 0.6244122132658958, | |
| "num_tokens": 17648337.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 1.6262814104557037, | |
| "epoch": 0.10571428571428572, | |
| "grad_norm": 0.12204760313034058, | |
| "learning_rate": 9.916238662469393e-05, | |
| "loss": 1.4841039657592774, | |
| "mean_token_accuracy": 0.62349953353405, | |
| "num_tokens": 18139347.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 1.6084025710821153, | |
| "epoch": 0.10857142857142857, | |
| "grad_norm": 0.11636750400066376, | |
| "learning_rate": 9.907408351961338e-05, | |
| "loss": 1.4650284767150878, | |
| "mean_token_accuracy": 0.6257219180464745, | |
| "num_tokens": 18629841.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 1.6091126441955566, | |
| "epoch": 0.11142857142857143, | |
| "grad_norm": 0.12340579181909561, | |
| "learning_rate": 9.89813994936215e-05, | |
| "loss": 1.4654415130615235, | |
| "mean_token_accuracy": 0.627759400010109, | |
| "num_tokens": 19119573.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 1.620647069811821, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.1143694818019867, | |
| "learning_rate": 9.888434282076758e-05, | |
| "loss": 1.4784412384033203, | |
| "mean_token_accuracy": 0.6251529678702354, | |
| "num_tokens": 19609952.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.6150473326444625, | |
| "epoch": 0.11714285714285715, | |
| "grad_norm": 0.11208155006170273, | |
| "learning_rate": 9.878292216545406e-05, | |
| "loss": 1.4707134246826172, | |
| "mean_token_accuracy": 0.6262766167521476, | |
| "num_tokens": 20100484.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.6126654744148254, | |
| "epoch": 0.12, | |
| "grad_norm": 0.11389093846082687, | |
| "learning_rate": 9.867714658166294e-05, | |
| "loss": 1.4687725067138673, | |
| "mean_token_accuracy": 0.6262835577130318, | |
| "num_tokens": 20591061.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 1.6087138205766678, | |
| "epoch": 0.12285714285714286, | |
| "grad_norm": 0.11683399975299835, | |
| "learning_rate": 9.856702551214758e-05, | |
| "loss": 1.4735491752624512, | |
| "mean_token_accuracy": 0.6256266921758652, | |
| "num_tokens": 21081884.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 1.6129542291164398, | |
| "epoch": 0.12571428571428572, | |
| "grad_norm": 0.1123637780547142, | |
| "learning_rate": 9.84525687875897e-05, | |
| "loss": 1.4666884422302247, | |
| "mean_token_accuracy": 0.6269908130168915, | |
| "num_tokens": 21572985.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 1.5876862674951553, | |
| "epoch": 0.12857142857142856, | |
| "grad_norm": 0.11302825808525085, | |
| "learning_rate": 9.833378662572183e-05, | |
| "loss": 1.44884033203125, | |
| "mean_token_accuracy": 0.6304112330079079, | |
| "num_tokens": 22062380.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.6038869142532348, | |
| "epoch": 0.13142857142857142, | |
| "grad_norm": 0.11883237957954407, | |
| "learning_rate": 9.821068963041507e-05, | |
| "loss": 1.4640923500061036, | |
| "mean_token_accuracy": 0.6268730536103249, | |
| "num_tokens": 22553395.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.616351565718651, | |
| "epoch": 0.13428571428571429, | |
| "grad_norm": 0.1148705706000328, | |
| "learning_rate": 9.808328879073251e-05, | |
| "loss": 1.4773920059204102, | |
| "mean_token_accuracy": 0.6249542534351349, | |
| "num_tokens": 23042805.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 1.584571686387062, | |
| "epoch": 0.13714285714285715, | |
| "grad_norm": 0.12227737158536911, | |
| "learning_rate": 9.79515954799483e-05, | |
| "loss": 1.4490341186523437, | |
| "mean_token_accuracy": 0.6293571904301644, | |
| "num_tokens": 23532109.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 1.5974321156740188, | |
| "epoch": 0.14, | |
| "grad_norm": 0.11681197583675385, | |
| "learning_rate": 9.781562145453212e-05, | |
| "loss": 1.458102035522461, | |
| "mean_token_accuracy": 0.6288253426551819, | |
| "num_tokens": 24022475.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 1.5929092824459077, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.11527484655380249, | |
| "learning_rate": 9.767537885309996e-05, | |
| "loss": 1.4527276039123536, | |
| "mean_token_accuracy": 0.6284167483448982, | |
| "num_tokens": 24512984.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 1.5956292569637298, | |
| "epoch": 0.1457142857142857, | |
| "grad_norm": 0.11178956925868988, | |
| "learning_rate": 9.75308801953302e-05, | |
| "loss": 1.4558266639709472, | |
| "mean_token_accuracy": 0.628529217839241, | |
| "num_tokens": 25003761.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 1.5775458842515946, | |
| "epoch": 0.14857142857142858, | |
| "grad_norm": 0.11768428981304169, | |
| "learning_rate": 9.738213838084621e-05, | |
| "loss": 1.4407743453979491, | |
| "mean_token_accuracy": 0.6300915256142616, | |
| "num_tokens": 25493579.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 1.5778144687414168, | |
| "epoch": 0.15142857142857144, | |
| "grad_norm": 0.11278413981199265, | |
| "learning_rate": 9.722916668806454e-05, | |
| "loss": 1.4403908729553223, | |
| "mean_token_accuracy": 0.6314086809754371, | |
| "num_tokens": 25984228.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 1.570985835790634, | |
| "epoch": 0.15428571428571428, | |
| "grad_norm": 0.1122731938958168, | |
| "learning_rate": 9.707197877300974e-05, | |
| "loss": 1.426032543182373, | |
| "mean_token_accuracy": 0.6325461342930794, | |
| "num_tokens": 26473677.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 1.5796766877174377, | |
| "epoch": 0.15714285714285714, | |
| "grad_norm": 0.11975421756505966, | |
| "learning_rate": 9.691058866809514e-05, | |
| "loss": 1.4357484817504882, | |
| "mean_token_accuracy": 0.6324031576514244, | |
| "num_tokens": 26964375.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 1.5689537853002549, | |
| "epoch": 0.16, | |
| "grad_norm": 0.11165549606084824, | |
| "learning_rate": 9.674501078087018e-05, | |
| "loss": 1.4325990676879883, | |
| "mean_token_accuracy": 0.6315152242779731, | |
| "num_tokens": 27454872.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 1.5615802317857743, | |
| "epoch": 0.16285714285714287, | |
| "grad_norm": 0.11403413116931915, | |
| "learning_rate": 9.657525989273428e-05, | |
| "loss": 1.424344253540039, | |
| "mean_token_accuracy": 0.6343739420175553, | |
| "num_tokens": 27945254.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 1.568941941857338, | |
| "epoch": 0.1657142857142857, | |
| "grad_norm": 0.11258367449045181, | |
| "learning_rate": 9.640135115761721e-05, | |
| "loss": 1.4347594261169434, | |
| "mean_token_accuracy": 0.6329293712973595, | |
| "num_tokens": 28435475.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 1.5669189006090165, | |
| "epoch": 0.16857142857142857, | |
| "grad_norm": 0.1104324460029602, | |
| "learning_rate": 9.622330010062632e-05, | |
| "loss": 1.4304702758789063, | |
| "mean_token_accuracy": 0.6331642985343933, | |
| "num_tokens": 28925167.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 1.574634400010109, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.11263342946767807, | |
| "learning_rate": 9.604112261666054e-05, | |
| "loss": 1.436413288116455, | |
| "mean_token_accuracy": 0.6318994089961052, | |
| "num_tokens": 29415502.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 1.56950443983078, | |
| "epoch": 0.1742857142857143, | |
| "grad_norm": 0.11262984573841095, | |
| "learning_rate": 9.58548349689915e-05, | |
| "loss": 1.424701976776123, | |
| "mean_token_accuracy": 0.6350988477468491, | |
| "num_tokens": 29906208.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 1.5514621257781982, | |
| "epoch": 0.17714285714285713, | |
| "grad_norm": 0.11094693094491959, | |
| "learning_rate": 9.566445378781162e-05, | |
| "loss": 1.4180520057678223, | |
| "mean_token_accuracy": 0.6351837337017059, | |
| "num_tokens": 30396946.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 1.5565964162349701, | |
| "epoch": 0.18, | |
| "grad_norm": 0.110066719353199, | |
| "learning_rate": 9.546999606874947e-05, | |
| "loss": 1.4229997634887694, | |
| "mean_token_accuracy": 0.6346639677882194, | |
| "num_tokens": 30887292.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 1.5571098238229752, | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 0.11187391728162766, | |
| "learning_rate": 9.527147917135265e-05, | |
| "loss": 1.4242905616760253, | |
| "mean_token_accuracy": 0.634743233025074, | |
| "num_tokens": 31377366.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 1.5575890451669694, | |
| "epoch": 0.18571428571428572, | |
| "grad_norm": 0.12017875164747238, | |
| "learning_rate": 9.5068920817538e-05, | |
| "loss": 1.4210159301757812, | |
| "mean_token_accuracy": 0.6346415132284164, | |
| "num_tokens": 31867728.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 1.5610258907079697, | |
| "epoch": 0.18857142857142858, | |
| "grad_norm": 0.11284279823303223, | |
| "learning_rate": 9.486233909000957e-05, | |
| "loss": 1.4286124229431152, | |
| "mean_token_accuracy": 0.633992238342762, | |
| "num_tokens": 32357951.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 1.570546704530716, | |
| "epoch": 0.19142857142857142, | |
| "grad_norm": 0.11221849918365479, | |
| "learning_rate": 9.465175243064428e-05, | |
| "loss": 1.427887535095215, | |
| "mean_token_accuracy": 0.6341994881629944, | |
| "num_tokens": 32848000.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 1.5625760078430175, | |
| "epoch": 0.19428571428571428, | |
| "grad_norm": 0.11504076421260834, | |
| "learning_rate": 9.443717963884569e-05, | |
| "loss": 1.4254456520080567, | |
| "mean_token_accuracy": 0.6330446928739548, | |
| "num_tokens": 33338988.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 1.5441237717866898, | |
| "epoch": 0.19714285714285715, | |
| "grad_norm": 0.11827517300844193, | |
| "learning_rate": 9.42186398698657e-05, | |
| "loss": 1.409364604949951, | |
| "mean_token_accuracy": 0.6384268119931221, | |
| "num_tokens": 33829458.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 1.5519202262163163, | |
| "epoch": 0.2, | |
| "grad_norm": 0.11421947181224823, | |
| "learning_rate": 9.399615263309448e-05, | |
| "loss": 1.4179234504699707, | |
| "mean_token_accuracy": 0.6360488340258599, | |
| "num_tokens": 34319687.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 1.5406184524297715, | |
| "epoch": 0.20285714285714285, | |
| "grad_norm": 0.11246901750564575, | |
| "learning_rate": 9.37697377903189e-05, | |
| "loss": 1.4044339179992675, | |
| "mean_token_accuracy": 0.6360796883702278, | |
| "num_tokens": 34810861.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 1.5405683517456055, | |
| "epoch": 0.2057142857142857, | |
| "grad_norm": 0.10953953862190247, | |
| "learning_rate": 9.353941555394946e-05, | |
| "loss": 1.4075803756713867, | |
| "mean_token_accuracy": 0.6373428180813789, | |
| "num_tokens": 35301651.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 1.5303823828697205, | |
| "epoch": 0.20857142857142857, | |
| "grad_norm": 0.11503418534994125, | |
| "learning_rate": 9.330520648521581e-05, | |
| "loss": 1.3944478034973145, | |
| "mean_token_accuracy": 0.6395626902580261, | |
| "num_tokens": 35792660.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 1.546586701273918, | |
| "epoch": 0.21142857142857144, | |
| "grad_norm": 0.11355750262737274, | |
| "learning_rate": 9.306713149233126e-05, | |
| "loss": 1.4133389472961426, | |
| "mean_token_accuracy": 0.6359729379415512, | |
| "num_tokens": 36283021.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 1.5666495472192765, | |
| "epoch": 0.21428571428571427, | |
| "grad_norm": 0.11290405690670013, | |
| "learning_rate": 9.282521182862629e-05, | |
| "loss": 1.4316218376159668, | |
| "mean_token_accuracy": 0.6324604853987694, | |
| "num_tokens": 36773401.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 1.5476279586553574, | |
| "epoch": 0.21714285714285714, | |
| "grad_norm": 0.11210077255964279, | |
| "learning_rate": 9.25794690906512e-05, | |
| "loss": 1.4119671821594237, | |
| "mean_token_accuracy": 0.6370165795087814, | |
| "num_tokens": 37263273.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 1.5502693444490432, | |
| "epoch": 0.22, | |
| "grad_norm": 0.1148453801870346, | |
| "learning_rate": 9.23299252162482e-05, | |
| "loss": 1.4147866249084473, | |
| "mean_token_accuracy": 0.636229844391346, | |
| "num_tokens": 37753396.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 1.5336053550243378, | |
| "epoch": 0.22285714285714286, | |
| "grad_norm": 0.12296847254037857, | |
| "learning_rate": 9.20766024825929e-05, | |
| "loss": 1.4014326095581056, | |
| "mean_token_accuracy": 0.6389749109745025, | |
| "num_tokens": 38243813.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 1.5418373078107834, | |
| "epoch": 0.2257142857142857, | |
| "grad_norm": 0.11040552705526352, | |
| "learning_rate": 9.181952350420568e-05, | |
| "loss": 1.408182144165039, | |
| "mean_token_accuracy": 0.6375694692134857, | |
| "num_tokens": 38734087.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 1.540384903550148, | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 0.11459089070558548, | |
| "learning_rate": 9.15587112309328e-05, | |
| "loss": 1.4076971054077148, | |
| "mean_token_accuracy": 0.6367619827389717, | |
| "num_tokens": 39223587.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 1.5353645473718642, | |
| "epoch": 0.23142857142857143, | |
| "grad_norm": 0.10925675183534622, | |
| "learning_rate": 9.129418894589765e-05, | |
| "loss": 1.4029156684875488, | |
| "mean_token_accuracy": 0.6391762167215347, | |
| "num_tokens": 39713378.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 1.537180870771408, | |
| "epoch": 0.2342857142857143, | |
| "grad_norm": 0.11116520315408707, | |
| "learning_rate": 9.102598026342224e-05, | |
| "loss": 1.4027939796447755, | |
| "mean_token_accuracy": 0.6384590819478035, | |
| "num_tokens": 40204035.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 1.5364642143249512, | |
| "epoch": 0.23714285714285716, | |
| "grad_norm": 0.11325648427009583, | |
| "learning_rate": 9.075410912691907e-05, | |
| "loss": 1.4042522430419921, | |
| "mean_token_accuracy": 0.6376483544707299, | |
| "num_tokens": 40693853.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 1.5172248750925064, | |
| "epoch": 0.24, | |
| "grad_norm": 0.11164099723100662, | |
| "learning_rate": 9.04785998067537e-05, | |
| "loss": 1.38507022857666, | |
| "mean_token_accuracy": 0.6411049708724021, | |
| "num_tokens": 41184741.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 1.5312542200088501, | |
| "epoch": 0.24285714285714285, | |
| "grad_norm": 0.11361584812402725, | |
| "learning_rate": 9.019947689807812e-05, | |
| "loss": 1.401794147491455, | |
| "mean_token_accuracy": 0.6388043731451034, | |
| "num_tokens": 41675456.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 1.5356624484062196, | |
| "epoch": 0.24571428571428572, | |
| "grad_norm": 0.1110292598605156, | |
| "learning_rate": 8.991676531863508e-05, | |
| "loss": 1.398195171356201, | |
| "mean_token_accuracy": 0.6395324870944024, | |
| "num_tokens": 42165958.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 1.5131745964288712, | |
| "epoch": 0.24857142857142858, | |
| "grad_norm": 0.11180949956178665, | |
| "learning_rate": 8.963049030653357e-05, | |
| "loss": 1.386609935760498, | |
| "mean_token_accuracy": 0.6415100902318954, | |
| "num_tokens": 42655563.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 1.5264964669942855, | |
| "epoch": 0.25142857142857145, | |
| "grad_norm": 0.11161798238754272, | |
| "learning_rate": 8.934067741799588e-05, | |
| "loss": 1.3911283493041993, | |
| "mean_token_accuracy": 0.6405024617910385, | |
| "num_tokens": 43145690.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 1.5238522559404373, | |
| "epoch": 0.2542857142857143, | |
| "grad_norm": 0.11398132890462875, | |
| "learning_rate": 8.90473525250761e-05, | |
| "loss": 1.3967710494995118, | |
| "mean_token_accuracy": 0.6410051316022873, | |
| "num_tokens": 43636186.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 1.5147892564535141, | |
| "epoch": 0.2571428571428571, | |
| "grad_norm": 0.1122528463602066, | |
| "learning_rate": 8.875054181335054e-05, | |
| "loss": 1.3884020805358888, | |
| "mean_token_accuracy": 0.6408766448497772, | |
| "num_tokens": 44126324.0, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 1.51045164167881, | |
| "epoch": 0.26, | |
| "grad_norm": 0.11045881360769272, | |
| "learning_rate": 8.845027177958e-05, | |
| "loss": 1.3785322189331055, | |
| "mean_token_accuracy": 0.6415310442447663, | |
| "num_tokens": 44616075.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 1.5210172832012177, | |
| "epoch": 0.26285714285714284, | |
| "grad_norm": 0.10973014682531357, | |
| "learning_rate": 8.814656922934445e-05, | |
| "loss": 1.3884805679321288, | |
| "mean_token_accuracy": 0.6404986321926117, | |
| "num_tokens": 45106497.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 1.5296362102031709, | |
| "epoch": 0.26571428571428574, | |
| "grad_norm": 0.11496967822313309, | |
| "learning_rate": 8.783946127465001e-05, | |
| "loss": 1.3945645332336425, | |
| "mean_token_accuracy": 0.6402764439582824, | |
| "num_tokens": 45597040.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 1.5171466410160064, | |
| "epoch": 0.26857142857142857, | |
| "grad_norm": 0.11463408172130585, | |
| "learning_rate": 8.752897533150868e-05, | |
| "loss": 1.387949275970459, | |
| "mean_token_accuracy": 0.6412890180945396, | |
| "num_tokens": 46087347.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 1.5124164909124374, | |
| "epoch": 0.2714285714285714, | |
| "grad_norm": 0.11042758822441101, | |
| "learning_rate": 8.721513911749073e-05, | |
| "loss": 1.3821532249450683, | |
| "mean_token_accuracy": 0.6414273202419281, | |
| "num_tokens": 46577723.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 1.5339836180210114, | |
| "epoch": 0.2742857142857143, | |
| "grad_norm": 0.115251325070858, | |
| "learning_rate": 8.689798064925049e-05, | |
| "loss": 1.4023088455200194, | |
| "mean_token_accuracy": 0.6390077367424964, | |
| "num_tokens": 47067566.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 1.4907908529043197, | |
| "epoch": 0.27714285714285714, | |
| "grad_norm": 0.10888004302978516, | |
| "learning_rate": 8.657752824002512e-05, | |
| "loss": 1.3607192993164063, | |
| "mean_token_accuracy": 0.6449671313166618, | |
| "num_tokens": 47557094.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 1.5133179754018784, | |
| "epoch": 0.28, | |
| "grad_norm": 0.10961966216564178, | |
| "learning_rate": 8.625381049710711e-05, | |
| "loss": 1.387069320678711, | |
| "mean_token_accuracy": 0.6418288230895997, | |
| "num_tokens": 48047288.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 1.5235115170478821, | |
| "epoch": 0.28285714285714286, | |
| "grad_norm": 0.11490974575281143, | |
| "learning_rate": 8.592685631929041e-05, | |
| "loss": 1.3894145011901855, | |
| "mean_token_accuracy": 0.6398768693208694, | |
| "num_tokens": 48537707.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 1.5072008579969407, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.11175917088985443, | |
| "learning_rate": 8.55966948942907e-05, | |
| "loss": 1.3789593696594238, | |
| "mean_token_accuracy": 0.6438135176897049, | |
| "num_tokens": 49027502.0, | |
| "step": 1000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9223372036854775807, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.73130249895936e+17, | |
| "train_batch_size": 12, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |