Instructions to use rovdetection/code-1b-instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use rovdetection/code-1b-instruct with Transformers:
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("rovdetection/code-1b-instruct", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 8.591446378680422, | |
| "eval_steps": 500, | |
| "global_step": 5000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.4190640166401862, | |
| "epoch": 0.017193208682570384, | |
| "grad_norm": 0.44122377038002014, | |
| "learning_rate": 0.00019964, | |
| "loss": 4.624512481689453, | |
| "mean_token_accuracy": 0.48470579609274866, | |
| "num_tokens": 59125.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.3190652154386044, | |
| "epoch": 0.03438641736514077, | |
| "grad_norm": 0.5728761553764343, | |
| "learning_rate": 0.00019924, | |
| "loss": 4.138050842285156, | |
| "mean_token_accuracy": 0.5170843195170164, | |
| "num_tokens": 121732.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.899017508327961, | |
| "epoch": 0.05157962604771115, | |
| "grad_norm": 0.5385442972183228, | |
| "learning_rate": 0.00019884000000000001, | |
| "loss": 3.410753631591797, | |
| "mean_token_accuracy": 0.5273028708994388, | |
| "num_tokens": 180496.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 2.605507677793503, | |
| "epoch": 0.06877283473028153, | |
| "grad_norm": 0.39409318566322327, | |
| "learning_rate": 0.00019844, | |
| "loss": 3.210155487060547, | |
| "mean_token_accuracy": 0.5257176972925663, | |
| "num_tokens": 239996.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 2.37664245814085, | |
| "epoch": 0.08596604341285193, | |
| "grad_norm": 0.32786861062049866, | |
| "learning_rate": 0.00019804, | |
| "loss": 2.821852111816406, | |
| "mean_token_accuracy": 0.5590792961418629, | |
| "num_tokens": 298043.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 2.395954003930092, | |
| "epoch": 0.1031592520954223, | |
| "grad_norm": 0.3047294616699219, | |
| "learning_rate": 0.00019764, | |
| "loss": 2.7597124099731447, | |
| "mean_token_accuracy": 0.5563199404627085, | |
| "num_tokens": 355876.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 2.411456014215946, | |
| "epoch": 0.12035246077799269, | |
| "grad_norm": 0.3513432741165161, | |
| "learning_rate": 0.00019724, | |
| "loss": 2.7809066772460938, | |
| "mean_token_accuracy": 0.5538371551781893, | |
| "num_tokens": 415356.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 2.396346817910671, | |
| "epoch": 0.13754566946056307, | |
| "grad_norm": 0.32153311371803284, | |
| "learning_rate": 0.00019684, | |
| "loss": 2.673196029663086, | |
| "mean_token_accuracy": 0.5653545051813126, | |
| "num_tokens": 472895.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 2.4005219876766204, | |
| "epoch": 0.15473887814313347, | |
| "grad_norm": 0.37381669878959656, | |
| "learning_rate": 0.00019644, | |
| "loss": 2.6783329010009767, | |
| "mean_token_accuracy": 0.565613779053092, | |
| "num_tokens": 531787.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 2.291259491443634, | |
| "epoch": 0.17193208682570385, | |
| "grad_norm": 0.3424990475177765, | |
| "learning_rate": 0.00019604, | |
| "loss": 2.518219757080078, | |
| "mean_token_accuracy": 0.5815062165260315, | |
| "num_tokens": 591947.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 2.3215643003582955, | |
| "epoch": 0.18912529550827423, | |
| "grad_norm": 0.37565672397613525, | |
| "learning_rate": 0.00019564, | |
| "loss": 2.5377925872802733, | |
| "mean_token_accuracy": 0.5835155539214611, | |
| "num_tokens": 649619.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 2.4258039399981497, | |
| "epoch": 0.2063185041908446, | |
| "grad_norm": 0.4689179062843323, | |
| "learning_rate": 0.00019524, | |
| "loss": 2.6080394744873048, | |
| "mean_token_accuracy": 0.5646311499178409, | |
| "num_tokens": 707340.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 2.317111870646477, | |
| "epoch": 0.223511712873415, | |
| "grad_norm": 0.3885783851146698, | |
| "learning_rate": 0.00019484, | |
| "loss": 2.477190399169922, | |
| "mean_token_accuracy": 0.5836748830974102, | |
| "num_tokens": 769505.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 2.3055995970964434, | |
| "epoch": 0.24070492155598538, | |
| "grad_norm": 0.43443477153778076, | |
| "learning_rate": 0.00019444, | |
| "loss": 2.4609752655029298, | |
| "mean_token_accuracy": 0.5811029966920614, | |
| "num_tokens": 829367.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 2.2776345878839495, | |
| "epoch": 0.2578981302385558, | |
| "grad_norm": 0.47762158513069153, | |
| "learning_rate": 0.00019404, | |
| "loss": 2.4081003189086916, | |
| "mean_token_accuracy": 0.5853929311037064, | |
| "num_tokens": 886215.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 2.2963428094983103, | |
| "epoch": 0.27509133892112614, | |
| "grad_norm": 0.46731501817703247, | |
| "learning_rate": 0.00019364, | |
| "loss": 2.4675086975097655, | |
| "mean_token_accuracy": 0.5888176921755075, | |
| "num_tokens": 945101.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 2.2366374909877775, | |
| "epoch": 0.29228454760369654, | |
| "grad_norm": 0.4367876350879669, | |
| "learning_rate": 0.00019323999999999999, | |
| "loss": 2.3875659942626952, | |
| "mean_token_accuracy": 0.5896096613258124, | |
| "num_tokens": 1004603.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 2.254362703859806, | |
| "epoch": 0.30947775628626695, | |
| "grad_norm": 0.4333887994289398, | |
| "learning_rate": 0.00019284, | |
| "loss": 2.4054040908813477, | |
| "mean_token_accuracy": 0.5891438674181699, | |
| "num_tokens": 1067233.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 2.2392648085951805, | |
| "epoch": 0.3266709649688373, | |
| "grad_norm": 0.45199301838874817, | |
| "learning_rate": 0.00019244000000000002, | |
| "loss": 2.3508541107177736, | |
| "mean_token_accuracy": 0.591214832291007, | |
| "num_tokens": 1124472.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 2.149501931667328, | |
| "epoch": 0.3438641736514077, | |
| "grad_norm": 0.44520440697669983, | |
| "learning_rate": 0.00019204, | |
| "loss": 2.281546783447266, | |
| "mean_token_accuracy": 0.5986809592694045, | |
| "num_tokens": 1180382.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 2.2454783216118814, | |
| "epoch": 0.3610573823339781, | |
| "grad_norm": 0.5927078723907471, | |
| "learning_rate": 0.00019164000000000003, | |
| "loss": 2.4131927490234375, | |
| "mean_token_accuracy": 0.5888259880244732, | |
| "num_tokens": 1240228.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 2.2189586043357847, | |
| "epoch": 0.37825059101654845, | |
| "grad_norm": 0.4634048044681549, | |
| "learning_rate": 0.00019124000000000002, | |
| "loss": 2.336981201171875, | |
| "mean_token_accuracy": 0.5954784743487835, | |
| "num_tokens": 1300889.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 2.317907977104187, | |
| "epoch": 0.39544379969911886, | |
| "grad_norm": 0.492512971162796, | |
| "learning_rate": 0.00019084, | |
| "loss": 2.4568838119506835, | |
| "mean_token_accuracy": 0.5841229122132063, | |
| "num_tokens": 1357396.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 2.2429257184267044, | |
| "epoch": 0.4126370083816892, | |
| "grad_norm": 0.515352725982666, | |
| "learning_rate": 0.00019044000000000003, | |
| "loss": 2.3383663177490233, | |
| "mean_token_accuracy": 0.5922708168625832, | |
| "num_tokens": 1417505.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 2.2210252806544304, | |
| "epoch": 0.4298302170642596, | |
| "grad_norm": 0.4831322133541107, | |
| "learning_rate": 0.00019004000000000002, | |
| "loss": 2.4192705154418945, | |
| "mean_token_accuracy": 0.5948628049343825, | |
| "num_tokens": 1478753.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 2.274476508796215, | |
| "epoch": 0.44702342574683, | |
| "grad_norm": 0.49397456645965576, | |
| "learning_rate": 0.00018964, | |
| "loss": 2.3850372314453123, | |
| "mean_token_accuracy": 0.5882141895592212, | |
| "num_tokens": 1540514.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 2.2252144277095796, | |
| "epoch": 0.46421663442940037, | |
| "grad_norm": 0.48239603638648987, | |
| "learning_rate": 0.00018924000000000002, | |
| "loss": 2.3765859603881836, | |
| "mean_token_accuracy": 0.597905408218503, | |
| "num_tokens": 1602363.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 2.2030586138367654, | |
| "epoch": 0.48140984311197077, | |
| "grad_norm": 0.47931915521621704, | |
| "learning_rate": 0.00018884000000000001, | |
| "loss": 2.3378921508789063, | |
| "mean_token_accuracy": 0.5946248725056649, | |
| "num_tokens": 1663013.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 2.193090632557869, | |
| "epoch": 0.4986030517945412, | |
| "grad_norm": 0.4267734885215759, | |
| "learning_rate": 0.00018844, | |
| "loss": 2.301167106628418, | |
| "mean_token_accuracy": 0.600122318789363, | |
| "num_tokens": 1724937.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 2.2401370778679848, | |
| "epoch": 0.5157962604771116, | |
| "grad_norm": 0.49861446022987366, | |
| "learning_rate": 0.00018804000000000002, | |
| "loss": 2.3409378051757814, | |
| "mean_token_accuracy": 0.5942602109163999, | |
| "num_tokens": 1782528.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 2.2543238058686255, | |
| "epoch": 0.5329894691596819, | |
| "grad_norm": 0.5549066066741943, | |
| "learning_rate": 0.00018764, | |
| "loss": 2.397447967529297, | |
| "mean_token_accuracy": 0.5927667014300824, | |
| "num_tokens": 1841865.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 2.1446528255939485, | |
| "epoch": 0.5501826778422523, | |
| "grad_norm": 0.6057388782501221, | |
| "learning_rate": 0.00018724, | |
| "loss": 2.2854946136474608, | |
| "mean_token_accuracy": 0.5992342013865709, | |
| "num_tokens": 1901375.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 2.2384325101971627, | |
| "epoch": 0.5673758865248227, | |
| "grad_norm": 0.6120573282241821, | |
| "learning_rate": 0.00018684000000000002, | |
| "loss": 2.3560409545898438, | |
| "mean_token_accuracy": 0.5913927119225264, | |
| "num_tokens": 1959650.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 2.103813698887825, | |
| "epoch": 0.5845690952073931, | |
| "grad_norm": 0.5133985280990601, | |
| "learning_rate": 0.00018644, | |
| "loss": 2.2106700897216798, | |
| "mean_token_accuracy": 0.6131400100886821, | |
| "num_tokens": 2020031.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 2.11156629472971, | |
| "epoch": 0.6017623038899634, | |
| "grad_norm": 0.4958188235759735, | |
| "learning_rate": 0.00018604, | |
| "loss": 2.2597396850585936, | |
| "mean_token_accuracy": 0.6059262081980705, | |
| "num_tokens": 2080034.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 2.2104950502514837, | |
| "epoch": 0.6189555125725339, | |
| "grad_norm": 0.5096895098686218, | |
| "learning_rate": 0.00018564000000000002, | |
| "loss": 2.271474075317383, | |
| "mean_token_accuracy": 0.597117318212986, | |
| "num_tokens": 2137629.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 2.134429484605789, | |
| "epoch": 0.6361487212551042, | |
| "grad_norm": 0.5682357549667358, | |
| "learning_rate": 0.00018524, | |
| "loss": 2.237981605529785, | |
| "mean_token_accuracy": 0.6059913612902165, | |
| "num_tokens": 2196125.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 2.151672175526619, | |
| "epoch": 0.6533419299376746, | |
| "grad_norm": 0.5488378405570984, | |
| "learning_rate": 0.00018484000000000003, | |
| "loss": 2.264353942871094, | |
| "mean_token_accuracy": 0.5990234814584255, | |
| "num_tokens": 2256258.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 2.1240685641765595, | |
| "epoch": 0.670535138620245, | |
| "grad_norm": 0.5736668705940247, | |
| "learning_rate": 0.00018444000000000002, | |
| "loss": 2.2498214721679686, | |
| "mean_token_accuracy": 0.6068044692277909, | |
| "num_tokens": 2317743.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 2.110594576597214, | |
| "epoch": 0.6877283473028154, | |
| "grad_norm": 0.5833790302276611, | |
| "learning_rate": 0.00018404, | |
| "loss": 2.222176361083984, | |
| "mean_token_accuracy": 0.6128960818052291, | |
| "num_tokens": 2381669.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 2.1233398094773293, | |
| "epoch": 0.7049215559853858, | |
| "grad_norm": 0.5612857937812805, | |
| "learning_rate": 0.00018364000000000002, | |
| "loss": 2.2054920196533203, | |
| "mean_token_accuracy": 0.6041553311049939, | |
| "num_tokens": 2439094.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 2.1500213012099265, | |
| "epoch": 0.7221147646679562, | |
| "grad_norm": 0.5670902132987976, | |
| "learning_rate": 0.00018324000000000001, | |
| "loss": 2.2393463134765623, | |
| "mean_token_accuracy": 0.6055185578763485, | |
| "num_tokens": 2498246.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 2.03703038841486, | |
| "epoch": 0.7393079733505266, | |
| "grad_norm": 0.6171953082084656, | |
| "learning_rate": 0.00018284, | |
| "loss": 2.129292678833008, | |
| "mean_token_accuracy": 0.6159544993191958, | |
| "num_tokens": 2557403.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 2.0861524820327757, | |
| "epoch": 0.7565011820330969, | |
| "grad_norm": 0.5753834247589111, | |
| "learning_rate": 0.00018244000000000002, | |
| "loss": 2.189851760864258, | |
| "mean_token_accuracy": 0.6077463660389185, | |
| "num_tokens": 2614912.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 2.127491444349289, | |
| "epoch": 0.7736943907156673, | |
| "grad_norm": 0.5808554291725159, | |
| "learning_rate": 0.00018204, | |
| "loss": 2.2198902130126954, | |
| "mean_token_accuracy": 0.6067756544798613, | |
| "num_tokens": 2673278.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 2.10256717056036, | |
| "epoch": 0.7908875993982377, | |
| "grad_norm": 0.6682887077331543, | |
| "learning_rate": 0.00018164, | |
| "loss": 2.1562450408935545, | |
| "mean_token_accuracy": 0.6123722370713949, | |
| "num_tokens": 2733477.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 2.0725875943899155, | |
| "epoch": 0.8080808080808081, | |
| "grad_norm": 0.5611984729766846, | |
| "learning_rate": 0.00018124000000000002, | |
| "loss": 2.1555084228515624, | |
| "mean_token_accuracy": 0.6104017984122038, | |
| "num_tokens": 2793130.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 2.0911933913826943, | |
| "epoch": 0.8252740167633784, | |
| "grad_norm": 0.4832659661769867, | |
| "learning_rate": 0.00018084, | |
| "loss": 2.2131584167480467, | |
| "mean_token_accuracy": 0.6108913067728281, | |
| "num_tokens": 2856643.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 2.089042477309704, | |
| "epoch": 0.8424672254459489, | |
| "grad_norm": 0.6528189182281494, | |
| "learning_rate": 0.00018044, | |
| "loss": 2.222637939453125, | |
| "mean_token_accuracy": 0.6140136975795031, | |
| "num_tokens": 2918669.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 2.177516208589077, | |
| "epoch": 0.8596604341285192, | |
| "grad_norm": 0.511565089225769, | |
| "learning_rate": 0.00018004000000000002, | |
| "loss": 2.2838220596313477, | |
| "mean_token_accuracy": 0.6013776436448097, | |
| "num_tokens": 2977952.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 2.0694020837545395, | |
| "epoch": 0.8768536428110896, | |
| "grad_norm": 0.6112110614776611, | |
| "learning_rate": 0.00017964, | |
| "loss": 2.1932716369628906, | |
| "mean_token_accuracy": 0.6094281867146492, | |
| "num_tokens": 3039858.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 2.074494983255863, | |
| "epoch": 0.89404685149366, | |
| "grad_norm": 0.6264903545379639, | |
| "learning_rate": 0.00017924, | |
| "loss": 2.1746356964111326, | |
| "mean_token_accuracy": 0.6137691352516412, | |
| "num_tokens": 3095095.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 2.1704643085598945, | |
| "epoch": 0.9112400601762304, | |
| "grad_norm": 0.5588786602020264, | |
| "learning_rate": 0.00017884000000000002, | |
| "loss": 2.256443977355957, | |
| "mean_token_accuracy": 0.6048448126763105, | |
| "num_tokens": 3154541.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 2.0287919655442237, | |
| "epoch": 0.9284332688588007, | |
| "grad_norm": 0.5964768528938293, | |
| "learning_rate": 0.00017844, | |
| "loss": 2.1472768783569336, | |
| "mean_token_accuracy": 0.6183896280825139, | |
| "num_tokens": 3213587.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 2.0924746826291085, | |
| "epoch": 0.9456264775413712, | |
| "grad_norm": 0.6021146774291992, | |
| "learning_rate": 0.00017804, | |
| "loss": 2.194413757324219, | |
| "mean_token_accuracy": 0.6094567842781544, | |
| "num_tokens": 3270420.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 2.2008628591895105, | |
| "epoch": 0.9628196862239415, | |
| "grad_norm": 0.5850458741188049, | |
| "learning_rate": 0.00017764000000000001, | |
| "loss": 2.325449752807617, | |
| "mean_token_accuracy": 0.5982100035995245, | |
| "num_tokens": 3329358.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 2.065661500394344, | |
| "epoch": 0.9800128949065119, | |
| "grad_norm": 0.5808996558189392, | |
| "learning_rate": 0.00017724, | |
| "loss": 2.197132873535156, | |
| "mean_token_accuracy": 0.614894449710846, | |
| "num_tokens": 3389408.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 2.0561472952365873, | |
| "epoch": 0.9972061035890823, | |
| "grad_norm": 0.5550947189331055, | |
| "learning_rate": 0.00017684, | |
| "loss": 2.1736166000366213, | |
| "mean_token_accuracy": 0.6167749039828777, | |
| "num_tokens": 3448108.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 2.017703249857023, | |
| "epoch": 1.0137545669460564, | |
| "grad_norm": 0.5885869860649109, | |
| "learning_rate": 0.00017644, | |
| "loss": 2.093535614013672, | |
| "mean_token_accuracy": 0.623307110427262, | |
| "num_tokens": 3510556.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 2.0724975898861886, | |
| "epoch": 1.0309477756286267, | |
| "grad_norm": 0.6038488745689392, | |
| "learning_rate": 0.00017604, | |
| "loss": 2.1512643814086916, | |
| "mean_token_accuracy": 0.6218964882194996, | |
| "num_tokens": 3572796.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 2.0171437337994576, | |
| "epoch": 1.048140984311197, | |
| "grad_norm": 0.5356580018997192, | |
| "learning_rate": 0.00017564, | |
| "loss": 2.1061470031738283, | |
| "mean_token_accuracy": 0.6229204799979925, | |
| "num_tokens": 3636040.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 2.069349633157253, | |
| "epoch": 1.0653341929937674, | |
| "grad_norm": 0.6440969109535217, | |
| "learning_rate": 0.00017524, | |
| "loss": 2.1538244247436524, | |
| "mean_token_accuracy": 0.6151916943490505, | |
| "num_tokens": 3692494.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 1.9605562821030618, | |
| "epoch": 1.0825274016763378, | |
| "grad_norm": 0.6160545349121094, | |
| "learning_rate": 0.00017484, | |
| "loss": 2.036081314086914, | |
| "mean_token_accuracy": 0.6326734255999327, | |
| "num_tokens": 3750269.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 1.9889371052384377, | |
| "epoch": 1.0997206103589083, | |
| "grad_norm": 0.6164005398750305, | |
| "learning_rate": 0.00017444, | |
| "loss": 2.051021194458008, | |
| "mean_token_accuracy": 0.6301001563668251, | |
| "num_tokens": 3808993.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 2.1208505019545556, | |
| "epoch": 1.1169138190414787, | |
| "grad_norm": 0.5549023151397705, | |
| "learning_rate": 0.00017404, | |
| "loss": 2.2218536376953124, | |
| "mean_token_accuracy": 0.6127480801194907, | |
| "num_tokens": 3875366.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 2.0680324912071226, | |
| "epoch": 1.134107027724049, | |
| "grad_norm": 0.6039656400680542, | |
| "learning_rate": 0.00017364, | |
| "loss": 2.1767093658447267, | |
| "mean_token_accuracy": 0.61400815397501, | |
| "num_tokens": 3936258.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 2.072723534703255, | |
| "epoch": 1.1513002364066194, | |
| "grad_norm": 0.5685736536979675, | |
| "learning_rate": 0.00017324000000000002, | |
| "loss": 2.16034049987793, | |
| "mean_token_accuracy": 0.6158209484070539, | |
| "num_tokens": 3996786.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 2.0064551383256912, | |
| "epoch": 1.1684934450891897, | |
| "grad_norm": 0.6133168935775757, | |
| "learning_rate": 0.00017284, | |
| "loss": 2.0968536376953124, | |
| "mean_token_accuracy": 0.6204963065683842, | |
| "num_tokens": 4057237.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 1.999112318456173, | |
| "epoch": 1.18568665377176, | |
| "grad_norm": 0.6392707228660583, | |
| "learning_rate": 0.00017244, | |
| "loss": 2.1093589782714846, | |
| "mean_token_accuracy": 0.6260342009365558, | |
| "num_tokens": 4115641.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 1.9792790532112121, | |
| "epoch": 1.2028798624543304, | |
| "grad_norm": 0.5575782656669617, | |
| "learning_rate": 0.00017204, | |
| "loss": 2.097017288208008, | |
| "mean_token_accuracy": 0.6311597619205713, | |
| "num_tokens": 4179186.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 2.0830755099654197, | |
| "epoch": 1.220073071136901, | |
| "grad_norm": 0.5660914182662964, | |
| "learning_rate": 0.00017164, | |
| "loss": 2.1811811447143556, | |
| "mean_token_accuracy": 0.6163320489227772, | |
| "num_tokens": 4239862.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 2.0135093346238135, | |
| "epoch": 1.2372662798194713, | |
| "grad_norm": 0.535127580165863, | |
| "learning_rate": 0.00017124, | |
| "loss": 2.1185089111328126, | |
| "mean_token_accuracy": 0.6227246847003698, | |
| "num_tokens": 4299020.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 2.0245131298899652, | |
| "epoch": 1.2544594885020417, | |
| "grad_norm": 0.5870150327682495, | |
| "learning_rate": 0.00017084, | |
| "loss": 2.1228567123413087, | |
| "mean_token_accuracy": 0.620493221282959, | |
| "num_tokens": 4357565.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 2.0833021104335785, | |
| "epoch": 1.271652697184612, | |
| "grad_norm": 0.6691998243331909, | |
| "learning_rate": 0.00017044, | |
| "loss": 2.1688915252685548, | |
| "mean_token_accuracy": 0.6160433337092399, | |
| "num_tokens": 4416113.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 1.9562881767749787, | |
| "epoch": 1.2888459058671824, | |
| "grad_norm": 0.5513840317726135, | |
| "learning_rate": 0.00017004, | |
| "loss": 2.038709831237793, | |
| "mean_token_accuracy": 0.6274638958275318, | |
| "num_tokens": 4479913.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 2.0899336978793146, | |
| "epoch": 1.306039114549753, | |
| "grad_norm": 0.6334884166717529, | |
| "learning_rate": 0.00016964, | |
| "loss": 2.1464771270751952, | |
| "mean_token_accuracy": 0.6156820185482502, | |
| "num_tokens": 4538630.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 2.006402041018009, | |
| "epoch": 1.3232323232323233, | |
| "grad_norm": 0.6608359813690186, | |
| "learning_rate": 0.00016924, | |
| "loss": 2.0910036087036135, | |
| "mean_token_accuracy": 0.6266825262457132, | |
| "num_tokens": 4596432.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 1.981143780052662, | |
| "epoch": 1.3404255319148937, | |
| "grad_norm": 0.6512318849563599, | |
| "learning_rate": 0.00016884, | |
| "loss": 2.0733022689819336, | |
| "mean_token_accuracy": 0.6271612212061882, | |
| "num_tokens": 4653388.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 2.046033799648285, | |
| "epoch": 1.357618740597464, | |
| "grad_norm": 0.657543957233429, | |
| "learning_rate": 0.00016844, | |
| "loss": 2.122422790527344, | |
| "mean_token_accuracy": 0.6139081876724959, | |
| "num_tokens": 4707749.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 2.0472232535481454, | |
| "epoch": 1.3748119492800344, | |
| "grad_norm": 0.5705983638763428, | |
| "learning_rate": 0.00016804, | |
| "loss": 2.1095462799072267, | |
| "mean_token_accuracy": 0.6147159416228533, | |
| "num_tokens": 4768198.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 2.0032111018896104, | |
| "epoch": 1.3920051579626047, | |
| "grad_norm": 0.6248787045478821, | |
| "learning_rate": 0.00016764, | |
| "loss": 2.06416015625, | |
| "mean_token_accuracy": 0.626778207719326, | |
| "num_tokens": 4830049.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 2.008875849843025, | |
| "epoch": 1.409198366645175, | |
| "grad_norm": 0.6245584487915039, | |
| "learning_rate": 0.00016724000000000003, | |
| "loss": 2.096910285949707, | |
| "mean_token_accuracy": 0.626385198161006, | |
| "num_tokens": 4887278.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 2.080148513615131, | |
| "epoch": 1.4263915753277456, | |
| "grad_norm": 0.6255568861961365, | |
| "learning_rate": 0.00016684000000000002, | |
| "loss": 2.1830982208251952, | |
| "mean_token_accuracy": 0.6152387134730816, | |
| "num_tokens": 4946401.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 1.999508222937584, | |
| "epoch": 1.443584784010316, | |
| "grad_norm": 0.6275898218154907, | |
| "learning_rate": 0.00016644, | |
| "loss": 2.1005062103271483, | |
| "mean_token_accuracy": 0.6225087266415358, | |
| "num_tokens": 5007769.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 1.937928880751133, | |
| "epoch": 1.4607779926928863, | |
| "grad_norm": 0.5719529986381531, | |
| "learning_rate": 0.00016604000000000003, | |
| "loss": 2.026857376098633, | |
| "mean_token_accuracy": 0.6374159537255764, | |
| "num_tokens": 5065155.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 2.0230891808867453, | |
| "epoch": 1.4779712013754567, | |
| "grad_norm": 0.6000656485557556, | |
| "learning_rate": 0.00016564000000000002, | |
| "loss": 2.1284557342529298, | |
| "mean_token_accuracy": 0.6221452355384827, | |
| "num_tokens": 5127964.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 2.0356423661112784, | |
| "epoch": 1.495164410058027, | |
| "grad_norm": 0.6452302932739258, | |
| "learning_rate": 0.00016524, | |
| "loss": 2.136619758605957, | |
| "mean_token_accuracy": 0.6180427376180887, | |
| "num_tokens": 5186867.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 1.980474129319191, | |
| "epoch": 1.5123576187405976, | |
| "grad_norm": 0.6840422749519348, | |
| "learning_rate": 0.00016484000000000003, | |
| "loss": 2.1068920135498046, | |
| "mean_token_accuracy": 0.6276284489780665, | |
| "num_tokens": 5243496.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 1.9933902084827424, | |
| "epoch": 1.5295508274231677, | |
| "grad_norm": 0.6497262716293335, | |
| "learning_rate": 0.00016444000000000002, | |
| "loss": 2.0951356887817383, | |
| "mean_token_accuracy": 0.625996507704258, | |
| "num_tokens": 5305224.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 1.9787120044231414, | |
| "epoch": 1.5467440361057383, | |
| "grad_norm": 0.5943708419799805, | |
| "learning_rate": 0.00016404, | |
| "loss": 2.050846481323242, | |
| "mean_token_accuracy": 0.6277968011796474, | |
| "num_tokens": 5362047.0, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 1.9207568421959877, | |
| "epoch": 1.5639372447883086, | |
| "grad_norm": 0.7813357710838318, | |
| "learning_rate": 0.00016364000000000003, | |
| "loss": 2.01483097076416, | |
| "mean_token_accuracy": 0.6335461936891079, | |
| "num_tokens": 5418646.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 1.991297036409378, | |
| "epoch": 1.581130453470879, | |
| "grad_norm": 0.6333187818527222, | |
| "learning_rate": 0.00016324000000000002, | |
| "loss": 2.042116165161133, | |
| "mean_token_accuracy": 0.6277558326721191, | |
| "num_tokens": 5474990.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 2.049289306998253, | |
| "epoch": 1.5983236621534493, | |
| "grad_norm": 0.6549156308174133, | |
| "learning_rate": 0.00016284, | |
| "loss": 2.160791778564453, | |
| "mean_token_accuracy": 0.6178662430495023, | |
| "num_tokens": 5536798.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 2.0044725999236106, | |
| "epoch": 1.6155168708360197, | |
| "grad_norm": 0.7054678201675415, | |
| "learning_rate": 0.00016244000000000002, | |
| "loss": 2.1012857437133787, | |
| "mean_token_accuracy": 0.6249351866543293, | |
| "num_tokens": 5592825.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 1.8883072763681412, | |
| "epoch": 1.6327100795185903, | |
| "grad_norm": 0.6150692701339722, | |
| "learning_rate": 0.00016204000000000001, | |
| "loss": 1.933417510986328, | |
| "mean_token_accuracy": 0.6382322389632463, | |
| "num_tokens": 5647293.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 1.9773345232009887, | |
| "epoch": 1.6499032882011604, | |
| "grad_norm": 0.6604284048080444, | |
| "learning_rate": 0.00016164, | |
| "loss": 2.0537574768066404, | |
| "mean_token_accuracy": 0.6284147780388594, | |
| "num_tokens": 5707851.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 1.9602369159460067, | |
| "epoch": 1.667096496883731, | |
| "grad_norm": 0.5877639055252075, | |
| "learning_rate": 0.00016124000000000002, | |
| "loss": 2.0338172912597656, | |
| "mean_token_accuracy": 0.6378967847675086, | |
| "num_tokens": 5766339.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 2.0475835338234902, | |
| "epoch": 1.6842897055663013, | |
| "grad_norm": 0.6029936075210571, | |
| "learning_rate": 0.00016084, | |
| "loss": 2.161564254760742, | |
| "mean_token_accuracy": 0.6204672615975142, | |
| "num_tokens": 5827901.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 1.9963667973876, | |
| "epoch": 1.7014829142488717, | |
| "grad_norm": 0.6379806399345398, | |
| "learning_rate": 0.00016044, | |
| "loss": 2.087441825866699, | |
| "mean_token_accuracy": 0.6302045777440071, | |
| "num_tokens": 5888724.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 2.003598253428936, | |
| "epoch": 1.7186761229314422, | |
| "grad_norm": 0.6113580465316772, | |
| "learning_rate": 0.00016004000000000002, | |
| "loss": 2.136977195739746, | |
| "mean_token_accuracy": 0.6279609728604555, | |
| "num_tokens": 5948999.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 2.009714852273464, | |
| "epoch": 1.7358693316140124, | |
| "grad_norm": 0.6397438645362854, | |
| "learning_rate": 0.00015964, | |
| "loss": 2.1206714630126955, | |
| "mean_token_accuracy": 0.6264089956879616, | |
| "num_tokens": 6010131.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "entropy": 1.9690447196364402, | |
| "epoch": 1.753062540296583, | |
| "grad_norm": 0.6629226803779602, | |
| "learning_rate": 0.00015924, | |
| "loss": 2.0379661560058593, | |
| "mean_token_accuracy": 0.6306859996169806, | |
| "num_tokens": 6070105.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "entropy": 1.9761606559157372, | |
| "epoch": 1.7702557489791533, | |
| "grad_norm": 0.6340969800949097, | |
| "learning_rate": 0.00015884000000000002, | |
| "loss": 2.0979576110839844, | |
| "mean_token_accuracy": 0.6255223523825407, | |
| "num_tokens": 6129369.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "entropy": 2.0855698764324186, | |
| "epoch": 1.7874489576617236, | |
| "grad_norm": 0.6090606451034546, | |
| "learning_rate": 0.00015844, | |
| "loss": 2.1925222396850588, | |
| "mean_token_accuracy": 0.6140031859278678, | |
| "num_tokens": 6191194.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "entropy": 2.0149157389998438, | |
| "epoch": 1.804642166344294, | |
| "grad_norm": 0.6012734174728394, | |
| "learning_rate": 0.00015804, | |
| "loss": 2.104481506347656, | |
| "mean_token_accuracy": 0.6318388734012842, | |
| "num_tokens": 6254136.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 1.9774292945861816, | |
| "epoch": 1.8218353750268643, | |
| "grad_norm": 0.5775039792060852, | |
| "learning_rate": 0.00015764000000000002, | |
| "loss": 2.0788572311401365, | |
| "mean_token_accuracy": 0.6315916679799557, | |
| "num_tokens": 6311831.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "entropy": 2.0096932500600815, | |
| "epoch": 1.839028583709435, | |
| "grad_norm": 0.6441799998283386, | |
| "learning_rate": 0.00015724, | |
| "loss": 2.067369079589844, | |
| "mean_token_accuracy": 0.6272157531231641, | |
| "num_tokens": 6368999.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "entropy": 1.9509294107556343, | |
| "epoch": 1.856221792392005, | |
| "grad_norm": 0.7183738946914673, | |
| "learning_rate": 0.00015684, | |
| "loss": 2.046398162841797, | |
| "mean_token_accuracy": 0.6311775099486112, | |
| "num_tokens": 6426522.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "entropy": 2.0291523337364197, | |
| "epoch": 1.8734150010745756, | |
| "grad_norm": 0.6105868816375732, | |
| "learning_rate": 0.00015644, | |
| "loss": 2.103832244873047, | |
| "mean_token_accuracy": 0.6217470221221447, | |
| "num_tokens": 6488781.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "entropy": 1.9441853806376457, | |
| "epoch": 1.890608209757146, | |
| "grad_norm": 0.6209670901298523, | |
| "learning_rate": 0.00015604, | |
| "loss": 1.9830631256103515, | |
| "mean_token_accuracy": 0.6288409855216741, | |
| "num_tokens": 6545378.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 1.9924452617764472, | |
| "epoch": 1.9078014184397163, | |
| "grad_norm": 0.6691610813140869, | |
| "learning_rate": 0.00015564000000000002, | |
| "loss": 2.0985618591308595, | |
| "mean_token_accuracy": 0.6272476647049189, | |
| "num_tokens": 6605917.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "entropy": 1.9443897798657417, | |
| "epoch": 1.9249946271222869, | |
| "grad_norm": 0.5956576466560364, | |
| "learning_rate": 0.00015524, | |
| "loss": 2.0018499374389647, | |
| "mean_token_accuracy": 0.6381504714488984, | |
| "num_tokens": 6661811.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "entropy": 1.9805707216262818, | |
| "epoch": 1.942187835804857, | |
| "grad_norm": 0.5946056842803955, | |
| "learning_rate": 0.00015484, | |
| "loss": 2.0427513122558594, | |
| "mean_token_accuracy": 0.6258678704500198, | |
| "num_tokens": 6723336.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "entropy": 1.9137805163860322, | |
| "epoch": 1.9593810444874276, | |
| "grad_norm": 0.6030678749084473, | |
| "learning_rate": 0.00015444000000000002, | |
| "loss": 1.9697463989257813, | |
| "mean_token_accuracy": 0.641855177283287, | |
| "num_tokens": 6781069.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "entropy": 1.9368563026189805, | |
| "epoch": 1.9765742531699977, | |
| "grad_norm": 0.6237244009971619, | |
| "learning_rate": 0.00015404, | |
| "loss": 2.034241485595703, | |
| "mean_token_accuracy": 0.6384489141404629, | |
| "num_tokens": 6840129.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 1.8723769560456276, | |
| "epoch": 1.9937674618525683, | |
| "grad_norm": 0.6409602761268616, | |
| "learning_rate": 0.00015364, | |
| "loss": 1.9205358505249024, | |
| "mean_token_accuracy": 0.6437219835817813, | |
| "num_tokens": 6894472.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "entropy": 1.8628178556244095, | |
| "epoch": 2.010315925209542, | |
| "grad_norm": 0.6614267230033875, | |
| "learning_rate": 0.00015324000000000002, | |
| "loss": 1.8996776580810546, | |
| "mean_token_accuracy": 0.6492502008165631, | |
| "num_tokens": 6952821.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "entropy": 1.8918349608778953, | |
| "epoch": 2.0275091338921127, | |
| "grad_norm": 0.6476360559463501, | |
| "learning_rate": 0.00015284, | |
| "loss": 1.9498180389404296, | |
| "mean_token_accuracy": 0.6428254719823598, | |
| "num_tokens": 7014989.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "entropy": 1.945443508028984, | |
| "epoch": 2.044702342574683, | |
| "grad_norm": 0.7725419402122498, | |
| "learning_rate": 0.00015244, | |
| "loss": 2.020453453063965, | |
| "mean_token_accuracy": 0.6328805617988109, | |
| "num_tokens": 7073087.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "entropy": 2.039360311627388, | |
| "epoch": 2.0618955512572534, | |
| "grad_norm": 0.6981257200241089, | |
| "learning_rate": 0.00015204000000000001, | |
| "loss": 2.137688636779785, | |
| "mean_token_accuracy": 0.6245126206427812, | |
| "num_tokens": 7129761.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 1.9039745211601258, | |
| "epoch": 2.0790887599398236, | |
| "grad_norm": 0.6344786882400513, | |
| "learning_rate": 0.00015164, | |
| "loss": 1.945602798461914, | |
| "mean_token_accuracy": 0.6396142981946469, | |
| "num_tokens": 7187187.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "entropy": 1.9067329421639443, | |
| "epoch": 2.096281968622394, | |
| "grad_norm": 0.6346563696861267, | |
| "learning_rate": 0.00015124, | |
| "loss": 1.990530014038086, | |
| "mean_token_accuracy": 0.6383092008531094, | |
| "num_tokens": 7247237.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "entropy": 1.8649237960577012, | |
| "epoch": 2.1134751773049647, | |
| "grad_norm": 0.6211341023445129, | |
| "learning_rate": 0.00015084, | |
| "loss": 1.9000562667846679, | |
| "mean_token_accuracy": 0.6510101232677699, | |
| "num_tokens": 7301760.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "entropy": 1.9364619553089142, | |
| "epoch": 2.130668385987535, | |
| "grad_norm": 0.6074926257133484, | |
| "learning_rate": 0.00015044, | |
| "loss": 2.0338695526123045, | |
| "mean_token_accuracy": 0.6359073251485825, | |
| "num_tokens": 7361681.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "entropy": 1.86348085552454, | |
| "epoch": 2.1478615946701054, | |
| "grad_norm": 0.7472113370895386, | |
| "learning_rate": 0.00015004, | |
| "loss": 1.9359153747558593, | |
| "mean_token_accuracy": 0.6460991870611906, | |
| "num_tokens": 7424162.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 1.9580871596932412, | |
| "epoch": 2.1650548033526755, | |
| "grad_norm": 0.6993629336357117, | |
| "learning_rate": 0.00014964, | |
| "loss": 2.0203479766845702, | |
| "mean_token_accuracy": 0.630286343768239, | |
| "num_tokens": 7486191.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "entropy": 1.9111438870429993, | |
| "epoch": 2.182248012035246, | |
| "grad_norm": 0.6560285687446594, | |
| "learning_rate": 0.00014924, | |
| "loss": 1.9584983825683593, | |
| "mean_token_accuracy": 0.6434897668659687, | |
| "num_tokens": 7544447.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "entropy": 1.931545352935791, | |
| "epoch": 2.1994412207178167, | |
| "grad_norm": 0.6713767647743225, | |
| "learning_rate": 0.00014884, | |
| "loss": 2.106512451171875, | |
| "mean_token_accuracy": 0.6318223185837268, | |
| "num_tokens": 7604237.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "entropy": 1.9441128447651863, | |
| "epoch": 2.216634429400387, | |
| "grad_norm": 0.6440369486808777, | |
| "learning_rate": 0.00014844, | |
| "loss": 2.0373985290527346, | |
| "mean_token_accuracy": 0.6320536743849516, | |
| "num_tokens": 7661475.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "entropy": 1.949166515469551, | |
| "epoch": 2.2338276380829574, | |
| "grad_norm": 0.6829583644866943, | |
| "learning_rate": 0.00014804, | |
| "loss": 2.005051040649414, | |
| "mean_token_accuracy": 0.6337476786226034, | |
| "num_tokens": 7720352.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 1.8687394335865974, | |
| "epoch": 2.2510208467655275, | |
| "grad_norm": 0.6745384335517883, | |
| "learning_rate": 0.00014764, | |
| "loss": 1.9336997985839843, | |
| "mean_token_accuracy": 0.6435953237116336, | |
| "num_tokens": 7780071.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "entropy": 1.8999060586094856, | |
| "epoch": 2.268214055448098, | |
| "grad_norm": 0.5983019471168518, | |
| "learning_rate": 0.00014724, | |
| "loss": 1.9348258972167969, | |
| "mean_token_accuracy": 0.6431682731956243, | |
| "num_tokens": 7839864.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "entropy": 1.868900626897812, | |
| "epoch": 2.285407264130668, | |
| "grad_norm": 0.6673943400382996, | |
| "learning_rate": 0.00014684, | |
| "loss": 1.9038555145263671, | |
| "mean_token_accuracy": 0.6503972858190536, | |
| "num_tokens": 7901781.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "entropy": 2.0273515924811365, | |
| "epoch": 2.3026004728132388, | |
| "grad_norm": 0.7098233103752136, | |
| "learning_rate": 0.00014644, | |
| "loss": 2.1501066207885744, | |
| "mean_token_accuracy": 0.6240016058087349, | |
| "num_tokens": 7962103.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "entropy": 1.9079027369618415, | |
| "epoch": 2.3197936814958093, | |
| "grad_norm": 0.6738902926445007, | |
| "learning_rate": 0.00014604, | |
| "loss": 1.9681257247924804, | |
| "mean_token_accuracy": 0.6416124865412712, | |
| "num_tokens": 8021532.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 1.9230551555752755, | |
| "epoch": 2.3369868901783795, | |
| "grad_norm": 0.7348875999450684, | |
| "learning_rate": 0.00014564, | |
| "loss": 1.995201301574707, | |
| "mean_token_accuracy": 0.6393462974578142, | |
| "num_tokens": 8083084.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "entropy": 1.9632413163781166, | |
| "epoch": 2.35418009886095, | |
| "grad_norm": 0.7093244194984436, | |
| "learning_rate": 0.00014523999999999998, | |
| "loss": 2.0105892181396485, | |
| "mean_token_accuracy": 0.6326792053878307, | |
| "num_tokens": 8144387.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "entropy": 1.9028392255306243, | |
| "epoch": 2.37137330754352, | |
| "grad_norm": 0.7751646041870117, | |
| "learning_rate": 0.00014484, | |
| "loss": 1.9892047882080077, | |
| "mean_token_accuracy": 0.6404657427221536, | |
| "num_tokens": 8203720.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "entropy": 1.860563676059246, | |
| "epoch": 2.3885665162260907, | |
| "grad_norm": 0.7243943214416504, | |
| "learning_rate": 0.00014444, | |
| "loss": 1.9201997756958007, | |
| "mean_token_accuracy": 0.6510312400758267, | |
| "num_tokens": 8263059.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "entropy": 1.8883199632167815, | |
| "epoch": 2.405759724908661, | |
| "grad_norm": 0.6106081604957581, | |
| "learning_rate": 0.00014404, | |
| "loss": 1.9687911987304687, | |
| "mean_token_accuracy": 0.6493277192115784, | |
| "num_tokens": 8323826.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 1.9286921486258506, | |
| "epoch": 2.4229529335912314, | |
| "grad_norm": 0.697307825088501, | |
| "learning_rate": 0.00014364, | |
| "loss": 2.030810546875, | |
| "mean_token_accuracy": 0.6362422123551369, | |
| "num_tokens": 8383325.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "entropy": 1.9122009217739104, | |
| "epoch": 2.440146142273802, | |
| "grad_norm": 0.7119978070259094, | |
| "learning_rate": 0.00014324, | |
| "loss": 1.9788457870483398, | |
| "mean_token_accuracy": 0.6400811962783337, | |
| "num_tokens": 8442393.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "entropy": 1.9088031873106956, | |
| "epoch": 2.457339350956372, | |
| "grad_norm": 0.6792617440223694, | |
| "learning_rate": 0.00014284, | |
| "loss": 1.995138931274414, | |
| "mean_token_accuracy": 0.63965779915452, | |
| "num_tokens": 8501631.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "entropy": 1.8871790513396263, | |
| "epoch": 2.4745325596389427, | |
| "grad_norm": 0.7191487550735474, | |
| "learning_rate": 0.00014244000000000003, | |
| "loss": 1.9728309631347656, | |
| "mean_token_accuracy": 0.642948642373085, | |
| "num_tokens": 8561457.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "entropy": 1.8643269062042236, | |
| "epoch": 2.491725768321513, | |
| "grad_norm": 0.636345386505127, | |
| "learning_rate": 0.00014204000000000002, | |
| "loss": 1.9090641021728516, | |
| "mean_token_accuracy": 0.6537593178451061, | |
| "num_tokens": 8618281.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 1.90698651522398, | |
| "epoch": 2.5089189770040834, | |
| "grad_norm": 0.7444325685501099, | |
| "learning_rate": 0.00014164, | |
| "loss": 1.9631458282470704, | |
| "mean_token_accuracy": 0.6428062118589878, | |
| "num_tokens": 8675547.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "entropy": 1.942822016775608, | |
| "epoch": 2.526112185686654, | |
| "grad_norm": 0.6707946062088013, | |
| "learning_rate": 0.00014124000000000002, | |
| "loss": 2.0348419189453124, | |
| "mean_token_accuracy": 0.6381070952862501, | |
| "num_tokens": 8735546.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "entropy": 1.979990416765213, | |
| "epoch": 2.543305394369224, | |
| "grad_norm": 0.6958721280097961, | |
| "learning_rate": 0.00014084000000000001, | |
| "loss": 2.072053146362305, | |
| "mean_token_accuracy": 0.6283687971532345, | |
| "num_tokens": 8796404.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "entropy": 1.9207274168729782, | |
| "epoch": 2.5604986030517947, | |
| "grad_norm": 0.6575210690498352, | |
| "learning_rate": 0.00014044, | |
| "loss": 1.9427066802978517, | |
| "mean_token_accuracy": 0.642588010430336, | |
| "num_tokens": 8853405.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "entropy": 1.9980120360851288, | |
| "epoch": 2.577691811734365, | |
| "grad_norm": 0.7412211298942566, | |
| "learning_rate": 0.00014004000000000002, | |
| "loss": 2.1047718048095705, | |
| "mean_token_accuracy": 0.6264939974993468, | |
| "num_tokens": 8909416.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 1.8110749498009682, | |
| "epoch": 2.5948850204169354, | |
| "grad_norm": 0.7108538746833801, | |
| "learning_rate": 0.00013964, | |
| "loss": 1.8952640533447265, | |
| "mean_token_accuracy": 0.6537120632827282, | |
| "num_tokens": 8968510.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "entropy": 1.977073846757412, | |
| "epoch": 2.612078229099506, | |
| "grad_norm": 0.7554802298545837, | |
| "learning_rate": 0.00013924, | |
| "loss": 2.0621898651123045, | |
| "mean_token_accuracy": 0.6327366977930069, | |
| "num_tokens": 9026884.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "entropy": 1.8783492282032968, | |
| "epoch": 2.629271437782076, | |
| "grad_norm": 0.6592015027999878, | |
| "learning_rate": 0.00013884000000000002, | |
| "loss": 1.9230785369873047, | |
| "mean_token_accuracy": 0.6494536675512791, | |
| "num_tokens": 9085571.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "entropy": 1.9282778173685073, | |
| "epoch": 2.6464646464646466, | |
| "grad_norm": 0.7717080116271973, | |
| "learning_rate": 0.00013844, | |
| "loss": 2.0319377899169924, | |
| "mean_token_accuracy": 0.6344667036086321, | |
| "num_tokens": 9147549.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "entropy": 1.903467869758606, | |
| "epoch": 2.6636578551472168, | |
| "grad_norm": 0.6227516531944275, | |
| "learning_rate": 0.00013804000000000003, | |
| "loss": 1.9306724548339844, | |
| "mean_token_accuracy": 0.644033481925726, | |
| "num_tokens": 9204942.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 1.8967040538787843, | |
| "epoch": 2.6808510638297873, | |
| "grad_norm": 0.6684938073158264, | |
| "learning_rate": 0.00013764000000000002, | |
| "loss": 2.001560592651367, | |
| "mean_token_accuracy": 0.6470274899154902, | |
| "num_tokens": 9266446.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "entropy": 1.8590586185455322, | |
| "epoch": 2.6980442725123575, | |
| "grad_norm": 0.6150694489479065, | |
| "learning_rate": 0.00013724, | |
| "loss": 1.9280338287353516, | |
| "mean_token_accuracy": 0.6484670951962471, | |
| "num_tokens": 9326109.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "entropy": 1.9293041676282883, | |
| "epoch": 2.715237481194928, | |
| "grad_norm": 0.6057704091072083, | |
| "learning_rate": 0.00013684000000000002, | |
| "loss": 1.9943519592285157, | |
| "mean_token_accuracy": 0.6371258046478033, | |
| "num_tokens": 9385073.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "entropy": 1.8843669161200522, | |
| "epoch": 2.732430689877498, | |
| "grad_norm": 0.6834639310836792, | |
| "learning_rate": 0.00013644000000000002, | |
| "loss": 1.9569879531860352, | |
| "mean_token_accuracy": 0.6437417894601822, | |
| "num_tokens": 9445137.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "entropy": 1.8529930964112282, | |
| "epoch": 2.7496238985600687, | |
| "grad_norm": 0.6442180871963501, | |
| "learning_rate": 0.00013604, | |
| "loss": 1.8902450561523438, | |
| "mean_token_accuracy": 0.6518216013908387, | |
| "num_tokens": 9504160.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 1.939158782362938, | |
| "epoch": 2.7668171072426393, | |
| "grad_norm": 0.6240729689598083, | |
| "learning_rate": 0.00013564000000000002, | |
| "loss": 2.0188575744628907, | |
| "mean_token_accuracy": 0.63564417026937, | |
| "num_tokens": 9564675.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "entropy": 1.9281259045004844, | |
| "epoch": 2.7840103159252094, | |
| "grad_norm": 0.750890851020813, | |
| "learning_rate": 0.00013524, | |
| "loss": 2.017038345336914, | |
| "mean_token_accuracy": 0.6387452960014344, | |
| "num_tokens": 9625026.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "entropy": 1.873080413043499, | |
| "epoch": 2.80120352460778, | |
| "grad_norm": 0.776397168636322, | |
| "learning_rate": 0.00013484, | |
| "loss": 1.9759422302246095, | |
| "mean_token_accuracy": 0.6433901283890009, | |
| "num_tokens": 9685967.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "entropy": 1.9089648619294166, | |
| "epoch": 2.81839673329035, | |
| "grad_norm": 0.6481618881225586, | |
| "learning_rate": 0.00013444000000000002, | |
| "loss": 1.956050491333008, | |
| "mean_token_accuracy": 0.6402542922645807, | |
| "num_tokens": 9745233.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "entropy": 1.975960558652878, | |
| "epoch": 2.8355899419729207, | |
| "grad_norm": 0.6896694302558899, | |
| "learning_rate": 0.00013404, | |
| "loss": 2.0583721160888673, | |
| "mean_token_accuracy": 0.6340504981577396, | |
| "num_tokens": 9805150.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 1.945571132004261, | |
| "epoch": 2.8527831506554913, | |
| "grad_norm": 0.6386220455169678, | |
| "learning_rate": 0.00013364, | |
| "loss": 2.03116512298584, | |
| "mean_token_accuracy": 0.6365220382809639, | |
| "num_tokens": 9861196.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "entropy": 1.9110410138964653, | |
| "epoch": 2.8699763593380614, | |
| "grad_norm": 0.7503199577331543, | |
| "learning_rate": 0.00013324000000000002, | |
| "loss": 1.9521196365356446, | |
| "mean_token_accuracy": 0.6381696432828903, | |
| "num_tokens": 9921155.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "entropy": 1.849820225685835, | |
| "epoch": 2.887169568020632, | |
| "grad_norm": 0.6197855472564697, | |
| "learning_rate": 0.00013284, | |
| "loss": 1.8909440994262696, | |
| "mean_token_accuracy": 0.6426266122609376, | |
| "num_tokens": 9979351.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "entropy": 1.8932805389165879, | |
| "epoch": 2.904362776703202, | |
| "grad_norm": 0.6703120470046997, | |
| "learning_rate": 0.00013244, | |
| "loss": 2.0233718872070314, | |
| "mean_token_accuracy": 0.646468547359109, | |
| "num_tokens": 10041238.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "entropy": 1.8625088930130005, | |
| "epoch": 2.9215559853857727, | |
| "grad_norm": 0.73073410987854, | |
| "learning_rate": 0.00013204000000000002, | |
| "loss": 1.9317462921142579, | |
| "mean_token_accuracy": 0.6454428397119045, | |
| "num_tokens": 10099496.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 1.9354272544384004, | |
| "epoch": 2.9387491940683432, | |
| "grad_norm": 0.6566579937934875, | |
| "learning_rate": 0.00013164, | |
| "loss": 2.0027164459228515, | |
| "mean_token_accuracy": 0.6403926335275173, | |
| "num_tokens": 10161720.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "entropy": 1.88578300178051, | |
| "epoch": 2.9559424027509134, | |
| "grad_norm": 0.7905747890472412, | |
| "learning_rate": 0.00013124, | |
| "loss": 1.9767372131347656, | |
| "mean_token_accuracy": 0.6421503167599439, | |
| "num_tokens": 10221734.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "entropy": 1.870301403105259, | |
| "epoch": 2.973135611433484, | |
| "grad_norm": 0.7210419774055481, | |
| "learning_rate": 0.00013084000000000001, | |
| "loss": 1.9475433349609375, | |
| "mean_token_accuracy": 0.6528905872255564, | |
| "num_tokens": 10280223.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "entropy": 1.8696911588311196, | |
| "epoch": 2.990328820116054, | |
| "grad_norm": 0.626354992389679, | |
| "learning_rate": 0.00013044, | |
| "loss": 1.926706314086914, | |
| "mean_token_accuracy": 0.6482070714235306, | |
| "num_tokens": 10339813.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "entropy": 1.821205088844547, | |
| "epoch": 3.006877283473028, | |
| "grad_norm": 0.6353569030761719, | |
| "learning_rate": 0.00013004, | |
| "loss": 1.8657075881958007, | |
| "mean_token_accuracy": 0.6556356762136731, | |
| "num_tokens": 10398519.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 1.8890676617622375, | |
| "epoch": 3.0240704921555985, | |
| "grad_norm": 0.783729076385498, | |
| "learning_rate": 0.00012964, | |
| "loss": 1.9794137954711915, | |
| "mean_token_accuracy": 0.643126554042101, | |
| "num_tokens": 10456386.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "entropy": 1.8766882956027984, | |
| "epoch": 3.041263700838169, | |
| "grad_norm": 0.7075045108795166, | |
| "learning_rate": 0.00012924, | |
| "loss": 1.9388771057128906, | |
| "mean_token_accuracy": 0.6462941512465477, | |
| "num_tokens": 10516721.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "entropy": 1.7985384911298752, | |
| "epoch": 3.0584569095207392, | |
| "grad_norm": 0.7116262912750244, | |
| "learning_rate": 0.00012884, | |
| "loss": 1.8379974365234375, | |
| "mean_token_accuracy": 0.6582404263317585, | |
| "num_tokens": 10575553.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "entropy": 1.8475583091378212, | |
| "epoch": 3.07565011820331, | |
| "grad_norm": 0.69736248254776, | |
| "learning_rate": 0.00012844, | |
| "loss": 1.9197765350341798, | |
| "mean_token_accuracy": 0.6509403776377439, | |
| "num_tokens": 10632501.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "entropy": 1.8264927819371224, | |
| "epoch": 3.09284332688588, | |
| "grad_norm": 0.6354222297668457, | |
| "learning_rate": 0.00012804, | |
| "loss": 1.8965986251831055, | |
| "mean_token_accuracy": 0.6518782209604979, | |
| "num_tokens": 10693167.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 1.8696907818317414, | |
| "epoch": 3.1100365355684505, | |
| "grad_norm": 0.7568804621696472, | |
| "learning_rate": 0.00012764, | |
| "loss": 1.9332853317260743, | |
| "mean_token_accuracy": 0.6471077598631382, | |
| "num_tokens": 10753837.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "entropy": 1.886954003572464, | |
| "epoch": 3.1272297442510206, | |
| "grad_norm": 0.7069846391677856, | |
| "learning_rate": 0.00012724, | |
| "loss": 1.9263908386230468, | |
| "mean_token_accuracy": 0.6466126769781113, | |
| "num_tokens": 10815256.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "entropy": 1.8424360305070877, | |
| "epoch": 3.144422952933591, | |
| "grad_norm": 0.6524083614349365, | |
| "learning_rate": 0.00012684, | |
| "loss": 1.9088315963745117, | |
| "mean_token_accuracy": 0.6496367674320936, | |
| "num_tokens": 10877848.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "entropy": 1.8966794192790986, | |
| "epoch": 3.1616161616161618, | |
| "grad_norm": 0.687421977519989, | |
| "learning_rate": 0.00012644000000000002, | |
| "loss": 1.9748069763183593, | |
| "mean_token_accuracy": 0.6424707356840372, | |
| "num_tokens": 10938042.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "entropy": 1.81406429708004, | |
| "epoch": 3.178809370298732, | |
| "grad_norm": 0.7668496370315552, | |
| "learning_rate": 0.00012604, | |
| "loss": 1.8712465286254882, | |
| "mean_token_accuracy": 0.6571074567735196, | |
| "num_tokens": 10996204.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 1.8159340515732765, | |
| "epoch": 3.1960025789813025, | |
| "grad_norm": 0.7182545065879822, | |
| "learning_rate": 0.00012564, | |
| "loss": 1.830276107788086, | |
| "mean_token_accuracy": 0.6546356856822968, | |
| "num_tokens": 11056605.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "entropy": 1.9095668271183968, | |
| "epoch": 3.2131957876638726, | |
| "grad_norm": 0.7548812031745911, | |
| "learning_rate": 0.00012524000000000001, | |
| "loss": 1.998922348022461, | |
| "mean_token_accuracy": 0.6411306612193585, | |
| "num_tokens": 11116614.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "entropy": 1.8717206478118897, | |
| "epoch": 3.230388996346443, | |
| "grad_norm": 0.7692223191261292, | |
| "learning_rate": 0.00012484, | |
| "loss": 1.914438247680664, | |
| "mean_token_accuracy": 0.6441164951771498, | |
| "num_tokens": 11175802.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "entropy": 1.8943733513355254, | |
| "epoch": 3.2475822050290137, | |
| "grad_norm": 0.6439138650894165, | |
| "learning_rate": 0.00012444, | |
| "loss": 1.9280553817749024, | |
| "mean_token_accuracy": 0.6476396139711141, | |
| "num_tokens": 11236477.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "entropy": 1.8841392308473588, | |
| "epoch": 3.264775413711584, | |
| "grad_norm": 0.6971343159675598, | |
| "learning_rate": 0.00012404, | |
| "loss": 1.942568588256836, | |
| "mean_token_accuracy": 0.6398356795310974, | |
| "num_tokens": 11295146.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 1.8830088019371032, | |
| "epoch": 3.2819686223941544, | |
| "grad_norm": 0.7196023464202881, | |
| "learning_rate": 0.00012364, | |
| "loss": 1.963007354736328, | |
| "mean_token_accuracy": 0.6452915534377098, | |
| "num_tokens": 11355726.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "entropy": 1.927216087281704, | |
| "epoch": 3.2991618310767246, | |
| "grad_norm": 0.790634274482727, | |
| "learning_rate": 0.00012324, | |
| "loss": 2.0809165954589846, | |
| "mean_token_accuracy": 0.6384686015546321, | |
| "num_tokens": 11415237.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "entropy": 1.849087017774582, | |
| "epoch": 3.316355039759295, | |
| "grad_norm": 0.6752087473869324, | |
| "learning_rate": 0.00012284, | |
| "loss": 1.9017595291137694, | |
| "mean_token_accuracy": 0.6522149413824081, | |
| "num_tokens": 11476337.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "entropy": 1.8517325416207313, | |
| "epoch": 3.3335482484418657, | |
| "grad_norm": 0.8036973476409912, | |
| "learning_rate": 0.00012244, | |
| "loss": 1.9011222839355468, | |
| "mean_token_accuracy": 0.6499856971204281, | |
| "num_tokens": 11537529.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "entropy": 1.7622334837913514, | |
| "epoch": 3.350741457124436, | |
| "grad_norm": 0.7138587832450867, | |
| "learning_rate": 0.00012204, | |
| "loss": 1.7955827713012695, | |
| "mean_token_accuracy": 0.6596556272357702, | |
| "num_tokens": 11595421.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 1.8950866341590882, | |
| "epoch": 3.3679346658070064, | |
| "grad_norm": 0.6869714260101318, | |
| "learning_rate": 0.00012164, | |
| "loss": 1.948552131652832, | |
| "mean_token_accuracy": 0.6493024453520775, | |
| "num_tokens": 11655749.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "entropy": 1.9235218942165375, | |
| "epoch": 3.3851278744895765, | |
| "grad_norm": 0.656403124332428, | |
| "learning_rate": 0.00012124, | |
| "loss": 2.04327449798584, | |
| "mean_token_accuracy": 0.6389912366867065, | |
| "num_tokens": 11717271.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "entropy": 1.834906594455242, | |
| "epoch": 3.402321083172147, | |
| "grad_norm": 0.7343699932098389, | |
| "learning_rate": 0.00012084, | |
| "loss": 1.9038848876953125, | |
| "mean_token_accuracy": 0.6569048661738635, | |
| "num_tokens": 11778095.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "entropy": 1.8515655741095542, | |
| "epoch": 3.4195142918547172, | |
| "grad_norm": 0.7009745240211487, | |
| "learning_rate": 0.00012043999999999999, | |
| "loss": 1.9157728195190429, | |
| "mean_token_accuracy": 0.6512683361768723, | |
| "num_tokens": 11835954.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "entropy": 1.8634012743830681, | |
| "epoch": 3.436707500537288, | |
| "grad_norm": 0.6880552172660828, | |
| "learning_rate": 0.00012004, | |
| "loss": 1.9772762298583983, | |
| "mean_token_accuracy": 0.6531724959611893, | |
| "num_tokens": 11896615.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "entropy": 1.8952298507094383, | |
| "epoch": 3.453900709219858, | |
| "grad_norm": 0.7292787432670593, | |
| "learning_rate": 0.00011964, | |
| "loss": 1.9302806854248047, | |
| "mean_token_accuracy": 0.6462091594934464, | |
| "num_tokens": 11954949.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "entropy": 1.8723753660917282, | |
| "epoch": 3.4710939179024285, | |
| "grad_norm": 0.730530858039856, | |
| "learning_rate": 0.00011923999999999999, | |
| "loss": 1.9216194152832031, | |
| "mean_token_accuracy": 0.6504904717206955, | |
| "num_tokens": 12013803.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "entropy": 1.8673277243971824, | |
| "epoch": 3.488287126584999, | |
| "grad_norm": 0.7530126571655273, | |
| "learning_rate": 0.00011884, | |
| "loss": 1.968985366821289, | |
| "mean_token_accuracy": 0.646847129613161, | |
| "num_tokens": 12073284.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "entropy": 1.8757897645235062, | |
| "epoch": 3.505480335267569, | |
| "grad_norm": 0.7031217813491821, | |
| "learning_rate": 0.00011844, | |
| "loss": 1.9071741104125977, | |
| "mean_token_accuracy": 0.6450003884732723, | |
| "num_tokens": 12126451.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "entropy": 1.7986262783408165, | |
| "epoch": 3.5226735439501398, | |
| "grad_norm": 0.7223983407020569, | |
| "learning_rate": 0.00011804, | |
| "loss": 1.8450950622558593, | |
| "mean_token_accuracy": 0.6576410517096519, | |
| "num_tokens": 12183343.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "entropy": 1.8884935915470122, | |
| "epoch": 3.53986675263271, | |
| "grad_norm": 0.7206518650054932, | |
| "learning_rate": 0.00011763999999999999, | |
| "loss": 1.9660964965820313, | |
| "mean_token_accuracy": 0.6422303304076195, | |
| "num_tokens": 12243607.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "entropy": 1.8009026944637299, | |
| "epoch": 3.5570599613152805, | |
| "grad_norm": 0.7229637503623962, | |
| "learning_rate": 0.00011724000000000002, | |
| "loss": 1.851433563232422, | |
| "mean_token_accuracy": 0.6556052915751934, | |
| "num_tokens": 12304867.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "entropy": 1.7949693977832795, | |
| "epoch": 3.574253169997851, | |
| "grad_norm": 0.6935518383979797, | |
| "learning_rate": 0.00011684000000000001, | |
| "loss": 1.8848058700561523, | |
| "mean_token_accuracy": 0.6580755174160003, | |
| "num_tokens": 12367633.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "entropy": 1.8038981169462205, | |
| "epoch": 3.591446378680421, | |
| "grad_norm": 0.7003904581069946, | |
| "learning_rate": 0.00011644000000000002, | |
| "loss": 1.8867233276367188, | |
| "mean_token_accuracy": 0.655081395432353, | |
| "num_tokens": 12423928.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "entropy": 1.850062020123005, | |
| "epoch": 3.6086395873629917, | |
| "grad_norm": 0.6852926015853882, | |
| "learning_rate": 0.00011604000000000002, | |
| "loss": 1.9325201034545898, | |
| "mean_token_accuracy": 0.6472255479544401, | |
| "num_tokens": 12479411.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "entropy": 1.8294448778033257, | |
| "epoch": 3.625832796045562, | |
| "grad_norm": 0.7044693827629089, | |
| "learning_rate": 0.00011564000000000001, | |
| "loss": 1.8989273071289063, | |
| "mean_token_accuracy": 0.6499249216169118, | |
| "num_tokens": 12539175.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "entropy": 1.8719267755746842, | |
| "epoch": 3.6430260047281324, | |
| "grad_norm": 0.7180586457252502, | |
| "learning_rate": 0.00011524000000000001, | |
| "loss": 1.925216293334961, | |
| "mean_token_accuracy": 0.648950444161892, | |
| "num_tokens": 12598337.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "entropy": 1.88923449665308, | |
| "epoch": 3.660219213410703, | |
| "grad_norm": 0.7464597821235657, | |
| "learning_rate": 0.00011484000000000002, | |
| "loss": 1.990826416015625, | |
| "mean_token_accuracy": 0.6456409864127636, | |
| "num_tokens": 12656592.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "entropy": 1.8126205861568452, | |
| "epoch": 3.677412422093273, | |
| "grad_norm": 0.7253774404525757, | |
| "learning_rate": 0.00011444000000000001, | |
| "loss": 1.9414216995239257, | |
| "mean_token_accuracy": 0.6552157323807478, | |
| "num_tokens": 12717791.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "entropy": 1.8930377542972565, | |
| "epoch": 3.6946056307758437, | |
| "grad_norm": 0.7404170036315918, | |
| "learning_rate": 0.00011404000000000001, | |
| "loss": 1.9364784240722657, | |
| "mean_token_accuracy": 0.6434980578720569, | |
| "num_tokens": 12775445.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "entropy": 1.7652419656515121, | |
| "epoch": 3.711798839458414, | |
| "grad_norm": 0.688732385635376, | |
| "learning_rate": 0.00011364000000000002, | |
| "loss": 1.7636165618896484, | |
| "mean_token_accuracy": 0.6639453627169132, | |
| "num_tokens": 12834599.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "entropy": 1.7745767116546631, | |
| "epoch": 3.7289920481409844, | |
| "grad_norm": 0.7011992335319519, | |
| "learning_rate": 0.00011324000000000001, | |
| "loss": 1.8347841262817384, | |
| "mean_token_accuracy": 0.6586773280054331, | |
| "num_tokens": 12889887.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "entropy": 1.7952505484223367, | |
| "epoch": 3.746185256823555, | |
| "grad_norm": 0.7646785378456116, | |
| "learning_rate": 0.00011284000000000001, | |
| "loss": 1.883163070678711, | |
| "mean_token_accuracy": 0.6589437790215016, | |
| "num_tokens": 12950286.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "entropy": 1.8878965258598328, | |
| "epoch": 3.763378465506125, | |
| "grad_norm": 0.7722623944282532, | |
| "learning_rate": 0.00011244000000000001, | |
| "loss": 1.9674694061279296, | |
| "mean_token_accuracy": 0.6422343414276839, | |
| "num_tokens": 13011083.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "entropy": 1.919720321893692, | |
| "epoch": 3.780571674188695, | |
| "grad_norm": 0.7656893134117126, | |
| "learning_rate": 0.00011204000000000002, | |
| "loss": 1.9919773101806642, | |
| "mean_token_accuracy": 0.6393908958882093, | |
| "num_tokens": 13069376.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "entropy": 1.77825688123703, | |
| "epoch": 3.797764882871266, | |
| "grad_norm": 0.8324808478355408, | |
| "learning_rate": 0.00011164000000000001, | |
| "loss": 1.8173160552978516, | |
| "mean_token_accuracy": 0.659475727379322, | |
| "num_tokens": 13124851.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "entropy": 1.8232837438583374, | |
| "epoch": 3.8149580915538364, | |
| "grad_norm": 0.741481363773346, | |
| "learning_rate": 0.00011124000000000001, | |
| "loss": 1.860748291015625, | |
| "mean_token_accuracy": 0.6524971850216389, | |
| "num_tokens": 13182576.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "entropy": 1.8588940657675266, | |
| "epoch": 3.8321513002364065, | |
| "grad_norm": 0.7748705148696899, | |
| "learning_rate": 0.00011084000000000002, | |
| "loss": 1.9206954956054687, | |
| "mean_token_accuracy": 0.6516353718936443, | |
| "num_tokens": 13242703.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "entropy": 1.823398308455944, | |
| "epoch": 3.849344508918977, | |
| "grad_norm": 0.6341049671173096, | |
| "learning_rate": 0.00011044, | |
| "loss": 1.8718917846679688, | |
| "mean_token_accuracy": 0.6596139155328273, | |
| "num_tokens": 13303181.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "entropy": 1.8098929420113563, | |
| "epoch": 3.866537717601547, | |
| "grad_norm": 0.6672969460487366, | |
| "learning_rate": 0.00011004000000000001, | |
| "loss": 1.8999752044677733, | |
| "mean_token_accuracy": 0.6594760783016682, | |
| "num_tokens": 13364371.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "entropy": 1.7795367375016213, | |
| "epoch": 3.8837309262841178, | |
| "grad_norm": 0.6343891024589539, | |
| "learning_rate": 0.00010964000000000001, | |
| "loss": 1.827276611328125, | |
| "mean_token_accuracy": 0.6668465688824654, | |
| "num_tokens": 13425450.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "entropy": 1.8673226684331894, | |
| "epoch": 3.9009241349666883, | |
| "grad_norm": 0.7357877492904663, | |
| "learning_rate": 0.00010924, | |
| "loss": 1.9206443786621095, | |
| "mean_token_accuracy": 0.647479448094964, | |
| "num_tokens": 13485806.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "entropy": 1.806484942883253, | |
| "epoch": 3.9181173436492585, | |
| "grad_norm": 0.7172144055366516, | |
| "learning_rate": 0.00010884000000000001, | |
| "loss": 1.8789045333862304, | |
| "mean_token_accuracy": 0.6594084780663252, | |
| "num_tokens": 13544934.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "entropy": 1.7970930591225625, | |
| "epoch": 3.935310552331829, | |
| "grad_norm": 0.7578801512718201, | |
| "learning_rate": 0.00010844000000000001, | |
| "loss": 1.8405040740966796, | |
| "mean_token_accuracy": 0.6608923889696598, | |
| "num_tokens": 13606653.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "entropy": 1.8469372361898422, | |
| "epoch": 3.952503761014399, | |
| "grad_norm": 0.7626324892044067, | |
| "learning_rate": 0.00010804, | |
| "loss": 1.8629837036132812, | |
| "mean_token_accuracy": 0.6560039456933737, | |
| "num_tokens": 13663938.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "entropy": 1.836122378706932, | |
| "epoch": 3.9696969696969697, | |
| "grad_norm": 0.7074365615844727, | |
| "learning_rate": 0.00010764, | |
| "loss": 1.8942070007324219, | |
| "mean_token_accuracy": 0.647238065674901, | |
| "num_tokens": 13722549.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "entropy": 1.821449062973261, | |
| "epoch": 3.9868901783795403, | |
| "grad_norm": 0.6956577301025391, | |
| "learning_rate": 0.00010724000000000001, | |
| "loss": 1.8947336196899414, | |
| "mean_token_accuracy": 0.6528103355318308, | |
| "num_tokens": 13785922.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "entropy": 1.839719023023333, | |
| "epoch": 4.003438641736514, | |
| "grad_norm": 0.6865222454071045, | |
| "learning_rate": 0.00010684, | |
| "loss": 1.8803377151489258, | |
| "mean_token_accuracy": 0.6526942384707464, | |
| "num_tokens": 13844647.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "entropy": 1.855065654218197, | |
| "epoch": 4.020631850419084, | |
| "grad_norm": 0.7424384355545044, | |
| "learning_rate": 0.00010644, | |
| "loss": 1.9461166381835937, | |
| "mean_token_accuracy": 0.6463506512343884, | |
| "num_tokens": 13904724.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "entropy": 1.7508789122104644, | |
| "epoch": 4.037825059101655, | |
| "grad_norm": 0.6670609712600708, | |
| "learning_rate": 0.00010604000000000001, | |
| "loss": 1.781893539428711, | |
| "mean_token_accuracy": 0.6653038747608662, | |
| "num_tokens": 13963472.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "entropy": 1.8165026590228082, | |
| "epoch": 4.0550182677842255, | |
| "grad_norm": 0.7823750376701355, | |
| "learning_rate": 0.00010564000000000001, | |
| "loss": 1.8847312927246094, | |
| "mean_token_accuracy": 0.6607359856367111, | |
| "num_tokens": 14019708.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "entropy": 1.794335062801838, | |
| "epoch": 4.072211476466796, | |
| "grad_norm": 0.8262340426445007, | |
| "learning_rate": 0.00010524, | |
| "loss": 1.8576740264892577, | |
| "mean_token_accuracy": 0.6582343481481076, | |
| "num_tokens": 14076178.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "entropy": 1.8828865155577659, | |
| "epoch": 4.089404685149366, | |
| "grad_norm": 0.784656822681427, | |
| "learning_rate": 0.00010484, | |
| "loss": 1.9146394729614258, | |
| "mean_token_accuracy": 0.6491621173918247, | |
| "num_tokens": 14133662.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "entropy": 1.918326808512211, | |
| "epoch": 4.106597893831936, | |
| "grad_norm": 0.7571077346801758, | |
| "learning_rate": 0.00010444000000000001, | |
| "loss": 2.024713897705078, | |
| "mean_token_accuracy": 0.643079025298357, | |
| "num_tokens": 14196967.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "entropy": 1.7909317679703236, | |
| "epoch": 4.123791102514507, | |
| "grad_norm": 0.7276471257209778, | |
| "learning_rate": 0.00010404, | |
| "loss": 1.845133399963379, | |
| "mean_token_accuracy": 0.6548417568206787, | |
| "num_tokens": 14256866.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "entropy": 1.7750686906278133, | |
| "epoch": 4.140984311197077, | |
| "grad_norm": 0.668246328830719, | |
| "learning_rate": 0.00010364, | |
| "loss": 1.7945009231567384, | |
| "mean_token_accuracy": 0.6641525950282812, | |
| "num_tokens": 14318324.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "entropy": 1.823828212916851, | |
| "epoch": 4.158177519879647, | |
| "grad_norm": 0.7596518993377686, | |
| "learning_rate": 0.00010324000000000001, | |
| "loss": 1.898871612548828, | |
| "mean_token_accuracy": 0.6519910141825676, | |
| "num_tokens": 14380775.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "entropy": 1.7938876405358315, | |
| "epoch": 4.175370728562218, | |
| "grad_norm": 0.6834619641304016, | |
| "learning_rate": 0.00010284, | |
| "loss": 1.8518138885498048, | |
| "mean_token_accuracy": 0.6622516691684723, | |
| "num_tokens": 14440862.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "entropy": 1.8744625180959702, | |
| "epoch": 4.192563937244788, | |
| "grad_norm": 0.8088146448135376, | |
| "learning_rate": 0.00010244, | |
| "loss": 1.9542848587036132, | |
| "mean_token_accuracy": 0.6499028638005256, | |
| "num_tokens": 14500841.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "entropy": 1.8284114554524422, | |
| "epoch": 4.209757145927359, | |
| "grad_norm": 0.82193523645401, | |
| "learning_rate": 0.00010204, | |
| "loss": 1.9107404708862306, | |
| "mean_token_accuracy": 0.6551219135522842, | |
| "num_tokens": 14564257.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "entropy": 1.8538024842739105, | |
| "epoch": 4.226950354609929, | |
| "grad_norm": 0.7263757586479187, | |
| "learning_rate": 0.00010164, | |
| "loss": 1.8713863372802735, | |
| "mean_token_accuracy": 0.6510257624089718, | |
| "num_tokens": 14623019.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "entropy": 1.756752038002014, | |
| "epoch": 4.244143563292499, | |
| "grad_norm": 0.7334346175193787, | |
| "learning_rate": 0.00010124, | |
| "loss": 1.7855047225952148, | |
| "mean_token_accuracy": 0.6687729060649872, | |
| "num_tokens": 14682191.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "entropy": 1.7032470375299453, | |
| "epoch": 4.26133677197507, | |
| "grad_norm": 0.7168938517570496, | |
| "learning_rate": 0.00010084, | |
| "loss": 1.7648530960083009, | |
| "mean_token_accuracy": 0.6696467150002718, | |
| "num_tokens": 14739840.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "entropy": 1.7426577515900135, | |
| "epoch": 4.27852998065764, | |
| "grad_norm": 0.7091065645217896, | |
| "learning_rate": 0.00010044000000000001, | |
| "loss": 1.8180946350097655, | |
| "mean_token_accuracy": 0.6640235505998134, | |
| "num_tokens": 14798444.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "entropy": 1.8743537411093711, | |
| "epoch": 4.295723189340211, | |
| "grad_norm": 0.6376718878746033, | |
| "learning_rate": 0.00010004, | |
| "loss": 1.9534942626953125, | |
| "mean_token_accuracy": 0.6467559643089771, | |
| "num_tokens": 14861262.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "entropy": 1.8234948687255383, | |
| "epoch": 4.312916398022781, | |
| "grad_norm": 0.778538167476654, | |
| "learning_rate": 9.964e-05, | |
| "loss": 1.8733020782470704, | |
| "mean_token_accuracy": 0.6553889319300652, | |
| "num_tokens": 14920923.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "entropy": 1.812998068332672, | |
| "epoch": 4.330109606705351, | |
| "grad_norm": 0.7861834764480591, | |
| "learning_rate": 9.924e-05, | |
| "loss": 1.8699317932128907, | |
| "mean_token_accuracy": 0.6555795632302761, | |
| "num_tokens": 14978173.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "entropy": 1.8013822883367538, | |
| "epoch": 4.347302815387922, | |
| "grad_norm": 0.751916229724884, | |
| "learning_rate": 9.884e-05, | |
| "loss": 1.8372121810913087, | |
| "mean_token_accuracy": 0.664341426640749, | |
| "num_tokens": 15034480.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "entropy": 1.7700918450951577, | |
| "epoch": 4.364496024070492, | |
| "grad_norm": 0.7365695834159851, | |
| "learning_rate": 9.844000000000001e-05, | |
| "loss": 1.8166645050048829, | |
| "mean_token_accuracy": 0.6654425717890262, | |
| "num_tokens": 15093226.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "entropy": 1.7808674454689026, | |
| "epoch": 4.381689232753063, | |
| "grad_norm": 0.7306393980979919, | |
| "learning_rate": 9.804e-05, | |
| "loss": 1.8363780975341797, | |
| "mean_token_accuracy": 0.6601886965334416, | |
| "num_tokens": 15149937.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "entropy": 1.7890540674328803, | |
| "epoch": 4.398882441435633, | |
| "grad_norm": 0.7466715574264526, | |
| "learning_rate": 9.764000000000001e-05, | |
| "loss": 1.847653579711914, | |
| "mean_token_accuracy": 0.6586611110717058, | |
| "num_tokens": 15210500.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "entropy": 1.7866264268755914, | |
| "epoch": 4.416075650118203, | |
| "grad_norm": 0.7825273871421814, | |
| "learning_rate": 9.724000000000001e-05, | |
| "loss": 1.82576904296875, | |
| "mean_token_accuracy": 0.6592508733272553, | |
| "num_tokens": 15268262.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "entropy": 1.8321722269058227, | |
| "epoch": 4.433268858800774, | |
| "grad_norm": 0.7158058285713196, | |
| "learning_rate": 9.684000000000001e-05, | |
| "loss": 1.8807327270507812, | |
| "mean_token_accuracy": 0.6545467376708984, | |
| "num_tokens": 15330745.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "entropy": 1.739266212284565, | |
| "epoch": 4.450462067483344, | |
| "grad_norm": 0.7281847596168518, | |
| "learning_rate": 9.644e-05, | |
| "loss": 1.7686588287353515, | |
| "mean_token_accuracy": 0.6666045777499676, | |
| "num_tokens": 15391266.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "entropy": 1.8295569285750388, | |
| "epoch": 4.467655276165915, | |
| "grad_norm": 0.7166727781295776, | |
| "learning_rate": 9.604000000000001e-05, | |
| "loss": 1.9156217575073242, | |
| "mean_token_accuracy": 0.655017600953579, | |
| "num_tokens": 15449819.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "entropy": 1.8236071288585662, | |
| "epoch": 4.484848484848484, | |
| "grad_norm": 0.6946532726287842, | |
| "learning_rate": 9.564000000000001e-05, | |
| "loss": 1.9035514831542968, | |
| "mean_token_accuracy": 0.649907086789608, | |
| "num_tokens": 15513231.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "entropy": 1.7869442969560623, | |
| "epoch": 4.502041693531055, | |
| "grad_norm": 0.7257023453712463, | |
| "learning_rate": 9.524e-05, | |
| "loss": 1.841336441040039, | |
| "mean_token_accuracy": 0.6655759517103433, | |
| "num_tokens": 15568973.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "entropy": 1.7462848544120788, | |
| "epoch": 4.519234902213626, | |
| "grad_norm": 0.7239391803741455, | |
| "learning_rate": 9.484e-05, | |
| "loss": 1.7989360809326171, | |
| "mean_token_accuracy": 0.6646886244416237, | |
| "num_tokens": 15627655.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "entropy": 1.7926493644714356, | |
| "epoch": 4.536428110896196, | |
| "grad_norm": 0.7628325819969177, | |
| "learning_rate": 9.444000000000001e-05, | |
| "loss": 1.8627632141113282, | |
| "mean_token_accuracy": 0.654141866415739, | |
| "num_tokens": 15687626.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "entropy": 1.7928333327174186, | |
| "epoch": 4.553621319578767, | |
| "grad_norm": 0.629107654094696, | |
| "learning_rate": 9.404e-05, | |
| "loss": 1.8784042358398438, | |
| "mean_token_accuracy": 0.6618591919541359, | |
| "num_tokens": 15750035.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "entropy": 1.7438783437013625, | |
| "epoch": 4.570814528261336, | |
| "grad_norm": 0.6948845982551575, | |
| "learning_rate": 9.364e-05, | |
| "loss": 1.7456579208374023, | |
| "mean_token_accuracy": 0.6722261719405651, | |
| "num_tokens": 15809533.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "entropy": 1.7451874181628226, | |
| "epoch": 4.588007736943907, | |
| "grad_norm": 0.7213107943534851, | |
| "learning_rate": 9.324000000000001e-05, | |
| "loss": 1.8111917495727539, | |
| "mean_token_accuracy": 0.6621977139264346, | |
| "num_tokens": 15866570.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "entropy": 1.806991095095873, | |
| "epoch": 4.6052009456264775, | |
| "grad_norm": 0.9146936535835266, | |
| "learning_rate": 9.284e-05, | |
| "loss": 1.8761199951171874, | |
| "mean_token_accuracy": 0.6552402298897505, | |
| "num_tokens": 15923681.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "entropy": 1.854476225376129, | |
| "epoch": 4.622394154309048, | |
| "grad_norm": 0.675061047077179, | |
| "learning_rate": 9.244e-05, | |
| "loss": 1.8601364135742187, | |
| "mean_token_accuracy": 0.656403211131692, | |
| "num_tokens": 15979879.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "entropy": 1.8345128282904626, | |
| "epoch": 4.639587362991619, | |
| "grad_norm": 0.7702699303627014, | |
| "learning_rate": 9.204e-05, | |
| "loss": 1.9170707702636718, | |
| "mean_token_accuracy": 0.6507652081549168, | |
| "num_tokens": 16040136.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "entropy": 1.8444690719246863, | |
| "epoch": 4.656780571674188, | |
| "grad_norm": 0.7249677181243896, | |
| "learning_rate": 9.164000000000001e-05, | |
| "loss": 1.9021928787231446, | |
| "mean_token_accuracy": 0.6553504541516304, | |
| "num_tokens": 16097652.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "entropy": 1.8083212688565253, | |
| "epoch": 4.673973780356759, | |
| "grad_norm": 0.7018275260925293, | |
| "learning_rate": 9.124e-05, | |
| "loss": 1.87921199798584, | |
| "mean_token_accuracy": 0.6609590038657188, | |
| "num_tokens": 16159014.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "entropy": 1.793540646135807, | |
| "epoch": 4.6911669890393295, | |
| "grad_norm": 0.731863796710968, | |
| "learning_rate": 9.084e-05, | |
| "loss": 1.847224807739258, | |
| "mean_token_accuracy": 0.6638176888227463, | |
| "num_tokens": 16223636.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "entropy": 1.7947301134467124, | |
| "epoch": 4.7083601977219, | |
| "grad_norm": 0.7208489775657654, | |
| "learning_rate": 9.044000000000001e-05, | |
| "loss": 1.8400375366210937, | |
| "mean_token_accuracy": 0.6600434482097626, | |
| "num_tokens": 16281647.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "entropy": 1.8043948471546174, | |
| "epoch": 4.725553406404471, | |
| "grad_norm": 0.7633848190307617, | |
| "learning_rate": 9.004e-05, | |
| "loss": 1.8509382247924804, | |
| "mean_token_accuracy": 0.6632162068039179, | |
| "num_tokens": 16340706.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "entropy": 1.8240734949707984, | |
| "epoch": 4.74274661508704, | |
| "grad_norm": 0.7516812086105347, | |
| "learning_rate": 8.964e-05, | |
| "loss": 1.9139686584472657, | |
| "mean_token_accuracy": 0.6504824224859476, | |
| "num_tokens": 16398077.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "entropy": 1.7775158017873764, | |
| "epoch": 4.759939823769611, | |
| "grad_norm": 0.7677133679389954, | |
| "learning_rate": 8.924e-05, | |
| "loss": 1.8351661682128906, | |
| "mean_token_accuracy": 0.6568478621542454, | |
| "num_tokens": 16458898.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "entropy": 1.8671277523040772, | |
| "epoch": 4.7771330324521815, | |
| "grad_norm": 0.750451385974884, | |
| "learning_rate": 8.884e-05, | |
| "loss": 1.9589305877685548, | |
| "mean_token_accuracy": 0.6506143860518933, | |
| "num_tokens": 16519496.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "entropy": 1.7745324671268463, | |
| "epoch": 4.794326241134752, | |
| "grad_norm": 0.8302338719367981, | |
| "learning_rate": 8.844e-05, | |
| "loss": 1.8637496948242187, | |
| "mean_token_accuracy": 0.6621543657034635, | |
| "num_tokens": 16579080.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "entropy": 1.73246541172266, | |
| "epoch": 4.811519449817322, | |
| "grad_norm": 0.778176486492157, | |
| "learning_rate": 8.804e-05, | |
| "loss": 1.752696418762207, | |
| "mean_token_accuracy": 0.6727286443114281, | |
| "num_tokens": 16640932.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "entropy": 1.8060437709093093, | |
| "epoch": 4.828712658499892, | |
| "grad_norm": 0.9019444584846497, | |
| "learning_rate": 8.764e-05, | |
| "loss": 1.9031681060791015, | |
| "mean_token_accuracy": 0.6563040159642697, | |
| "num_tokens": 16702244.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "entropy": 1.8732322439551354, | |
| "epoch": 4.845905867182463, | |
| "grad_norm": 0.7397829294204712, | |
| "learning_rate": 8.724e-05, | |
| "loss": 1.9326038360595703, | |
| "mean_token_accuracy": 0.6478111572563648, | |
| "num_tokens": 16764555.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "entropy": 1.842681024968624, | |
| "epoch": 4.863099075865033, | |
| "grad_norm": 0.8511717915534973, | |
| "learning_rate": 8.684e-05, | |
| "loss": 1.9107376098632813, | |
| "mean_token_accuracy": 0.6531910292804242, | |
| "num_tokens": 16821936.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "entropy": 1.7571960732340812, | |
| "epoch": 4.880292284547604, | |
| "grad_norm": 0.7064304947853088, | |
| "learning_rate": 8.643999999999999e-05, | |
| "loss": 1.7985404968261718, | |
| "mean_token_accuracy": 0.6667480751872062, | |
| "num_tokens": 16882205.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "entropy": 1.8695308573544025, | |
| "epoch": 4.897485493230175, | |
| "grad_norm": 0.7386742234230042, | |
| "learning_rate": 8.604000000000001e-05, | |
| "loss": 1.9543342590332031, | |
| "mean_token_accuracy": 0.6496741093695164, | |
| "num_tokens": 16939799.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "entropy": 1.7877972453832627, | |
| "epoch": 4.914678701912744, | |
| "grad_norm": 0.7687976956367493, | |
| "learning_rate": 8.564000000000001e-05, | |
| "loss": 1.7994373321533204, | |
| "mean_token_accuracy": 0.6637697361409665, | |
| "num_tokens": 16997716.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "entropy": 1.761916320025921, | |
| "epoch": 4.931871910595315, | |
| "grad_norm": 0.7507193088531494, | |
| "learning_rate": 8.524e-05, | |
| "loss": 1.788670539855957, | |
| "mean_token_accuracy": 0.6648910716176033, | |
| "num_tokens": 17057260.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "entropy": 1.804823537170887, | |
| "epoch": 4.949065119277885, | |
| "grad_norm": 0.727188229560852, | |
| "learning_rate": 8.484000000000001e-05, | |
| "loss": 1.855522346496582, | |
| "mean_token_accuracy": 0.657912939786911, | |
| "num_tokens": 17116073.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "entropy": 1.8259041801095008, | |
| "epoch": 4.966258327960456, | |
| "grad_norm": 0.7195336818695068, | |
| "learning_rate": 8.444000000000001e-05, | |
| "loss": 1.8942272186279296, | |
| "mean_token_accuracy": 0.6546841934323311, | |
| "num_tokens": 17174141.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "entropy": 1.7153871595859527, | |
| "epoch": 4.983451536643026, | |
| "grad_norm": 0.7093940377235413, | |
| "learning_rate": 8.404e-05, | |
| "loss": 1.7350996017456055, | |
| "mean_token_accuracy": 0.6728265054523945, | |
| "num_tokens": 17233307.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "entropy": 1.7630670566063422, | |
| "epoch": 5.0, | |
| "grad_norm": 0.979345440864563, | |
| "learning_rate": 8.364e-05, | |
| "loss": 1.8098876953125, | |
| "mean_token_accuracy": 0.6604567510741097, | |
| "num_tokens": 17289810.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "entropy": 1.8877688512206077, | |
| "epoch": 5.017193208682571, | |
| "grad_norm": 0.8140257596969604, | |
| "learning_rate": 8.324000000000001e-05, | |
| "loss": 1.9562681198120118, | |
| "mean_token_accuracy": 0.6476880256086588, | |
| "num_tokens": 17349922.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "entropy": 1.6694072388112544, | |
| "epoch": 5.034386417365141, | |
| "grad_norm": 0.7486578226089478, | |
| "learning_rate": 8.284000000000001e-05, | |
| "loss": 1.71788330078125, | |
| "mean_token_accuracy": 0.6781885512173176, | |
| "num_tokens": 17409363.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "entropy": 1.8061093628406524, | |
| "epoch": 5.051579626047711, | |
| "grad_norm": 0.8148984313011169, | |
| "learning_rate": 8.244e-05, | |
| "loss": 1.8484228134155274, | |
| "mean_token_accuracy": 0.6591597832739353, | |
| "num_tokens": 17468218.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "entropy": 1.7561381176114081, | |
| "epoch": 5.068772834730281, | |
| "grad_norm": 0.7412339448928833, | |
| "learning_rate": 8.204000000000001e-05, | |
| "loss": 1.8109855651855469, | |
| "mean_token_accuracy": 0.6648329850286245, | |
| "num_tokens": 17529603.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "entropy": 1.7058369636535644, | |
| "epoch": 5.085966043412852, | |
| "grad_norm": 0.7845883369445801, | |
| "learning_rate": 8.164000000000001e-05, | |
| "loss": 1.7577402114868164, | |
| "mean_token_accuracy": 0.675883399322629, | |
| "num_tokens": 17587275.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "entropy": 1.7319279327988624, | |
| "epoch": 5.1031592520954225, | |
| "grad_norm": 0.7546029090881348, | |
| "learning_rate": 8.124e-05, | |
| "loss": 1.8096488952636718, | |
| "mean_token_accuracy": 0.668717809766531, | |
| "num_tokens": 17647368.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "entropy": 1.7872621923685075, | |
| "epoch": 5.120352460777993, | |
| "grad_norm": 0.7214957475662231, | |
| "learning_rate": 8.084e-05, | |
| "loss": 1.7827239990234376, | |
| "mean_token_accuracy": 0.663322826102376, | |
| "num_tokens": 17708210.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "entropy": 1.7479579642415046, | |
| "epoch": 5.137545669460563, | |
| "grad_norm": 0.6938044428825378, | |
| "learning_rate": 8.044000000000001e-05, | |
| "loss": 1.837489700317383, | |
| "mean_token_accuracy": 0.666904554143548, | |
| "num_tokens": 17770498.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "entropy": 1.760008592903614, | |
| "epoch": 5.154738878143133, | |
| "grad_norm": 0.7440096139907837, | |
| "learning_rate": 8.004e-05, | |
| "loss": 1.7957250595092773, | |
| "mean_token_accuracy": 0.6704145818948746, | |
| "num_tokens": 17831493.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "entropy": 1.7866925299167633, | |
| "epoch": 5.171932086825704, | |
| "grad_norm": 0.775793731212616, | |
| "learning_rate": 7.964e-05, | |
| "loss": 1.8513370513916017, | |
| "mean_token_accuracy": 0.6593568369746208, | |
| "num_tokens": 17893338.0, | |
| "step": 3010 | |
| }, | |
| { | |
| "entropy": 1.720614206790924, | |
| "epoch": 5.1891252955082745, | |
| "grad_norm": 0.7855071425437927, | |
| "learning_rate": 7.924000000000001e-05, | |
| "loss": 1.7529998779296876, | |
| "mean_token_accuracy": 0.6738685265183448, | |
| "num_tokens": 17949102.0, | |
| "step": 3020 | |
| }, | |
| { | |
| "entropy": 1.8130397230386734, | |
| "epoch": 5.206318504190844, | |
| "grad_norm": 0.7261347770690918, | |
| "learning_rate": 7.884e-05, | |
| "loss": 1.8530288696289063, | |
| "mean_token_accuracy": 0.6672368694096804, | |
| "num_tokens": 18011291.0, | |
| "step": 3030 | |
| }, | |
| { | |
| "entropy": 1.7442916065454483, | |
| "epoch": 5.223511712873415, | |
| "grad_norm": 0.7350125908851624, | |
| "learning_rate": 7.844e-05, | |
| "loss": 1.8185455322265625, | |
| "mean_token_accuracy": 0.6648106183856726, | |
| "num_tokens": 18069969.0, | |
| "step": 3040 | |
| }, | |
| { | |
| "entropy": 1.8336029559373856, | |
| "epoch": 5.240704921555985, | |
| "grad_norm": 0.9380921125411987, | |
| "learning_rate": 7.804e-05, | |
| "loss": 1.8585586547851562, | |
| "mean_token_accuracy": 0.6559876747429371, | |
| "num_tokens": 18126839.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "entropy": 1.831410789489746, | |
| "epoch": 5.257898130238556, | |
| "grad_norm": 0.7422699928283691, | |
| "learning_rate": 7.764e-05, | |
| "loss": 1.8669567108154297, | |
| "mean_token_accuracy": 0.660079612582922, | |
| "num_tokens": 18189169.0, | |
| "step": 3060 | |
| }, | |
| { | |
| "entropy": 1.7772829428315162, | |
| "epoch": 5.2750913389211265, | |
| "grad_norm": 0.80905681848526, | |
| "learning_rate": 7.724e-05, | |
| "loss": 1.8516859054565429, | |
| "mean_token_accuracy": 0.6614492174237967, | |
| "num_tokens": 18247092.0, | |
| "step": 3070 | |
| }, | |
| { | |
| "entropy": 1.8644750490784645, | |
| "epoch": 5.292284547603696, | |
| "grad_norm": 0.7613252997398376, | |
| "learning_rate": 7.684e-05, | |
| "loss": 1.8771135330200195, | |
| "mean_token_accuracy": 0.6572393793612719, | |
| "num_tokens": 18305337.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "entropy": 1.7888765200972556, | |
| "epoch": 5.309477756286267, | |
| "grad_norm": 0.8216497302055359, | |
| "learning_rate": 7.644e-05, | |
| "loss": 1.877157211303711, | |
| "mean_token_accuracy": 0.6567147132009268, | |
| "num_tokens": 18366693.0, | |
| "step": 3090 | |
| }, | |
| { | |
| "entropy": 1.8046741798520087, | |
| "epoch": 5.326670964968837, | |
| "grad_norm": 0.7475964426994324, | |
| "learning_rate": 7.604e-05, | |
| "loss": 1.8193256378173828, | |
| "mean_token_accuracy": 0.6620738692581654, | |
| "num_tokens": 18424592.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "entropy": 1.7556863978505135, | |
| "epoch": 5.343864173651408, | |
| "grad_norm": 0.7376730442047119, | |
| "learning_rate": 7.564e-05, | |
| "loss": 1.8117481231689454, | |
| "mean_token_accuracy": 0.6669185206294059, | |
| "num_tokens": 18480554.0, | |
| "step": 3110 | |
| }, | |
| { | |
| "entropy": 1.680773164331913, | |
| "epoch": 5.361057382333978, | |
| "grad_norm": 0.8276366591453552, | |
| "learning_rate": 7.524e-05, | |
| "loss": 1.7203754425048827, | |
| "mean_token_accuracy": 0.6801572386175394, | |
| "num_tokens": 18533859.0, | |
| "step": 3120 | |
| }, | |
| { | |
| "entropy": 1.7824992030858993, | |
| "epoch": 5.378250591016548, | |
| "grad_norm": 0.7689419984817505, | |
| "learning_rate": 7.484e-05, | |
| "loss": 1.7929088592529296, | |
| "mean_token_accuracy": 0.6625824831426144, | |
| "num_tokens": 18591432.0, | |
| "step": 3130 | |
| }, | |
| { | |
| "entropy": 1.7943954214453697, | |
| "epoch": 5.395443799699119, | |
| "grad_norm": 0.7818305492401123, | |
| "learning_rate": 7.444e-05, | |
| "loss": 1.9027202606201172, | |
| "mean_token_accuracy": 0.6574487689882517, | |
| "num_tokens": 18653780.0, | |
| "step": 3140 | |
| }, | |
| { | |
| "entropy": 1.6858137652277947, | |
| "epoch": 5.412637008381689, | |
| "grad_norm": 0.7783890962600708, | |
| "learning_rate": 7.404e-05, | |
| "loss": 1.6952400207519531, | |
| "mean_token_accuracy": 0.681446236371994, | |
| "num_tokens": 18712089.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "entropy": 1.781475655734539, | |
| "epoch": 5.42983021706426, | |
| "grad_norm": 0.8033313751220703, | |
| "learning_rate": 7.364e-05, | |
| "loss": 1.8575824737548827, | |
| "mean_token_accuracy": 0.6616954285651445, | |
| "num_tokens": 18770760.0, | |
| "step": 3160 | |
| }, | |
| { | |
| "entropy": 1.8044284671545028, | |
| "epoch": 5.44702342574683, | |
| "grad_norm": 0.8778691291809082, | |
| "learning_rate": 7.324000000000001e-05, | |
| "loss": 1.868129348754883, | |
| "mean_token_accuracy": 0.6609551507979632, | |
| "num_tokens": 18830053.0, | |
| "step": 3170 | |
| }, | |
| { | |
| "entropy": 1.7783053085207938, | |
| "epoch": 5.4642166344294, | |
| "grad_norm": 0.794116735458374, | |
| "learning_rate": 7.284000000000001e-05, | |
| "loss": 1.8013723373413086, | |
| "mean_token_accuracy": 0.6687252540141344, | |
| "num_tokens": 18889827.0, | |
| "step": 3180 | |
| }, | |
| { | |
| "entropy": 1.7062184020876885, | |
| "epoch": 5.481409843111971, | |
| "grad_norm": 0.7348354458808899, | |
| "learning_rate": 7.244e-05, | |
| "loss": 1.7668045043945313, | |
| "mean_token_accuracy": 0.6689080417156219, | |
| "num_tokens": 18948434.0, | |
| "step": 3190 | |
| }, | |
| { | |
| "entropy": 1.7791135892271996, | |
| "epoch": 5.498603051794541, | |
| "grad_norm": 0.7285153865814209, | |
| "learning_rate": 7.204000000000001e-05, | |
| "loss": 1.8126005172729491, | |
| "mean_token_accuracy": 0.6612196549773216, | |
| "num_tokens": 19009735.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "entropy": 1.7333651915192605, | |
| "epoch": 5.515796260477112, | |
| "grad_norm": 0.768817126750946, | |
| "learning_rate": 7.164000000000001e-05, | |
| "loss": 1.7593820571899415, | |
| "mean_token_accuracy": 0.6708316601812839, | |
| "num_tokens": 19070839.0, | |
| "step": 3210 | |
| }, | |
| { | |
| "entropy": 1.7646871596574782, | |
| "epoch": 5.532989469159682, | |
| "grad_norm": 0.7897234559059143, | |
| "learning_rate": 7.124e-05, | |
| "loss": 1.8217975616455078, | |
| "mean_token_accuracy": 0.6624716755002737, | |
| "num_tokens": 19131701.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "entropy": 1.79796422123909, | |
| "epoch": 5.550182677842252, | |
| "grad_norm": 0.7753779292106628, | |
| "learning_rate": 7.084e-05, | |
| "loss": 1.8724674224853515, | |
| "mean_token_accuracy": 0.6603185098618269, | |
| "num_tokens": 19192206.0, | |
| "step": 3230 | |
| }, | |
| { | |
| "entropy": 1.6741029411554336, | |
| "epoch": 5.567375886524823, | |
| "grad_norm": 0.7559053301811218, | |
| "learning_rate": 7.044000000000001e-05, | |
| "loss": 1.7434120178222656, | |
| "mean_token_accuracy": 0.6718135714530945, | |
| "num_tokens": 19250887.0, | |
| "step": 3240 | |
| }, | |
| { | |
| "entropy": 1.8032452374696732, | |
| "epoch": 5.584569095207393, | |
| "grad_norm": 0.8887183666229248, | |
| "learning_rate": 7.004e-05, | |
| "loss": 1.8439495086669921, | |
| "mean_token_accuracy": 0.6600725017488003, | |
| "num_tokens": 19310066.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "entropy": 1.7758998274803162, | |
| "epoch": 5.601762303889964, | |
| "grad_norm": 0.7295767068862915, | |
| "learning_rate": 6.964e-05, | |
| "loss": 1.8148815155029296, | |
| "mean_token_accuracy": 0.6652825616300106, | |
| "num_tokens": 19369366.0, | |
| "step": 3260 | |
| }, | |
| { | |
| "entropy": 1.662617878615856, | |
| "epoch": 5.6189555125725335, | |
| "grad_norm": 0.7842978835105896, | |
| "learning_rate": 6.924000000000001e-05, | |
| "loss": 1.7120464324951172, | |
| "mean_token_accuracy": 0.6784614086151123, | |
| "num_tokens": 19427377.0, | |
| "step": 3270 | |
| }, | |
| { | |
| "entropy": 1.8431208834052086, | |
| "epoch": 5.636148721255104, | |
| "grad_norm": 0.8514787554740906, | |
| "learning_rate": 6.884e-05, | |
| "loss": 1.9169921875, | |
| "mean_token_accuracy": 0.6530084304511548, | |
| "num_tokens": 19488230.0, | |
| "step": 3280 | |
| }, | |
| { | |
| "entropy": 1.6891573801636697, | |
| "epoch": 5.653341929937675, | |
| "grad_norm": 0.7638376951217651, | |
| "learning_rate": 6.844e-05, | |
| "loss": 1.7445995330810546, | |
| "mean_token_accuracy": 0.6780110366642476, | |
| "num_tokens": 19546500.0, | |
| "step": 3290 | |
| }, | |
| { | |
| "entropy": 1.7722659215331078, | |
| "epoch": 5.670535138620245, | |
| "grad_norm": 0.8072571754455566, | |
| "learning_rate": 6.804e-05, | |
| "loss": 1.8334453582763672, | |
| "mean_token_accuracy": 0.6662346951663494, | |
| "num_tokens": 19604586.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "entropy": 1.824295823276043, | |
| "epoch": 5.687728347302816, | |
| "grad_norm": 0.8235921263694763, | |
| "learning_rate": 6.764000000000001e-05, | |
| "loss": 1.936505126953125, | |
| "mean_token_accuracy": 0.6559072963893413, | |
| "num_tokens": 19664352.0, | |
| "step": 3310 | |
| }, | |
| { | |
| "entropy": 1.776401199400425, | |
| "epoch": 5.704921555985385, | |
| "grad_norm": 0.7260850071907043, | |
| "learning_rate": 6.724e-05, | |
| "loss": 1.8124887466430664, | |
| "mean_token_accuracy": 0.6629696622490883, | |
| "num_tokens": 19725395.0, | |
| "step": 3320 | |
| }, | |
| { | |
| "entropy": 1.8356325037777423, | |
| "epoch": 5.722114764667956, | |
| "grad_norm": 0.7962324619293213, | |
| "learning_rate": 6.684e-05, | |
| "loss": 1.865267562866211, | |
| "mean_token_accuracy": 0.6575648851692677, | |
| "num_tokens": 19786886.0, | |
| "step": 3330 | |
| }, | |
| { | |
| "entropy": 1.7975003249943255, | |
| "epoch": 5.739307973350527, | |
| "grad_norm": 0.8319332599639893, | |
| "learning_rate": 6.644000000000001e-05, | |
| "loss": 1.8043830871582032, | |
| "mean_token_accuracy": 0.6615023009479046, | |
| "num_tokens": 19846593.0, | |
| "step": 3340 | |
| }, | |
| { | |
| "entropy": 1.7411245226860046, | |
| "epoch": 5.756501182033097, | |
| "grad_norm": 0.8770884871482849, | |
| "learning_rate": 6.604e-05, | |
| "loss": 1.8150835037231445, | |
| "mean_token_accuracy": 0.6629907101392746, | |
| "num_tokens": 19901728.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "entropy": 1.7868980005383492, | |
| "epoch": 5.773694390715667, | |
| "grad_norm": 0.8012292385101318, | |
| "learning_rate": 6.564e-05, | |
| "loss": 1.8411848068237304, | |
| "mean_token_accuracy": 0.6610983822494745, | |
| "num_tokens": 19962294.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "entropy": 1.8151665195822715, | |
| "epoch": 5.790887599398237, | |
| "grad_norm": 0.6628616452217102, | |
| "learning_rate": 6.524e-05, | |
| "loss": 1.873934555053711, | |
| "mean_token_accuracy": 0.6563344091176987, | |
| "num_tokens": 20023732.0, | |
| "step": 3370 | |
| }, | |
| { | |
| "entropy": 1.7544716522097588, | |
| "epoch": 5.808080808080808, | |
| "grad_norm": 0.765192985534668, | |
| "learning_rate": 6.484e-05, | |
| "loss": 1.7676244735717774, | |
| "mean_token_accuracy": 0.6681830242276192, | |
| "num_tokens": 20084955.0, | |
| "step": 3380 | |
| }, | |
| { | |
| "entropy": 1.7742430947721004, | |
| "epoch": 5.8252740167633785, | |
| "grad_norm": 0.9363911747932434, | |
| "learning_rate": 6.444e-05, | |
| "loss": 1.7936756134033203, | |
| "mean_token_accuracy": 0.6640898622572422, | |
| "num_tokens": 20145523.0, | |
| "step": 3390 | |
| }, | |
| { | |
| "entropy": 1.7672609627246856, | |
| "epoch": 5.842467225445949, | |
| "grad_norm": 0.8024185299873352, | |
| "learning_rate": 6.404e-05, | |
| "loss": 1.8390132904052734, | |
| "mean_token_accuracy": 0.66049126945436, | |
| "num_tokens": 20207613.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "entropy": 1.7419263988733291, | |
| "epoch": 5.85966043412852, | |
| "grad_norm": 0.7452662587165833, | |
| "learning_rate": 6.364e-05, | |
| "loss": 1.7592693328857423, | |
| "mean_token_accuracy": 0.6729626737534999, | |
| "num_tokens": 20264080.0, | |
| "step": 3410 | |
| }, | |
| { | |
| "entropy": 1.794348457455635, | |
| "epoch": 5.876853642811089, | |
| "grad_norm": 0.7698886394500732, | |
| "learning_rate": 6.324e-05, | |
| "loss": 1.8215929031372071, | |
| "mean_token_accuracy": 0.6594825953245163, | |
| "num_tokens": 20325935.0, | |
| "step": 3420 | |
| }, | |
| { | |
| "entropy": 1.8058848246932029, | |
| "epoch": 5.89404685149366, | |
| "grad_norm": 0.7813654541969299, | |
| "learning_rate": 6.284e-05, | |
| "loss": 1.8705635070800781, | |
| "mean_token_accuracy": 0.6575064200907945, | |
| "num_tokens": 20384219.0, | |
| "step": 3430 | |
| }, | |
| { | |
| "entropy": 1.8296722590923309, | |
| "epoch": 5.9112400601762305, | |
| "grad_norm": 0.7985308766365051, | |
| "learning_rate": 6.244e-05, | |
| "loss": 1.8622390747070312, | |
| "mean_token_accuracy": 0.6598837457597255, | |
| "num_tokens": 20442055.0, | |
| "step": 3440 | |
| }, | |
| { | |
| "entropy": 1.745755286514759, | |
| "epoch": 5.928433268858801, | |
| "grad_norm": 0.7957124710083008, | |
| "learning_rate": 6.204e-05, | |
| "loss": 1.778817367553711, | |
| "mean_token_accuracy": 0.6711658544838428, | |
| "num_tokens": 20500787.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "entropy": 1.8037077650427817, | |
| "epoch": 5.945626477541371, | |
| "grad_norm": 0.8097943067550659, | |
| "learning_rate": 6.164e-05, | |
| "loss": 1.8763154983520507, | |
| "mean_token_accuracy": 0.6581781908869744, | |
| "num_tokens": 20560544.0, | |
| "step": 3460 | |
| }, | |
| { | |
| "entropy": 1.6478220209479333, | |
| "epoch": 5.962819686223941, | |
| "grad_norm": 0.7882372736930847, | |
| "learning_rate": 6.124e-05, | |
| "loss": 1.7081596374511718, | |
| "mean_token_accuracy": 0.6812212504446507, | |
| "num_tokens": 20621866.0, | |
| "step": 3470 | |
| }, | |
| { | |
| "entropy": 1.7625097312033176, | |
| "epoch": 5.980012894906512, | |
| "grad_norm": 0.780114471912384, | |
| "learning_rate": 6.084000000000001e-05, | |
| "loss": 1.7864303588867188, | |
| "mean_token_accuracy": 0.6720023825764656, | |
| "num_tokens": 20677413.0, | |
| "step": 3480 | |
| }, | |
| { | |
| "entropy": 1.7418652877211571, | |
| "epoch": 5.9972061035890825, | |
| "grad_norm": 0.8374961614608765, | |
| "learning_rate": 6.044000000000001e-05, | |
| "loss": 1.8426128387451173, | |
| "mean_token_accuracy": 0.6612365163862706, | |
| "num_tokens": 20737424.0, | |
| "step": 3490 | |
| }, | |
| { | |
| "entropy": 1.8112690711950328, | |
| "epoch": 6.013754566946056, | |
| "grad_norm": 0.7742412686347961, | |
| "learning_rate": 6.004000000000001e-05, | |
| "loss": 1.8320732116699219, | |
| "mean_token_accuracy": 0.6643109286760355, | |
| "num_tokens": 20795175.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "entropy": 1.687648557126522, | |
| "epoch": 6.0309477756286265, | |
| "grad_norm": 0.8348304629325867, | |
| "learning_rate": 5.9640000000000005e-05, | |
| "loss": 1.7558349609375, | |
| "mean_token_accuracy": 0.6784385897219181, | |
| "num_tokens": 20852486.0, | |
| "step": 3510 | |
| }, | |
| { | |
| "entropy": 1.6863658234477044, | |
| "epoch": 6.048140984311197, | |
| "grad_norm": 0.7642632126808167, | |
| "learning_rate": 5.924000000000001e-05, | |
| "loss": 1.6536775588989259, | |
| "mean_token_accuracy": 0.680523382127285, | |
| "num_tokens": 20908597.0, | |
| "step": 3520 | |
| }, | |
| { | |
| "entropy": 1.6652932062745094, | |
| "epoch": 6.065334192993768, | |
| "grad_norm": 0.8676924109458923, | |
| "learning_rate": 5.8840000000000006e-05, | |
| "loss": 1.7443069458007812, | |
| "mean_token_accuracy": 0.6719188451766968, | |
| "num_tokens": 20966567.0, | |
| "step": 3530 | |
| }, | |
| { | |
| "entropy": 1.7391631960868836, | |
| "epoch": 6.082527401676338, | |
| "grad_norm": 0.8444374799728394, | |
| "learning_rate": 5.844e-05, | |
| "loss": 1.7849775314331056, | |
| "mean_token_accuracy": 0.672398941218853, | |
| "num_tokens": 21023832.0, | |
| "step": 3540 | |
| }, | |
| { | |
| "entropy": 1.7432220742106437, | |
| "epoch": 6.099720610358908, | |
| "grad_norm": 0.7972187995910645, | |
| "learning_rate": 5.804000000000001e-05, | |
| "loss": 1.8264921188354493, | |
| "mean_token_accuracy": 0.6713483344763518, | |
| "num_tokens": 21080325.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "entropy": 1.7394985787570476, | |
| "epoch": 6.1169138190414785, | |
| "grad_norm": 0.8266369700431824, | |
| "learning_rate": 5.7640000000000004e-05, | |
| "loss": 1.819821548461914, | |
| "mean_token_accuracy": 0.6708907049149275, | |
| "num_tokens": 21143316.0, | |
| "step": 3560 | |
| }, | |
| { | |
| "entropy": 1.7923602670431138, | |
| "epoch": 6.134107027724049, | |
| "grad_norm": 0.8315872550010681, | |
| "learning_rate": 5.724000000000001e-05, | |
| "loss": 1.8086809158325194, | |
| "mean_token_accuracy": 0.665992408245802, | |
| "num_tokens": 21203848.0, | |
| "step": 3570 | |
| }, | |
| { | |
| "entropy": 1.711188006401062, | |
| "epoch": 6.15130023640662, | |
| "grad_norm": 0.8174048066139221, | |
| "learning_rate": 5.6840000000000005e-05, | |
| "loss": 1.7656991958618165, | |
| "mean_token_accuracy": 0.6711975857615471, | |
| "num_tokens": 21266260.0, | |
| "step": 3580 | |
| }, | |
| { | |
| "entropy": 1.8437035098671912, | |
| "epoch": 6.16849344508919, | |
| "grad_norm": 0.8155949711799622, | |
| "learning_rate": 5.644e-05, | |
| "loss": 1.877999496459961, | |
| "mean_token_accuracy": 0.6532085236161947, | |
| "num_tokens": 21326008.0, | |
| "step": 3590 | |
| }, | |
| { | |
| "entropy": 1.7264528393745422, | |
| "epoch": 6.18568665377176, | |
| "grad_norm": 0.7951272130012512, | |
| "learning_rate": 5.6040000000000006e-05, | |
| "loss": 1.747119140625, | |
| "mean_token_accuracy": 0.6696909107267857, | |
| "num_tokens": 21385356.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "entropy": 1.68227918446064, | |
| "epoch": 6.20287986245433, | |
| "grad_norm": 0.779587984085083, | |
| "learning_rate": 5.564e-05, | |
| "loss": 1.7062965393066407, | |
| "mean_token_accuracy": 0.6786911800503731, | |
| "num_tokens": 21443231.0, | |
| "step": 3610 | |
| }, | |
| { | |
| "entropy": 1.7644565671682357, | |
| "epoch": 6.220073071136901, | |
| "grad_norm": 0.9153981804847717, | |
| "learning_rate": 5.524e-05, | |
| "loss": 1.8082721710205079, | |
| "mean_token_accuracy": 0.6671201888471842, | |
| "num_tokens": 21499309.0, | |
| "step": 3620 | |
| }, | |
| { | |
| "entropy": 1.7211210913956165, | |
| "epoch": 6.237266279819472, | |
| "grad_norm": 0.8166586756706238, | |
| "learning_rate": 5.4840000000000003e-05, | |
| "loss": 1.769371795654297, | |
| "mean_token_accuracy": 0.6694241009652615, | |
| "num_tokens": 21558565.0, | |
| "step": 3630 | |
| }, | |
| { | |
| "entropy": 1.7693689942359925, | |
| "epoch": 6.254459488502041, | |
| "grad_norm": 0.7773623466491699, | |
| "learning_rate": 5.444e-05, | |
| "loss": 1.848412322998047, | |
| "mean_token_accuracy": 0.66685731112957, | |
| "num_tokens": 21618504.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "entropy": 1.8090675905346871, | |
| "epoch": 6.271652697184612, | |
| "grad_norm": 0.9420453310012817, | |
| "learning_rate": 5.4040000000000004e-05, | |
| "loss": 1.8266836166381837, | |
| "mean_token_accuracy": 0.6643423162400722, | |
| "num_tokens": 21676861.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "entropy": 1.7340097561478616, | |
| "epoch": 6.288845905867182, | |
| "grad_norm": 0.805880069732666, | |
| "learning_rate": 5.364e-05, | |
| "loss": 1.7760274887084961, | |
| "mean_token_accuracy": 0.6729184173047542, | |
| "num_tokens": 21734874.0, | |
| "step": 3660 | |
| }, | |
| { | |
| "entropy": 1.733542764186859, | |
| "epoch": 6.306039114549753, | |
| "grad_norm": 0.7459798455238342, | |
| "learning_rate": 5.324e-05, | |
| "loss": 1.7874065399169923, | |
| "mean_token_accuracy": 0.6733234331011773, | |
| "num_tokens": 21797467.0, | |
| "step": 3670 | |
| }, | |
| { | |
| "entropy": 1.6855479300022125, | |
| "epoch": 6.3232323232323235, | |
| "grad_norm": 0.7362611889839172, | |
| "learning_rate": 5.284e-05, | |
| "loss": 1.7557338714599608, | |
| "mean_token_accuracy": 0.6742986045777798, | |
| "num_tokens": 21856704.0, | |
| "step": 3680 | |
| }, | |
| { | |
| "entropy": 1.762756396830082, | |
| "epoch": 6.340425531914893, | |
| "grad_norm": 0.8349901437759399, | |
| "learning_rate": 5.244e-05, | |
| "loss": 1.784174346923828, | |
| "mean_token_accuracy": 0.6732991166412831, | |
| "num_tokens": 21915781.0, | |
| "step": 3690 | |
| }, | |
| { | |
| "entropy": 1.7664957396686076, | |
| "epoch": 6.357618740597464, | |
| "grad_norm": 0.8295337557792664, | |
| "learning_rate": 5.204e-05, | |
| "loss": 1.8338695526123048, | |
| "mean_token_accuracy": 0.6659718155860901, | |
| "num_tokens": 21973568.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "entropy": 1.7744196206331253, | |
| "epoch": 6.374811949280034, | |
| "grad_norm": 0.739115297794342, | |
| "learning_rate": 5.164e-05, | |
| "loss": 1.8148929595947265, | |
| "mean_token_accuracy": 0.6660460762679576, | |
| "num_tokens": 22032979.0, | |
| "step": 3710 | |
| }, | |
| { | |
| "entropy": 1.7459667712450027, | |
| "epoch": 6.392005157962605, | |
| "grad_norm": 0.7716593146324158, | |
| "learning_rate": 5.124e-05, | |
| "loss": 1.8079204559326172, | |
| "mean_token_accuracy": 0.66551748290658, | |
| "num_tokens": 22092283.0, | |
| "step": 3720 | |
| }, | |
| { | |
| "entropy": 1.7491293936967849, | |
| "epoch": 6.4091983666451755, | |
| "grad_norm": 0.8270374536514282, | |
| "learning_rate": 5.084e-05, | |
| "loss": 1.8020380020141602, | |
| "mean_token_accuracy": 0.6673273537307978, | |
| "num_tokens": 22150667.0, | |
| "step": 3730 | |
| }, | |
| { | |
| "entropy": 1.6887403331696986, | |
| "epoch": 6.426391575327745, | |
| "grad_norm": 0.8306758403778076, | |
| "learning_rate": 5.044e-05, | |
| "loss": 1.7328964233398438, | |
| "mean_token_accuracy": 0.676455694437027, | |
| "num_tokens": 22211170.0, | |
| "step": 3740 | |
| }, | |
| { | |
| "entropy": 1.8332835257053375, | |
| "epoch": 6.443584784010316, | |
| "grad_norm": 0.8369497656822205, | |
| "learning_rate": 5.0039999999999995e-05, | |
| "loss": 1.913273239135742, | |
| "mean_token_accuracy": 0.656198850646615, | |
| "num_tokens": 22269928.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "entropy": 1.6914366707205772, | |
| "epoch": 6.460777992692886, | |
| "grad_norm": 0.7562059164047241, | |
| "learning_rate": 4.9640000000000006e-05, | |
| "loss": 1.7506240844726562, | |
| "mean_token_accuracy": 0.67936124317348, | |
| "num_tokens": 22328611.0, | |
| "step": 3760 | |
| }, | |
| { | |
| "entropy": 1.7604179099202155, | |
| "epoch": 6.477971201375457, | |
| "grad_norm": 0.7541300058364868, | |
| "learning_rate": 4.924e-05, | |
| "loss": 1.8065948486328125, | |
| "mean_token_accuracy": 0.6697364591062069, | |
| "num_tokens": 22389219.0, | |
| "step": 3770 | |
| }, | |
| { | |
| "entropy": 1.731757602095604, | |
| "epoch": 6.4951644100580275, | |
| "grad_norm": 0.8319364190101624, | |
| "learning_rate": 4.884e-05, | |
| "loss": 1.7902181625366211, | |
| "mean_token_accuracy": 0.6673447206616402, | |
| "num_tokens": 22449858.0, | |
| "step": 3780 | |
| }, | |
| { | |
| "entropy": 1.7152166068553925, | |
| "epoch": 6.512357618740597, | |
| "grad_norm": 0.8575091361999512, | |
| "learning_rate": 4.8440000000000004e-05, | |
| "loss": 1.7424659729003906, | |
| "mean_token_accuracy": 0.6707747709006071, | |
| "num_tokens": 22509375.0, | |
| "step": 3790 | |
| }, | |
| { | |
| "entropy": 1.6641680032014847, | |
| "epoch": 6.529550827423168, | |
| "grad_norm": 0.7516652345657349, | |
| "learning_rate": 4.804e-05, | |
| "loss": 1.6937873840332032, | |
| "mean_token_accuracy": 0.6811798132956028, | |
| "num_tokens": 22566440.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "entropy": 1.7551555022597314, | |
| "epoch": 6.546744036105738, | |
| "grad_norm": 0.817863941192627, | |
| "learning_rate": 4.7640000000000005e-05, | |
| "loss": 1.8282489776611328, | |
| "mean_token_accuracy": 0.6655839093029499, | |
| "num_tokens": 22627900.0, | |
| "step": 3810 | |
| }, | |
| { | |
| "entropy": 1.7025569766759872, | |
| "epoch": 6.563937244788309, | |
| "grad_norm": 0.757764458656311, | |
| "learning_rate": 4.724e-05, | |
| "loss": 1.7325496673583984, | |
| "mean_token_accuracy": 0.6785391330718994, | |
| "num_tokens": 22685738.0, | |
| "step": 3820 | |
| }, | |
| { | |
| "entropy": 1.699775031208992, | |
| "epoch": 6.5811304534708785, | |
| "grad_norm": 0.7960421442985535, | |
| "learning_rate": 4.684e-05, | |
| "loss": 1.7602745056152345, | |
| "mean_token_accuracy": 0.6698532458394766, | |
| "num_tokens": 22745696.0, | |
| "step": 3830 | |
| }, | |
| { | |
| "entropy": 1.8100605458021164, | |
| "epoch": 6.598323662153449, | |
| "grad_norm": 0.8477244973182678, | |
| "learning_rate": 4.644e-05, | |
| "loss": 1.8226333618164063, | |
| "mean_token_accuracy": 0.6646727129817009, | |
| "num_tokens": 22805783.0, | |
| "step": 3840 | |
| }, | |
| { | |
| "entropy": 1.7685839846730231, | |
| "epoch": 6.61551687083602, | |
| "grad_norm": 0.7853493690490723, | |
| "learning_rate": 4.604e-05, | |
| "loss": 1.8230281829833985, | |
| "mean_token_accuracy": 0.664577030390501, | |
| "num_tokens": 22866822.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "entropy": 1.7810854628682136, | |
| "epoch": 6.63271007951859, | |
| "grad_norm": 0.7139444351196289, | |
| "learning_rate": 4.564e-05, | |
| "loss": 1.855198287963867, | |
| "mean_token_accuracy": 0.6652711797505617, | |
| "num_tokens": 22928790.0, | |
| "step": 3860 | |
| }, | |
| { | |
| "entropy": 1.7815292954444886, | |
| "epoch": 6.649903288201161, | |
| "grad_norm": 0.7039018869400024, | |
| "learning_rate": 4.524000000000001e-05, | |
| "loss": 1.845859909057617, | |
| "mean_token_accuracy": 0.6595252249389887, | |
| "num_tokens": 22990170.0, | |
| "step": 3870 | |
| }, | |
| { | |
| "entropy": 1.7107908308506012, | |
| "epoch": 6.667096496883731, | |
| "grad_norm": 0.7651708126068115, | |
| "learning_rate": 4.4840000000000004e-05, | |
| "loss": 1.7340824127197265, | |
| "mean_token_accuracy": 0.6750431463122368, | |
| "num_tokens": 23047902.0, | |
| "step": 3880 | |
| }, | |
| { | |
| "entropy": 1.7069460928440094, | |
| "epoch": 6.684289705566301, | |
| "grad_norm": 0.7385950088500977, | |
| "learning_rate": 4.444e-05, | |
| "loss": 1.758881187438965, | |
| "mean_token_accuracy": 0.6745327576994896, | |
| "num_tokens": 23112106.0, | |
| "step": 3890 | |
| }, | |
| { | |
| "entropy": 1.821124967932701, | |
| "epoch": 6.701482914248872, | |
| "grad_norm": 0.7827627658843994, | |
| "learning_rate": 4.4040000000000005e-05, | |
| "loss": 1.913480567932129, | |
| "mean_token_accuracy": 0.6593531377613544, | |
| "num_tokens": 23170056.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "entropy": 1.7924881175160408, | |
| "epoch": 6.718676122931442, | |
| "grad_norm": 0.8166612386703491, | |
| "learning_rate": 4.364e-05, | |
| "loss": 1.855017852783203, | |
| "mean_token_accuracy": 0.6593458168208599, | |
| "num_tokens": 23228582.0, | |
| "step": 3910 | |
| }, | |
| { | |
| "entropy": 1.736910080909729, | |
| "epoch": 6.735869331614013, | |
| "grad_norm": 0.779629647731781, | |
| "learning_rate": 4.324e-05, | |
| "loss": 1.7581821441650392, | |
| "mean_token_accuracy": 0.6779871381819248, | |
| "num_tokens": 23288702.0, | |
| "step": 3920 | |
| }, | |
| { | |
| "entropy": 1.6776573412120341, | |
| "epoch": 6.7530625402965825, | |
| "grad_norm": 0.7625913619995117, | |
| "learning_rate": 4.284e-05, | |
| "loss": 1.7102031707763672, | |
| "mean_token_accuracy": 0.6794889360666275, | |
| "num_tokens": 23349004.0, | |
| "step": 3930 | |
| }, | |
| { | |
| "entropy": 1.8100020587444305, | |
| "epoch": 6.770255748979153, | |
| "grad_norm": 0.7499405145645142, | |
| "learning_rate": 4.244e-05, | |
| "loss": 1.8514158248901367, | |
| "mean_token_accuracy": 0.6620845705270767, | |
| "num_tokens": 23410874.0, | |
| "step": 3940 | |
| }, | |
| { | |
| "entropy": 1.697011759877205, | |
| "epoch": 6.787448957661724, | |
| "grad_norm": 0.736323893070221, | |
| "learning_rate": 4.2040000000000004e-05, | |
| "loss": 1.7609180450439452, | |
| "mean_token_accuracy": 0.6772994473576546, | |
| "num_tokens": 23472518.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "entropy": 1.764576494693756, | |
| "epoch": 6.804642166344294, | |
| "grad_norm": 0.8523833751678467, | |
| "learning_rate": 4.164e-05, | |
| "loss": 1.81484375, | |
| "mean_token_accuracy": 0.6644324712455273, | |
| "num_tokens": 23531203.0, | |
| "step": 3960 | |
| }, | |
| { | |
| "entropy": 1.7241224959492683, | |
| "epoch": 6.821835375026865, | |
| "grad_norm": 0.8820350766181946, | |
| "learning_rate": 4.124e-05, | |
| "loss": 1.739130401611328, | |
| "mean_token_accuracy": 0.6771424360573292, | |
| "num_tokens": 23590289.0, | |
| "step": 3970 | |
| }, | |
| { | |
| "entropy": 1.6967746496200562, | |
| "epoch": 6.8390285837094345, | |
| "grad_norm": 0.8161067962646484, | |
| "learning_rate": 4.084e-05, | |
| "loss": 1.7659534454345702, | |
| "mean_token_accuracy": 0.6744477659463882, | |
| "num_tokens": 23647985.0, | |
| "step": 3980 | |
| }, | |
| { | |
| "entropy": 1.8578275874257089, | |
| "epoch": 6.856221792392005, | |
| "grad_norm": 0.778160810470581, | |
| "learning_rate": 4.044e-05, | |
| "loss": 1.9046249389648438, | |
| "mean_token_accuracy": 0.6525318272411823, | |
| "num_tokens": 23707387.0, | |
| "step": 3990 | |
| }, | |
| { | |
| "entropy": 1.781902502477169, | |
| "epoch": 6.873415001074576, | |
| "grad_norm": 0.9398592710494995, | |
| "learning_rate": 4.004e-05, | |
| "loss": 1.8081722259521484, | |
| "mean_token_accuracy": 0.6625144556164742, | |
| "num_tokens": 23764831.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "entropy": 1.699565550684929, | |
| "epoch": 6.890608209757146, | |
| "grad_norm": 0.7662839889526367, | |
| "learning_rate": 3.964e-05, | |
| "loss": 1.7373327255249023, | |
| "mean_token_accuracy": 0.6809282444417477, | |
| "num_tokens": 23825367.0, | |
| "step": 4010 | |
| }, | |
| { | |
| "entropy": 1.6455101184546947, | |
| "epoch": 6.907801418439716, | |
| "grad_norm": 0.7619901299476624, | |
| "learning_rate": 3.9240000000000004e-05, | |
| "loss": 1.709805679321289, | |
| "mean_token_accuracy": 0.6812954246997833, | |
| "num_tokens": 23887369.0, | |
| "step": 4020 | |
| }, | |
| { | |
| "entropy": 1.7952800825238229, | |
| "epoch": 6.924994627122286, | |
| "grad_norm": 0.7858437299728394, | |
| "learning_rate": 3.884e-05, | |
| "loss": 1.8688398361206056, | |
| "mean_token_accuracy": 0.6621494639664889, | |
| "num_tokens": 23949358.0, | |
| "step": 4030 | |
| }, | |
| { | |
| "entropy": 1.772008201479912, | |
| "epoch": 6.942187835804857, | |
| "grad_norm": 0.7586779594421387, | |
| "learning_rate": 3.8440000000000005e-05, | |
| "loss": 1.798760986328125, | |
| "mean_token_accuracy": 0.667642817273736, | |
| "num_tokens": 24009691.0, | |
| "step": 4040 | |
| }, | |
| { | |
| "entropy": 1.7289930269122125, | |
| "epoch": 6.959381044487428, | |
| "grad_norm": 0.854505717754364, | |
| "learning_rate": 3.804e-05, | |
| "loss": 1.771562385559082, | |
| "mean_token_accuracy": 0.6692178774625063, | |
| "num_tokens": 24064506.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "entropy": 1.715189914405346, | |
| "epoch": 6.976574253169998, | |
| "grad_norm": 0.758488655090332, | |
| "learning_rate": 3.7640000000000006e-05, | |
| "loss": 1.756412887573242, | |
| "mean_token_accuracy": 0.6710222817957401, | |
| "num_tokens": 24126841.0, | |
| "step": 4060 | |
| }, | |
| { | |
| "entropy": 1.7383173301815986, | |
| "epoch": 6.993767461852569, | |
| "grad_norm": 0.7450618147850037, | |
| "learning_rate": 3.724e-05, | |
| "loss": 1.7997669219970702, | |
| "mean_token_accuracy": 0.6649864386767149, | |
| "num_tokens": 24186159.0, | |
| "step": 4070 | |
| }, | |
| { | |
| "entropy": 1.7172312767474682, | |
| "epoch": 7.010315925209542, | |
| "grad_norm": 0.8475770950317383, | |
| "learning_rate": 3.684e-05, | |
| "loss": 1.7585922241210938, | |
| "mean_token_accuracy": 0.6746863397684965, | |
| "num_tokens": 24239759.0, | |
| "step": 4080 | |
| }, | |
| { | |
| "entropy": 1.7192407630383968, | |
| "epoch": 7.027509133892113, | |
| "grad_norm": 0.7818967700004578, | |
| "learning_rate": 3.6440000000000003e-05, | |
| "loss": 1.7634265899658204, | |
| "mean_token_accuracy": 0.6724576361477375, | |
| "num_tokens": 24298775.0, | |
| "step": 4090 | |
| }, | |
| { | |
| "entropy": 1.7496131911873818, | |
| "epoch": 7.044702342574683, | |
| "grad_norm": 0.8118335008621216, | |
| "learning_rate": 3.604e-05, | |
| "loss": 1.802253532409668, | |
| "mean_token_accuracy": 0.6702191606163979, | |
| "num_tokens": 24361142.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "entropy": 1.7090509735047816, | |
| "epoch": 7.061895551257253, | |
| "grad_norm": 0.8414726257324219, | |
| "learning_rate": 3.5640000000000004e-05, | |
| "loss": 1.7347373962402344, | |
| "mean_token_accuracy": 0.679864277690649, | |
| "num_tokens": 24419838.0, | |
| "step": 4110 | |
| }, | |
| { | |
| "entropy": 1.6807728812098504, | |
| "epoch": 7.079088759939824, | |
| "grad_norm": 0.8567139506340027, | |
| "learning_rate": 3.524e-05, | |
| "loss": 1.7365150451660156, | |
| "mean_token_accuracy": 0.6765194039791822, | |
| "num_tokens": 24477518.0, | |
| "step": 4120 | |
| }, | |
| { | |
| "entropy": 1.709678091108799, | |
| "epoch": 7.096281968622394, | |
| "grad_norm": 0.8345620036125183, | |
| "learning_rate": 3.484e-05, | |
| "loss": 1.730575180053711, | |
| "mean_token_accuracy": 0.6709145799279213, | |
| "num_tokens": 24534560.0, | |
| "step": 4130 | |
| }, | |
| { | |
| "entropy": 1.6541544690728187, | |
| "epoch": 7.113475177304965, | |
| "grad_norm": 0.8509814143180847, | |
| "learning_rate": 3.444e-05, | |
| "loss": 1.6795757293701172, | |
| "mean_token_accuracy": 0.6856038823723793, | |
| "num_tokens": 24594829.0, | |
| "step": 4140 | |
| }, | |
| { | |
| "entropy": 1.7498343527317046, | |
| "epoch": 7.130668385987535, | |
| "grad_norm": 0.8674039244651794, | |
| "learning_rate": 3.404e-05, | |
| "loss": 1.8083892822265626, | |
| "mean_token_accuracy": 0.6709578204900026, | |
| "num_tokens": 24656798.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "entropy": 1.677807478606701, | |
| "epoch": 7.147861594670105, | |
| "grad_norm": 0.8016234040260315, | |
| "learning_rate": 3.3639999999999996e-05, | |
| "loss": 1.7206790924072266, | |
| "mean_token_accuracy": 0.6754934191703796, | |
| "num_tokens": 24714009.0, | |
| "step": 4160 | |
| }, | |
| { | |
| "entropy": 1.672835360467434, | |
| "epoch": 7.1650548033526755, | |
| "grad_norm": 0.7139334082603455, | |
| "learning_rate": 3.324e-05, | |
| "loss": 1.7049163818359374, | |
| "mean_token_accuracy": 0.6851269513368606, | |
| "num_tokens": 24778022.0, | |
| "step": 4170 | |
| }, | |
| { | |
| "entropy": 1.6577355667948723, | |
| "epoch": 7.182248012035246, | |
| "grad_norm": 0.9129847288131714, | |
| "learning_rate": 3.2840000000000004e-05, | |
| "loss": 1.7073640823364258, | |
| "mean_token_accuracy": 0.6768647953867912, | |
| "num_tokens": 24837669.0, | |
| "step": 4180 | |
| }, | |
| { | |
| "entropy": 1.7049853071570396, | |
| "epoch": 7.199441220717817, | |
| "grad_norm": 0.7545643448829651, | |
| "learning_rate": 3.244e-05, | |
| "loss": 1.754374122619629, | |
| "mean_token_accuracy": 0.6808854278177023, | |
| "num_tokens": 24898991.0, | |
| "step": 4190 | |
| }, | |
| { | |
| "entropy": 1.6785477355122567, | |
| "epoch": 7.216634429400387, | |
| "grad_norm": 0.8802333474159241, | |
| "learning_rate": 3.2040000000000005e-05, | |
| "loss": 1.6974828720092774, | |
| "mean_token_accuracy": 0.6824289247393608, | |
| "num_tokens": 24957348.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "entropy": 1.7312355414032936, | |
| "epoch": 7.233827638082957, | |
| "grad_norm": 0.8227038383483887, | |
| "learning_rate": 3.164e-05, | |
| "loss": 1.7645183563232423, | |
| "mean_token_accuracy": 0.6661410238593817, | |
| "num_tokens": 25016658.0, | |
| "step": 4210 | |
| }, | |
| { | |
| "entropy": 1.8124181643128394, | |
| "epoch": 7.2510208467655275, | |
| "grad_norm": 0.8563106060028076, | |
| "learning_rate": 3.1240000000000006e-05, | |
| "loss": 1.8163776397705078, | |
| "mean_token_accuracy": 0.6610642150044441, | |
| "num_tokens": 25074658.0, | |
| "step": 4220 | |
| }, | |
| { | |
| "entropy": 1.776869924366474, | |
| "epoch": 7.268214055448098, | |
| "grad_norm": 0.8615058064460754, | |
| "learning_rate": 3.084e-05, | |
| "loss": 1.861563491821289, | |
| "mean_token_accuracy": 0.6624562762677669, | |
| "num_tokens": 25132732.0, | |
| "step": 4230 | |
| }, | |
| { | |
| "entropy": 1.742109003663063, | |
| "epoch": 7.285407264130669, | |
| "grad_norm": 0.7851050496101379, | |
| "learning_rate": 3.0440000000000003e-05, | |
| "loss": 1.7527351379394531, | |
| "mean_token_accuracy": 0.6712357953190804, | |
| "num_tokens": 25194009.0, | |
| "step": 4240 | |
| }, | |
| { | |
| "entropy": 1.7356494843959809, | |
| "epoch": 7.302600472813239, | |
| "grad_norm": 0.8842288255691528, | |
| "learning_rate": 3.004e-05, | |
| "loss": 1.8091196060180663, | |
| "mean_token_accuracy": 0.6680308949202299, | |
| "num_tokens": 25250681.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "entropy": 1.714112138748169, | |
| "epoch": 7.319793681495809, | |
| "grad_norm": 0.8050926923751831, | |
| "learning_rate": 2.964e-05, | |
| "loss": 1.741617774963379, | |
| "mean_token_accuracy": 0.6764710985124112, | |
| "num_tokens": 25307119.0, | |
| "step": 4260 | |
| }, | |
| { | |
| "entropy": 1.7806825146079064, | |
| "epoch": 7.3369868901783795, | |
| "grad_norm": 0.755797803401947, | |
| "learning_rate": 2.924e-05, | |
| "loss": 1.8448747634887694, | |
| "mean_token_accuracy": 0.6646751999855042, | |
| "num_tokens": 25365721.0, | |
| "step": 4270 | |
| }, | |
| { | |
| "entropy": 1.7478718511760234, | |
| "epoch": 7.35418009886095, | |
| "grad_norm": 0.8148614764213562, | |
| "learning_rate": 2.8840000000000002e-05, | |
| "loss": 1.8303293228149413, | |
| "mean_token_accuracy": 0.6662985436618328, | |
| "num_tokens": 25423309.0, | |
| "step": 4280 | |
| }, | |
| { | |
| "entropy": 1.6996045634150505, | |
| "epoch": 7.371373307543521, | |
| "grad_norm": 0.7613778114318848, | |
| "learning_rate": 2.844e-05, | |
| "loss": 1.7077817916870117, | |
| "mean_token_accuracy": 0.679437268525362, | |
| "num_tokens": 25480080.0, | |
| "step": 4290 | |
| }, | |
| { | |
| "entropy": 1.8055237784981728, | |
| "epoch": 7.38856651622609, | |
| "grad_norm": 0.899900496006012, | |
| "learning_rate": 2.804e-05, | |
| "loss": 1.882634735107422, | |
| "mean_token_accuracy": 0.659589122608304, | |
| "num_tokens": 25538885.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "entropy": 1.6835025876760483, | |
| "epoch": 7.405759724908661, | |
| "grad_norm": 0.7718909382820129, | |
| "learning_rate": 2.764e-05, | |
| "loss": 1.7145641326904297, | |
| "mean_token_accuracy": 0.6805526971817016, | |
| "num_tokens": 25598830.0, | |
| "step": 4310 | |
| }, | |
| { | |
| "entropy": 1.7392980232834816, | |
| "epoch": 7.422952933591231, | |
| "grad_norm": 0.7144562005996704, | |
| "learning_rate": 2.724e-05, | |
| "loss": 1.7779796600341797, | |
| "mean_token_accuracy": 0.6709600411355495, | |
| "num_tokens": 25660275.0, | |
| "step": 4320 | |
| }, | |
| { | |
| "entropy": 1.7193088322877883, | |
| "epoch": 7.440146142273802, | |
| "grad_norm": 0.8038010001182556, | |
| "learning_rate": 2.6840000000000004e-05, | |
| "loss": 1.7928234100341798, | |
| "mean_token_accuracy": 0.6767275612801313, | |
| "num_tokens": 25719958.0, | |
| "step": 4330 | |
| }, | |
| { | |
| "entropy": 1.7314304433763028, | |
| "epoch": 7.457339350956373, | |
| "grad_norm": 0.7783089876174927, | |
| "learning_rate": 2.6440000000000004e-05, | |
| "loss": 1.7952003479003906, | |
| "mean_token_accuracy": 0.6740467935800553, | |
| "num_tokens": 25776689.0, | |
| "step": 4340 | |
| }, | |
| { | |
| "entropy": 1.74028614833951, | |
| "epoch": 7.474532559638942, | |
| "grad_norm": 0.8052565455436707, | |
| "learning_rate": 2.6040000000000005e-05, | |
| "loss": 1.7803146362304687, | |
| "mean_token_accuracy": 0.6733121275901794, | |
| "num_tokens": 25837916.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "entropy": 1.6831192195415496, | |
| "epoch": 7.491725768321513, | |
| "grad_norm": 0.8941977024078369, | |
| "learning_rate": 2.5640000000000002e-05, | |
| "loss": 1.7077743530273437, | |
| "mean_token_accuracy": 0.6749852932989597, | |
| "num_tokens": 25896712.0, | |
| "step": 4360 | |
| }, | |
| { | |
| "entropy": 1.7840609520673751, | |
| "epoch": 7.508918977004083, | |
| "grad_norm": 0.818671703338623, | |
| "learning_rate": 2.5240000000000002e-05, | |
| "loss": 1.8329656600952149, | |
| "mean_token_accuracy": 0.6679215718060731, | |
| "num_tokens": 25958383.0, | |
| "step": 4370 | |
| }, | |
| { | |
| "entropy": 1.76528559923172, | |
| "epoch": 7.526112185686654, | |
| "grad_norm": 0.7579294443130493, | |
| "learning_rate": 2.4840000000000003e-05, | |
| "loss": 1.7914703369140625, | |
| "mean_token_accuracy": 0.6695499271154404, | |
| "num_tokens": 26017754.0, | |
| "step": 4380 | |
| }, | |
| { | |
| "entropy": 1.704708030819893, | |
| "epoch": 7.5433053943692245, | |
| "grad_norm": 0.8200159668922424, | |
| "learning_rate": 2.4440000000000003e-05, | |
| "loss": 1.774311637878418, | |
| "mean_token_accuracy": 0.6739427134394645, | |
| "num_tokens": 26075760.0, | |
| "step": 4390 | |
| }, | |
| { | |
| "entropy": 1.7540104657411575, | |
| "epoch": 7.560498603051794, | |
| "grad_norm": 0.8373399972915649, | |
| "learning_rate": 2.404e-05, | |
| "loss": 1.796240997314453, | |
| "mean_token_accuracy": 0.6640590511262416, | |
| "num_tokens": 26133858.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "entropy": 1.754172220826149, | |
| "epoch": 7.577691811734365, | |
| "grad_norm": 0.7368677258491516, | |
| "learning_rate": 2.364e-05, | |
| "loss": 1.8175994873046875, | |
| "mean_token_accuracy": 0.6717667855322361, | |
| "num_tokens": 26197518.0, | |
| "step": 4410 | |
| }, | |
| { | |
| "entropy": 1.6564558774232865, | |
| "epoch": 7.594885020416935, | |
| "grad_norm": 0.8868939280509949, | |
| "learning_rate": 2.324e-05, | |
| "loss": 1.669070053100586, | |
| "mean_token_accuracy": 0.6839951984584332, | |
| "num_tokens": 26250823.0, | |
| "step": 4420 | |
| }, | |
| { | |
| "entropy": 1.7594470486044884, | |
| "epoch": 7.612078229099506, | |
| "grad_norm": 0.86412513256073, | |
| "learning_rate": 2.284e-05, | |
| "loss": 1.8095222473144532, | |
| "mean_token_accuracy": 0.666244950518012, | |
| "num_tokens": 26312548.0, | |
| "step": 4430 | |
| }, | |
| { | |
| "entropy": 1.7646627604961396, | |
| "epoch": 7.6292714377820765, | |
| "grad_norm": 0.7128214836120605, | |
| "learning_rate": 2.244e-05, | |
| "loss": 1.832158660888672, | |
| "mean_token_accuracy": 0.6679420609027147, | |
| "num_tokens": 26376747.0, | |
| "step": 4440 | |
| }, | |
| { | |
| "entropy": 1.7401177063584328, | |
| "epoch": 7.646464646464646, | |
| "grad_norm": 0.7479432225227356, | |
| "learning_rate": 2.2040000000000002e-05, | |
| "loss": 1.7779264450073242, | |
| "mean_token_accuracy": 0.6710429213941097, | |
| "num_tokens": 26438907.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "entropy": 1.6960709124803544, | |
| "epoch": 7.663657855147217, | |
| "grad_norm": 0.8182732462882996, | |
| "learning_rate": 2.1640000000000003e-05, | |
| "loss": 1.7709745407104491, | |
| "mean_token_accuracy": 0.6782359674572944, | |
| "num_tokens": 26499840.0, | |
| "step": 4460 | |
| }, | |
| { | |
| "entropy": 1.8024938970804214, | |
| "epoch": 7.680851063829787, | |
| "grad_norm": 0.8208670020103455, | |
| "learning_rate": 2.124e-05, | |
| "loss": 1.8752277374267579, | |
| "mean_token_accuracy": 0.6610838636755944, | |
| "num_tokens": 26561739.0, | |
| "step": 4470 | |
| }, | |
| { | |
| "entropy": 1.6679524429142476, | |
| "epoch": 7.698044272512358, | |
| "grad_norm": 0.7669119834899902, | |
| "learning_rate": 2.084e-05, | |
| "loss": 1.6840700149536132, | |
| "mean_token_accuracy": 0.6839361816644669, | |
| "num_tokens": 26618997.0, | |
| "step": 4480 | |
| }, | |
| { | |
| "entropy": 1.669876104593277, | |
| "epoch": 7.715237481194928, | |
| "grad_norm": 0.8296427130699158, | |
| "learning_rate": 2.044e-05, | |
| "loss": 1.6926704406738282, | |
| "mean_token_accuracy": 0.6837400387972593, | |
| "num_tokens": 26677617.0, | |
| "step": 4490 | |
| }, | |
| { | |
| "entropy": 1.7478768080472946, | |
| "epoch": 7.732430689877498, | |
| "grad_norm": 0.9231081008911133, | |
| "learning_rate": 2.004e-05, | |
| "loss": 1.8043970108032226, | |
| "mean_token_accuracy": 0.6680058591067791, | |
| "num_tokens": 26735542.0, | |
| "step": 4500 | |
| }, | |
| { | |
| "entropy": 1.7587152615189552, | |
| "epoch": 7.749623898560069, | |
| "grad_norm": 0.8131846189498901, | |
| "learning_rate": 1.9640000000000002e-05, | |
| "loss": 1.798016357421875, | |
| "mean_token_accuracy": 0.6655693002045154, | |
| "num_tokens": 26796245.0, | |
| "step": 4510 | |
| }, | |
| { | |
| "entropy": 1.7238084524869919, | |
| "epoch": 7.766817107242639, | |
| "grad_norm": 0.8774024248123169, | |
| "learning_rate": 1.924e-05, | |
| "loss": 1.7398443222045898, | |
| "mean_token_accuracy": 0.6723451249301433, | |
| "num_tokens": 26852843.0, | |
| "step": 4520 | |
| }, | |
| { | |
| "entropy": 1.8012757793068885, | |
| "epoch": 7.78401031592521, | |
| "grad_norm": 0.881601095199585, | |
| "learning_rate": 1.8840000000000003e-05, | |
| "loss": 1.851584243774414, | |
| "mean_token_accuracy": 0.6612551022320986, | |
| "num_tokens": 26912327.0, | |
| "step": 4530 | |
| }, | |
| { | |
| "entropy": 1.7035338878631592, | |
| "epoch": 7.8012035246077795, | |
| "grad_norm": 0.8460244536399841, | |
| "learning_rate": 1.8440000000000003e-05, | |
| "loss": 1.7524948120117188, | |
| "mean_token_accuracy": 0.6760960537940264, | |
| "num_tokens": 26971076.0, | |
| "step": 4540 | |
| }, | |
| { | |
| "entropy": 1.6795054778456688, | |
| "epoch": 7.81839673329035, | |
| "grad_norm": 0.7720061540603638, | |
| "learning_rate": 1.804e-05, | |
| "loss": 1.70491943359375, | |
| "mean_token_accuracy": 0.6768644891679287, | |
| "num_tokens": 27031120.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "entropy": 1.775759120285511, | |
| "epoch": 7.835589941972921, | |
| "grad_norm": 0.8407703638076782, | |
| "learning_rate": 1.764e-05, | |
| "loss": 1.8208852767944337, | |
| "mean_token_accuracy": 0.6638765886425972, | |
| "num_tokens": 27089926.0, | |
| "step": 4560 | |
| }, | |
| { | |
| "entropy": 1.7749223679304122, | |
| "epoch": 7.852783150655491, | |
| "grad_norm": 0.8033788204193115, | |
| "learning_rate": 1.724e-05, | |
| "loss": 1.8128280639648438, | |
| "mean_token_accuracy": 0.6697524327784776, | |
| "num_tokens": 27155776.0, | |
| "step": 4570 | |
| }, | |
| { | |
| "entropy": 1.7019891321659089, | |
| "epoch": 7.869976359338062, | |
| "grad_norm": 0.8756063580513, | |
| "learning_rate": 1.684e-05, | |
| "loss": 1.752833366394043, | |
| "mean_token_accuracy": 0.6720911644399166, | |
| "num_tokens": 27213676.0, | |
| "step": 4580 | |
| }, | |
| { | |
| "entropy": 1.7089907452464104, | |
| "epoch": 7.8871695680206315, | |
| "grad_norm": 0.8547044396400452, | |
| "learning_rate": 1.644e-05, | |
| "loss": 1.7329090118408204, | |
| "mean_token_accuracy": 0.6730512753129005, | |
| "num_tokens": 27273812.0, | |
| "step": 4590 | |
| }, | |
| { | |
| "entropy": 1.8000069722533225, | |
| "epoch": 7.904362776703202, | |
| "grad_norm": 0.8191949725151062, | |
| "learning_rate": 1.604e-05, | |
| "loss": 1.8508378982543945, | |
| "mean_token_accuracy": 0.6602330446243286, | |
| "num_tokens": 27334482.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "entropy": 1.6531485810875892, | |
| "epoch": 7.921555985385773, | |
| "grad_norm": 0.7952063679695129, | |
| "learning_rate": 1.5640000000000003e-05, | |
| "loss": 1.6732818603515625, | |
| "mean_token_accuracy": 0.6840143203735352, | |
| "num_tokens": 27390777.0, | |
| "step": 4610 | |
| }, | |
| { | |
| "entropy": 1.7451679170131684, | |
| "epoch": 7.938749194068343, | |
| "grad_norm": 0.7736355066299438, | |
| "learning_rate": 1.5240000000000001e-05, | |
| "loss": 1.836105728149414, | |
| "mean_token_accuracy": 0.6631482250988483, | |
| "num_tokens": 27452458.0, | |
| "step": 4620 | |
| }, | |
| { | |
| "entropy": 1.6219932287931442, | |
| "epoch": 7.955942402750914, | |
| "grad_norm": 0.7429597973823547, | |
| "learning_rate": 1.4840000000000002e-05, | |
| "loss": 1.6252763748168946, | |
| "mean_token_accuracy": 0.6922797068953515, | |
| "num_tokens": 27510793.0, | |
| "step": 4630 | |
| }, | |
| { | |
| "entropy": 1.7097622737288476, | |
| "epoch": 7.9731356114334835, | |
| "grad_norm": 0.7546749114990234, | |
| "learning_rate": 1.444e-05, | |
| "loss": 1.7529830932617188, | |
| "mean_token_accuracy": 0.6756818048655987, | |
| "num_tokens": 27570434.0, | |
| "step": 4640 | |
| }, | |
| { | |
| "entropy": 1.7681476891040802, | |
| "epoch": 7.990328820116054, | |
| "grad_norm": 0.8919919729232788, | |
| "learning_rate": 1.4040000000000001e-05, | |
| "loss": 1.8469413757324218, | |
| "mean_token_accuracy": 0.6651480123400688, | |
| "num_tokens": 27632017.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "entropy": 1.7464849283168842, | |
| "epoch": 8.006877283473028, | |
| "grad_norm": 0.8629288077354431, | |
| "learning_rate": 1.364e-05, | |
| "loss": 1.7770162582397462, | |
| "mean_token_accuracy": 0.6717489861048661, | |
| "num_tokens": 27687721.0, | |
| "step": 4660 | |
| }, | |
| { | |
| "entropy": 1.733792708069086, | |
| "epoch": 8.024070492155598, | |
| "grad_norm": 0.8012450337409973, | |
| "learning_rate": 1.324e-05, | |
| "loss": 1.7535259246826171, | |
| "mean_token_accuracy": 0.6781957261264324, | |
| "num_tokens": 27748609.0, | |
| "step": 4670 | |
| }, | |
| { | |
| "entropy": 1.673891542851925, | |
| "epoch": 8.041263700838169, | |
| "grad_norm": 0.8763530850410461, | |
| "learning_rate": 1.2839999999999999e-05, | |
| "loss": 1.7353546142578125, | |
| "mean_token_accuracy": 0.6773874297738075, | |
| "num_tokens": 27805200.0, | |
| "step": 4680 | |
| }, | |
| { | |
| "entropy": 1.6245143353939056, | |
| "epoch": 8.05845690952074, | |
| "grad_norm": 0.7880796194076538, | |
| "learning_rate": 1.244e-05, | |
| "loss": 1.6489152908325195, | |
| "mean_token_accuracy": 0.6891307681798935, | |
| "num_tokens": 27866189.0, | |
| "step": 4690 | |
| }, | |
| { | |
| "entropy": 1.7772031486034394, | |
| "epoch": 8.07565011820331, | |
| "grad_norm": 0.894481360912323, | |
| "learning_rate": 1.204e-05, | |
| "loss": 1.8237220764160156, | |
| "mean_token_accuracy": 0.6645158022642136, | |
| "num_tokens": 27929040.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "entropy": 1.6911936491727828, | |
| "epoch": 8.09284332688588, | |
| "grad_norm": 0.8212205171585083, | |
| "learning_rate": 1.164e-05, | |
| "loss": 1.718613624572754, | |
| "mean_token_accuracy": 0.6778515942394734, | |
| "num_tokens": 27989259.0, | |
| "step": 4710 | |
| }, | |
| { | |
| "entropy": 1.7341958984732628, | |
| "epoch": 8.110036535568451, | |
| "grad_norm": 0.8757619261741638, | |
| "learning_rate": 1.124e-05, | |
| "loss": 1.83496150970459, | |
| "mean_token_accuracy": 0.67105031311512, | |
| "num_tokens": 28051037.0, | |
| "step": 4720 | |
| }, | |
| { | |
| "entropy": 1.6540620133280755, | |
| "epoch": 8.127229744251022, | |
| "grad_norm": 0.6871177554130554, | |
| "learning_rate": 1.084e-05, | |
| "loss": 1.6868721008300782, | |
| "mean_token_accuracy": 0.6824644193053245, | |
| "num_tokens": 28117218.0, | |
| "step": 4730 | |
| }, | |
| { | |
| "entropy": 1.7760244339704514, | |
| "epoch": 8.144422952933592, | |
| "grad_norm": 0.8672593832015991, | |
| "learning_rate": 1.0440000000000002e-05, | |
| "loss": 1.8467548370361329, | |
| "mean_token_accuracy": 0.6605620160698891, | |
| "num_tokens": 28176643.0, | |
| "step": 4740 | |
| }, | |
| { | |
| "entropy": 1.6998422421514987, | |
| "epoch": 8.16161616161616, | |
| "grad_norm": 0.9853087663650513, | |
| "learning_rate": 1.004e-05, | |
| "loss": 1.7283611297607422, | |
| "mean_token_accuracy": 0.6775359824299813, | |
| "num_tokens": 28234550.0, | |
| "step": 4750 | |
| }, | |
| { | |
| "entropy": 1.7665151111781596, | |
| "epoch": 8.178809370298731, | |
| "grad_norm": 0.8272210955619812, | |
| "learning_rate": 9.640000000000001e-06, | |
| "loss": 1.8442218780517579, | |
| "mean_token_accuracy": 0.6675057601183653, | |
| "num_tokens": 28292004.0, | |
| "step": 4760 | |
| }, | |
| { | |
| "entropy": 1.7351939789950848, | |
| "epoch": 8.196002578981302, | |
| "grad_norm": 0.8758223652839661, | |
| "learning_rate": 9.24e-06, | |
| "loss": 1.7823253631591798, | |
| "mean_token_accuracy": 0.6717655852437019, | |
| "num_tokens": 28351089.0, | |
| "step": 4770 | |
| }, | |
| { | |
| "entropy": 1.7320286817848682, | |
| "epoch": 8.213195787663873, | |
| "grad_norm": 0.8538162708282471, | |
| "learning_rate": 8.840000000000002e-06, | |
| "loss": 1.758108139038086, | |
| "mean_token_accuracy": 0.6750058546662331, | |
| "num_tokens": 28411108.0, | |
| "step": 4780 | |
| }, | |
| { | |
| "entropy": 1.7250167533755303, | |
| "epoch": 8.230388996346443, | |
| "grad_norm": 0.8055081963539124, | |
| "learning_rate": 8.44e-06, | |
| "loss": 1.7342365264892579, | |
| "mean_token_accuracy": 0.6727670766413212, | |
| "num_tokens": 28469910.0, | |
| "step": 4790 | |
| }, | |
| { | |
| "entropy": 1.6715928614139557, | |
| "epoch": 8.247582205029014, | |
| "grad_norm": 0.8282851576805115, | |
| "learning_rate": 8.040000000000001e-06, | |
| "loss": 1.7284685134887696, | |
| "mean_token_accuracy": 0.6803247310221195, | |
| "num_tokens": 28528732.0, | |
| "step": 4800 | |
| }, | |
| { | |
| "entropy": 1.7717369854450227, | |
| "epoch": 8.264775413711584, | |
| "grad_norm": 0.7199074029922485, | |
| "learning_rate": 7.64e-06, | |
| "loss": 1.8089387893676758, | |
| "mean_token_accuracy": 0.6684400778263807, | |
| "num_tokens": 28591231.0, | |
| "step": 4810 | |
| }, | |
| { | |
| "entropy": 1.6829568967223167, | |
| "epoch": 8.281968622394155, | |
| "grad_norm": 0.8212400674819946, | |
| "learning_rate": 7.240000000000001e-06, | |
| "loss": 1.6901424407958985, | |
| "mean_token_accuracy": 0.6812582932412624, | |
| "num_tokens": 28651538.0, | |
| "step": 4820 | |
| }, | |
| { | |
| "entropy": 1.7792557999491692, | |
| "epoch": 8.299161831076725, | |
| "grad_norm": 0.8251553773880005, | |
| "learning_rate": 6.840000000000001e-06, | |
| "loss": 1.8440101623535157, | |
| "mean_token_accuracy": 0.6635224357247352, | |
| "num_tokens": 28713818.0, | |
| "step": 4830 | |
| }, | |
| { | |
| "entropy": 1.6888219453394413, | |
| "epoch": 8.316355039759294, | |
| "grad_norm": 0.799067497253418, | |
| "learning_rate": 6.44e-06, | |
| "loss": 1.7452951431274415, | |
| "mean_token_accuracy": 0.6766478583216667, | |
| "num_tokens": 28771713.0, | |
| "step": 4840 | |
| }, | |
| { | |
| "entropy": 1.6663143932819366, | |
| "epoch": 8.333548248441865, | |
| "grad_norm": 0.7468796968460083, | |
| "learning_rate": 6.040000000000001e-06, | |
| "loss": 1.6975286483764649, | |
| "mean_token_accuracy": 0.6818139903247357, | |
| "num_tokens": 28833584.0, | |
| "step": 4850 | |
| }, | |
| { | |
| "entropy": 1.736840507388115, | |
| "epoch": 8.350741457124435, | |
| "grad_norm": 0.9168211817741394, | |
| "learning_rate": 5.64e-06, | |
| "loss": 1.8019765853881835, | |
| "mean_token_accuracy": 0.6729365028440952, | |
| "num_tokens": 28891158.0, | |
| "step": 4860 | |
| }, | |
| { | |
| "entropy": 1.7159839145839215, | |
| "epoch": 8.367934665807006, | |
| "grad_norm": 0.8348814249038696, | |
| "learning_rate": 5.240000000000001e-06, | |
| "loss": 1.7910118103027344, | |
| "mean_token_accuracy": 0.67631860896945, | |
| "num_tokens": 28948026.0, | |
| "step": 4870 | |
| }, | |
| { | |
| "entropy": 1.7169093780219555, | |
| "epoch": 8.385127874489577, | |
| "grad_norm": 0.8493881821632385, | |
| "learning_rate": 4.84e-06, | |
| "loss": 1.7167430877685548, | |
| "mean_token_accuracy": 0.6753393478691578, | |
| "num_tokens": 29005197.0, | |
| "step": 4880 | |
| }, | |
| { | |
| "entropy": 1.6801239594817161, | |
| "epoch": 8.402321083172147, | |
| "grad_norm": 0.8069011569023132, | |
| "learning_rate": 4.440000000000001e-06, | |
| "loss": 1.6674100875854492, | |
| "mean_token_accuracy": 0.681441531330347, | |
| "num_tokens": 29062454.0, | |
| "step": 4890 | |
| }, | |
| { | |
| "entropy": 1.7267012923955918, | |
| "epoch": 8.419514291854718, | |
| "grad_norm": 0.8063756823539734, | |
| "learning_rate": 4.04e-06, | |
| "loss": 1.7544673919677733, | |
| "mean_token_accuracy": 0.6745367147028446, | |
| "num_tokens": 29121055.0, | |
| "step": 4900 | |
| }, | |
| { | |
| "entropy": 1.6062462359666825, | |
| "epoch": 8.436707500537288, | |
| "grad_norm": 0.8285024762153625, | |
| "learning_rate": 3.6400000000000003e-06, | |
| "loss": 1.6273128509521484, | |
| "mean_token_accuracy": 0.690464211255312, | |
| "num_tokens": 29176963.0, | |
| "step": 4910 | |
| }, | |
| { | |
| "entropy": 1.7958560451865195, | |
| "epoch": 8.453900709219859, | |
| "grad_norm": 0.8202657103538513, | |
| "learning_rate": 3.24e-06, | |
| "loss": 1.8311897277832032, | |
| "mean_token_accuracy": 0.661663169786334, | |
| "num_tokens": 29235880.0, | |
| "step": 4920 | |
| }, | |
| { | |
| "entropy": 1.665907260030508, | |
| "epoch": 8.47109391790243, | |
| "grad_norm": 0.8672494292259216, | |
| "learning_rate": 2.8400000000000003e-06, | |
| "loss": 1.6878423690795898, | |
| "mean_token_accuracy": 0.6819184564054013, | |
| "num_tokens": 29295823.0, | |
| "step": 4930 | |
| }, | |
| { | |
| "entropy": 1.7426866918802262, | |
| "epoch": 8.488287126584998, | |
| "grad_norm": 0.8398126363754272, | |
| "learning_rate": 2.4400000000000004e-06, | |
| "loss": 1.810443115234375, | |
| "mean_token_accuracy": 0.6639036998152733, | |
| "num_tokens": 29355386.0, | |
| "step": 4940 | |
| }, | |
| { | |
| "entropy": 1.6938614405691623, | |
| "epoch": 8.505480335267569, | |
| "grad_norm": 0.7652584314346313, | |
| "learning_rate": 2.0400000000000004e-06, | |
| "loss": 1.7690727233886718, | |
| "mean_token_accuracy": 0.6737098075449467, | |
| "num_tokens": 29414966.0, | |
| "step": 4950 | |
| }, | |
| { | |
| "entropy": 1.7538506165146828, | |
| "epoch": 8.52267354395014, | |
| "grad_norm": 0.8389163017272949, | |
| "learning_rate": 1.6400000000000002e-06, | |
| "loss": 1.8067062377929688, | |
| "mean_token_accuracy": 0.6728679880499839, | |
| "num_tokens": 29472960.0, | |
| "step": 4960 | |
| }, | |
| { | |
| "entropy": 1.7591105610132218, | |
| "epoch": 8.53986675263271, | |
| "grad_norm": 0.8280366063117981, | |
| "learning_rate": 1.24e-06, | |
| "loss": 1.7855098724365235, | |
| "mean_token_accuracy": 0.6670263484120369, | |
| "num_tokens": 29531300.0, | |
| "step": 4970 | |
| }, | |
| { | |
| "entropy": 1.6825189530849456, | |
| "epoch": 8.55705996131528, | |
| "grad_norm": 0.8177328109741211, | |
| "learning_rate": 8.4e-07, | |
| "loss": 1.731926727294922, | |
| "mean_token_accuracy": 0.6818420931696891, | |
| "num_tokens": 29591290.0, | |
| "step": 4980 | |
| }, | |
| { | |
| "entropy": 1.7112577512860299, | |
| "epoch": 8.574253169997851, | |
| "grad_norm": 0.8413036465644836, | |
| "learning_rate": 4.4e-07, | |
| "loss": 1.7446353912353516, | |
| "mean_token_accuracy": 0.6750271447002888, | |
| "num_tokens": 29646086.0, | |
| "step": 4990 | |
| }, | |
| { | |
| "entropy": 1.7419164210557938, | |
| "epoch": 8.591446378680422, | |
| "grad_norm": 0.9462088346481323, | |
| "learning_rate": 4e-08, | |
| "loss": 1.7870445251464844, | |
| "mean_token_accuracy": 0.666933435574174, | |
| "num_tokens": 29704815.0, | |
| "step": 5000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 5000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.438188209453138e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |