PEFT
Safetensors
English
cybersecurity
malware-analysis
att&ck
threat-intelligence
mixtral
lora
expert-adapters
cape-sandbox
digital-forensics
Instructions to use umer07/fathom-mixtral with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use umer07/fathom-mixtral with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
model = PeftModel.from_pretrained(base_model, "umer07/fathom-mixtral")
```
- Notebooks
- Google Colab
- Kaggle
| [ | |
| { | |
| "loss": 1.5327, | |
| "grad_norm": 0.5706859827041626, | |
| "learning_rate": 4.5e-05, | |
| "entropy": 1.2147093176841737, | |
| "num_tokens": 156309.0, | |
| "mean_token_accuracy": 0.659215685725212, | |
| "epoch": 0.03355704697986577, | |
| "step": 10 | |
| }, | |
| { | |
| "loss": 1.1979, | |
| "grad_norm": 1.02685546875, | |
| "learning_rate": 9.5e-05, | |
| "entropy": 1.2502837777137756, | |
| "num_tokens": 307512.0, | |
| "mean_token_accuracy": 0.7190202981233597, | |
| "epoch": 0.06711409395973154, | |
| "step": 20 | |
| }, | |
| { | |
| "loss": 0.6687, | |
| "grad_norm": 0.5571576356887817, | |
| "learning_rate": 9.999075138471951e-05, | |
| "entropy": 0.6448976010084152, | |
| "num_tokens": 461226.0, | |
| "mean_token_accuracy": 0.8424648404121399, | |
| "epoch": 0.10067114093959731, | |
| "step": 30 | |
| }, | |
| { | |
| "loss": 0.4812, | |
| "grad_norm": 0.38987967371940613, | |
| "learning_rate": 9.995878525539525e-05, | |
| "entropy": 0.47405652403831483, | |
| "num_tokens": 618054.0, | |
| "mean_token_accuracy": 0.8851036787033081, | |
| "epoch": 0.1342281879194631, | |
| "step": 40 | |
| }, | |
| { | |
| "loss": 0.3701, | |
| "grad_norm": 0.2643287777900696, | |
| "learning_rate": 9.990400202763563e-05, | |
| "entropy": 0.3693989284336567, | |
| "num_tokens": 775209.0, | |
| "mean_token_accuracy": 0.9097828209400177, | |
| "epoch": 0.16778523489932887, | |
| "step": 50 | |
| }, | |
| { | |
| "loss": 0.3193, | |
| "grad_norm": 0.27373382449150085, | |
| "learning_rate": 9.982642672195092e-05, | |
| "entropy": 0.32156657576560976, | |
| "num_tokens": 931365.0, | |
| "mean_token_accuracy": 0.920613020658493, | |
| "epoch": 0.20134228187919462, | |
| "step": 60 | |
| }, | |
| { | |
| "loss": 0.2866, | |
| "grad_norm": 0.3094562292098999, | |
| "learning_rate": 9.972609476841367e-05, | |
| "entropy": 0.292499927431345, | |
| "num_tokens": 1086695.0, | |
| "mean_token_accuracy": 0.9282213032245636, | |
| "epoch": 0.2348993288590604, | |
| "step": 70 | |
| }, | |
| { | |
| "loss": 0.2489, | |
| "grad_norm": 0.2749999761581421, | |
| "learning_rate": 9.960305199047712e-05, | |
| "entropy": 0.2526983417570591, | |
| "num_tokens": 1241470.0, | |
| "mean_token_accuracy": 0.9362666577100753, | |
| "epoch": 0.2684563758389262, | |
| "step": 80 | |
| }, | |
| { | |
| "loss": 0.2459, | |
| "grad_norm": 0.2721326947212219, | |
| "learning_rate": 9.945735458404681e-05, | |
| "entropy": 0.24655402898788453, | |
| "num_tokens": 1396709.0, | |
| "mean_token_accuracy": 0.9367562144994735, | |
| "epoch": 0.30201342281879195, | |
| "step": 90 | |
| }, | |
| { | |
| "loss": 0.2435, | |
| "grad_norm": 0.21973158419132233, | |
| "learning_rate": 9.928906909181481e-05, | |
| "entropy": 0.24917888268828392, | |
| "num_tokens": 1555025.0, | |
| "mean_token_accuracy": 0.9364570289850235, | |
| "epoch": 0.33557046979865773, | |
| "step": 100 | |
| }, | |
| { | |
| "loss": 0.2347, | |
| "grad_norm": 0.26128071546554565, | |
| "learning_rate": 9.909827237286849e-05, | |
| "entropy": 0.23597020357847215, | |
| "num_tokens": 1706763.0, | |
| "mean_token_accuracy": 0.9403788715600967, | |
| "epoch": 0.3691275167785235, | |
| "step": 110 | |
| }, | |
| { | |
| "loss": 0.2478, | |
| "grad_norm": 0.20566503703594208, | |
| "learning_rate": 9.888505156758759e-05, | |
| "entropy": 0.2498910292983055, | |
| "num_tokens": 1867753.0, | |
| "mean_token_accuracy": 0.9367677628993988, | |
| "epoch": 0.40268456375838924, | |
| "step": 120 | |
| }, | |
| { | |
| "loss": 0.2417, | |
| "grad_norm": 0.190927654504776, | |
| "learning_rate": 9.864950405784551e-05, | |
| "entropy": 0.24509735256433487, | |
| "num_tokens": 2027436.0, | |
| "mean_token_accuracy": 0.9376339435577392, | |
| "epoch": 0.436241610738255, | |
| "step": 130 | |
| }, | |
| { | |
| "loss": 0.2196, | |
| "grad_norm": 0.1996077448129654, | |
| "learning_rate": 9.839173742253334e-05, | |
| "entropy": 0.22244628816843032, | |
| "num_tokens": 2182978.0, | |
| "mean_token_accuracy": 0.9424097180366516, | |
| "epoch": 0.4697986577181208, | |
| "step": 140 | |
| }, | |
| { | |
| "loss": 0.2534, | |
| "grad_norm": 0.21034187078475952, | |
| "learning_rate": 9.811186938842645e-05, | |
| "entropy": 0.2574092894792557, | |
| "num_tokens": 2343108.0, | |
| "mean_token_accuracy": 0.9341461777687072, | |
| "epoch": 0.5033557046979866, | |
| "step": 150 | |
| }, | |
| { | |
| "loss": 0.2369, | |
| "grad_norm": 0.20116521418094635, | |
| "learning_rate": 9.781002777641664e-05, | |
| "entropy": 0.23710015863180162, | |
| "num_tokens": 2503180.0, | |
| "mean_token_accuracy": 0.9385264277458191, | |
| "epoch": 0.5369127516778524, | |
| "step": 160 | |
| }, | |
| { | |
| "loss": 0.2194, | |
| "grad_norm": 0.20917288959026337, | |
| "learning_rate": 9.748635044313386e-05, | |
| "entropy": 0.22300637513399124, | |
| "num_tokens": 2658840.0, | |
| "mean_token_accuracy": 0.9410306721925735, | |
| "epoch": 0.5704697986577181, | |
| "step": 170 | |
| }, | |
| { | |
| "loss": 0.2295, | |
| "grad_norm": 0.19810253381729126, | |
| "learning_rate": 9.714098521798465e-05, | |
| "entropy": 0.23066828176379203, | |
| "num_tokens": 2818656.0, | |
| "mean_token_accuracy": 0.9388932436704636, | |
| "epoch": 0.6040268456375839, | |
| "step": 180 | |
| }, | |
| { | |
| "loss": 0.231, | |
| "grad_norm": 0.17845278978347778, | |
| "learning_rate": 9.677408983563565e-05, | |
| "entropy": 0.23543039262294768, | |
| "num_tokens": 2975473.0, | |
| "mean_token_accuracy": 0.9391510993242264, | |
| "epoch": 0.6375838926174496, | |
| "step": 190 | |
| }, | |
| { | |
| "loss": 0.2152, | |
| "grad_norm": 0.23437312245368958, | |
| "learning_rate": 9.638583186397331e-05, | |
| "entropy": 0.21993101984262467, | |
| "num_tokens": 3130452.0, | |
| "mean_token_accuracy": 0.9423492521047592, | |
| "epoch": 0.6711409395973155, | |
| "step": 200 | |
| }, | |
| { | |
| "loss": 0.204, | |
| "grad_norm": 0.2377660572528839, | |
| "learning_rate": 9.597638862757255e-05, | |
| "entropy": 0.2064872995018959, | |
| "num_tokens": 3287490.0, | |
| "mean_token_accuracy": 0.9451317518949509, | |
| "epoch": 0.7046979865771812, | |
| "step": 210 | |
| }, | |
| { | |
| "loss": 0.2158, | |
| "grad_norm": 0.17862877249717712, | |
| "learning_rate": 9.554594712670926e-05, | |
| "entropy": 0.2166791968047619, | |
| "num_tokens": 3446266.0, | |
| "mean_token_accuracy": 0.9421544283628464, | |
| "epoch": 0.738255033557047, | |
| "step": 220 | |
| }, | |
| { | |
| "loss": 0.2253, | |
| "grad_norm": 0.1511947214603424, | |
| "learning_rate": 9.509470395195399e-05, | |
| "entropy": 0.22676953300833702, | |
| "num_tokens": 3602351.0, | |
| "mean_token_accuracy": 0.939826962351799, | |
| "epoch": 0.7718120805369127, | |
| "step": 230 | |
| }, | |
| { | |
| "loss": 0.2201, | |
| "grad_norm": 0.26649826765060425, | |
| "learning_rate": 9.46228651943853e-05, | |
| "entropy": 0.22365730181336402, | |
| "num_tokens": 3757932.0, | |
| "mean_token_accuracy": 0.9410541921854019, | |
| "epoch": 0.8053691275167785, | |
| "step": 240 | |
| }, | |
| { | |
| "loss": 0.2026, | |
| "grad_norm": 0.19634084403514862, | |
| "learning_rate": 9.413064635146418e-05, | |
| "entropy": 0.20426617376506329, | |
| "num_tokens": 3912346.0, | |
| "mean_token_accuracy": 0.9445015460252761, | |
| "epoch": 0.8389261744966443, | |
| "step": 250 | |
| }, | |
| { | |
| "loss": 0.2201, | |
| "grad_norm": 0.19683024287223816, | |
| "learning_rate": 9.361827222861241e-05, | |
| "entropy": 0.22703441381454467, | |
| "num_tokens": 4071117.0, | |
| "mean_token_accuracy": 0.9421917259693146, | |
| "epoch": 0.87248322147651, | |
| "step": 260 | |
| }, | |
| { | |
| "loss": 0.2048, | |
| "grad_norm": 0.184284046292305, | |
| "learning_rate": 9.308597683653975e-05, | |
| "entropy": 0.20578452795743943, | |
| "num_tokens": 4226750.0, | |
| "mean_token_accuracy": 0.9451743960380554, | |
| "epoch": 0.9060402684563759, | |
| "step": 270 | |
| }, | |
| { | |
| "loss": 0.2181, | |
| "grad_norm": 0.19832965731620789, | |
| "learning_rate": 9.253400328436699e-05, | |
| "entropy": 0.22092150747776032, | |
| "num_tokens": 4385096.0, | |
| "mean_token_accuracy": 0.9426127344369888, | |
| "epoch": 0.9395973154362416, | |
| "step": 280 | |
| }, | |
| { | |
| "loss": 0.2178, | |
| "grad_norm": 0.16237328946590424, | |
| "learning_rate": 9.196260366859342e-05, | |
| "entropy": 0.22047539427876472, | |
| "num_tokens": 4544552.0, | |
| "mean_token_accuracy": 0.9419336885213851, | |
| "epoch": 0.9731543624161074, | |
| "step": 290 | |
| }, | |
| { | |
| "loss": 0.2188, | |
| "grad_norm": 0.19606098532676697, | |
| "learning_rate": 9.137203895795983e-05, | |
| "entropy": 0.2207832932472229, | |
| "num_tokens": 4701426.0, | |
| "mean_token_accuracy": 0.941154745221138, | |
| "epoch": 1.0067114093959733, | |
| "step": 300 | |
| }, | |
| { | |
| "loss": 0.1948, | |
| "grad_norm": 0.2281929850578308, | |
| "learning_rate": 9.076257887425923e-05, | |
| "entropy": 0.19738901741802692, | |
| "num_tokens": 4858846.0, | |
| "mean_token_accuracy": 0.9471817642450333, | |
| "epoch": 1.0402684563758389, | |
| "step": 310 | |
| }, | |
| { | |
| "loss": 0.2037, | |
| "grad_norm": 0.19096426665782928, | |
| "learning_rate": 9.01345017691499e-05, | |
| "entropy": 0.2047410562634468, | |
| "num_tokens": 5015590.0, | |
| "mean_token_accuracy": 0.9454032570123673, | |
| "epoch": 1.0738255033557047, | |
| "step": 320 | |
| }, | |
| { | |
| "loss": 0.2023, | |
| "grad_norm": 0.19078002870082855, | |
| "learning_rate": 8.948809449702711e-05, | |
| "entropy": 0.21062878221273423, | |
| "num_tokens": 5173149.0, | |
| "mean_token_accuracy": 0.9446896910667419, | |
| "epoch": 1.1073825503355705, | |
| "step": 330 | |
| }, | |
| { | |
| "loss": 0.1997, | |
| "grad_norm": 0.26884037256240845, | |
| "learning_rate": 8.882365228401139e-05, | |
| "entropy": 0.20285834297537803, | |
| "num_tokens": 5331307.0, | |
| "mean_token_accuracy": 0.9449720442295074, | |
| "epoch": 1.1409395973154361, | |
| "step": 340 | |
| }, | |
| { | |
| "loss": 0.1967, | |
| "grad_norm": 0.23782381415367126, | |
| "learning_rate": 8.814147859311332e-05, | |
| "entropy": 0.1966269500553608, | |
| "num_tokens": 5486558.0, | |
| "mean_token_accuracy": 0.9466675192117691, | |
| "epoch": 1.174496644295302, | |
| "step": 350 | |
| }, | |
| { | |
| "loss": 0.2135, | |
| "grad_norm": 0.19140051305294037, | |
| "learning_rate": 8.744188498563641e-05, | |
| "entropy": 0.21856041625142097, | |
| "num_tokens": 5643311.0, | |
| "mean_token_accuracy": 0.9413786977529526, | |
| "epoch": 1.2080536912751678, | |
| "step": 360 | |
| }, | |
| { | |
| "loss": 0.1956, | |
| "grad_norm": 0.2523305118083954, | |
| "learning_rate": 8.672519097888126e-05, | |
| "entropy": 0.19831475540995597, | |
| "num_tokens": 5798511.0, | |
| "mean_token_accuracy": 0.946366423368454, | |
| "epoch": 1.2416107382550337, | |
| "step": 370 | |
| }, | |
| { | |
| "loss": 0.1895, | |
| "grad_norm": 0.16881771385669708, | |
| "learning_rate": 8.599172390021615e-05, | |
| "entropy": 0.1935427539050579, | |
| "num_tokens": 5954868.0, | |
| "mean_token_accuracy": 0.9482783049345016, | |
| "epoch": 1.2751677852348993, | |
| "step": 380 | |
| }, | |
| { | |
| "loss": 0.1857, | |
| "grad_norm": 0.15226209163665771, | |
| "learning_rate": 8.524181873758059e-05, | |
| "entropy": 0.18582973293960095, | |
| "num_tokens": 6108894.0, | |
| "mean_token_accuracy": 0.9488643199205399, | |
| "epoch": 1.308724832214765, | |
| "step": 390 | |
| }, | |
| { | |
| "loss": 0.2078, | |
| "grad_norm": 0.19812646508216858, | |
| "learning_rate": 8.447581798649014e-05, | |
| "entropy": 0.20552278831601142, | |
| "num_tokens": 6267764.0, | |
| "mean_token_accuracy": 0.9442088425159454, | |
| "epoch": 1.342281879194631, | |
| "step": 400 | |
| }, | |
| { | |
| "loss": 0.2074, | |
| "grad_norm": 0.188148632645607, | |
| "learning_rate": 8.369407149361241e-05, | |
| "entropy": 0.21127415373921393, | |
| "num_tokens": 6424716.0, | |
| "mean_token_accuracy": 0.9449205726385117, | |
| "epoch": 1.3758389261744965, | |
| "step": 410 | |
| }, | |
| { | |
| "loss": 0.1948, | |
| "grad_norm": 0.19974718987941742, | |
| "learning_rate": 8.289693629698564e-05, | |
| "entropy": 0.19758303910493852, | |
| "num_tokens": 6579760.0, | |
| "mean_token_accuracy": 0.9469632744789124, | |
| "epoch": 1.4093959731543624, | |
| "step": 420 | |
| }, | |
| { | |
| "loss": 0.2144, | |
| "grad_norm": 0.2088315635919571, | |
| "learning_rate": 8.208477646295277e-05, | |
| "entropy": 0.21340604722499848, | |
| "num_tokens": 6737911.0, | |
| "mean_token_accuracy": 0.9427065312862396, | |
| "epoch": 1.4429530201342282, | |
| "step": 430 | |
| }, | |
| { | |
| "loss": 0.2145, | |
| "grad_norm": 0.31156837940216064, | |
| "learning_rate": 8.125796291988577e-05, | |
| "entropy": 0.21349589377641678, | |
| "num_tokens": 6894903.0, | |
| "mean_token_accuracy": 0.9425385951995849, | |
| "epoch": 1.476510067114094, | |
| "step": 440 | |
| }, | |
| { | |
| "loss": 0.2159, | |
| "grad_norm": 0.18016791343688965, | |
| "learning_rate": 8.041687328877567e-05, | |
| "entropy": 0.2204777255654335, | |
| "num_tokens": 7054943.0, | |
| "mean_token_accuracy": 0.9415547668933868, | |
| "epoch": 1.5100671140939599, | |
| "step": 450 | |
| }, | |
| { | |
| "loss": 0.202, | |
| "grad_norm": 0.2058987319469452, | |
| "learning_rate": 7.956189171076616e-05, | |
| "entropy": 0.20117317102849483, | |
| "num_tokens": 7211170.0, | |
| "mean_token_accuracy": 0.9456082373857498, | |
| "epoch": 1.5436241610738255, | |
| "step": 460 | |
| }, | |
| { | |
| "loss": 0.1939, | |
| "grad_norm": 0.15302836894989014, | |
| "learning_rate": 7.869340867170928e-05, | |
| "entropy": 0.19479563012719153, | |
| "num_tokens": 7369121.0, | |
| "mean_token_accuracy": 0.9471449553966522, | |
| "epoch": 1.5771812080536913, | |
| "step": 470 | |
| }, | |
| { | |
| "loss": 0.1886, | |
| "grad_norm": 0.16370812058448792, | |
| "learning_rate": 7.781182082382325e-05, | |
| "entropy": 0.18975077904760837, | |
| "num_tokens": 7525693.0, | |
| "mean_token_accuracy": 0.9485057532787323, | |
| "epoch": 1.610738255033557, | |
| "step": 480 | |
| }, | |
| { | |
| "loss": 0.2023, | |
| "grad_norm": 0.1960698664188385, | |
| "learning_rate": 7.691753080453412e-05, | |
| "entropy": 0.2045104220509529, | |
| "num_tokens": 7683086.0, | |
| "mean_token_accuracy": 0.9456699937582016, | |
| "epoch": 1.6442953020134228, | |
| "step": 490 | |
| }, | |
| { | |
| "loss": 0.1991, | |
| "grad_norm": 0.18973101675510406, | |
| "learning_rate": 7.60109470525839e-05, | |
| "entropy": 0.20317464768886567, | |
| "num_tokens": 7840651.0, | |
| "mean_token_accuracy": 0.9453895062208175, | |
| "epoch": 1.6778523489932886, | |
| "step": 500 | |
| }, | |
| { | |
| "loss": 0.1939, | |
| "grad_norm": 0.19162558019161224, | |
| "learning_rate": 7.509248362148889e-05, | |
| "entropy": 0.19502840861678122, | |
| "num_tokens": 7997627.0, | |
| "mean_token_accuracy": 0.9481672704219818, | |
| "epoch": 1.7114093959731544, | |
| "step": 510 | |
| }, | |
| { | |
| "loss": 0.1921, | |
| "grad_norm": 0.18042920529842377, | |
| "learning_rate": 7.416255999043401e-05, | |
| "entropy": 0.19183044098317623, | |
| "num_tokens": 8153075.0, | |
| "mean_token_accuracy": 0.9467999368906022, | |
| "epoch": 1.7449664429530203, | |
| "step": 520 | |
| }, | |
| { | |
| "loss": 0.1788, | |
| "grad_norm": 0.1608305722475052, | |
| "learning_rate": 7.322160087268877e-05, | |
| "entropy": 0.18251866959035395, | |
| "num_tokens": 8307904.0, | |
| "mean_token_accuracy": 0.9506349295377732, | |
| "epoch": 1.778523489932886, | |
| "step": 530 | |
| }, | |
| { | |
| "loss": 0.1808, | |
| "grad_norm": 0.18992868065834045, | |
| "learning_rate": 7.227003602163295e-05, | |
| "entropy": 0.1834562122821808, | |
| "num_tokens": 8464236.0, | |
| "mean_token_accuracy": 0.9505033463239669, | |
| "epoch": 1.8120805369127517, | |
| "step": 540 | |
| }, | |
| { | |
| "loss": 0.1868, | |
| "grad_norm": 0.21314968168735504, | |
| "learning_rate": 7.130830003448032e-05, | |
| "entropy": 0.1886416744440794, | |
| "num_tokens": 8622442.0, | |
| "mean_token_accuracy": 0.9492894530296325, | |
| "epoch": 1.8456375838926173, | |
| "step": 550 | |
| }, | |
| { | |
| "loss": 0.19, | |
| "grad_norm": 0.196988046169281, | |
| "learning_rate": 7.033683215379002e-05, | |
| "entropy": 0.1898799568414688, | |
| "num_tokens": 8778712.0, | |
| "mean_token_accuracy": 0.9487443953752518, | |
| "epoch": 1.8791946308724832, | |
| "step": 560 | |
| }, | |
| { | |
| "loss": 0.1872, | |
| "grad_norm": 0.19480957090854645, | |
| "learning_rate": 6.935607606685642e-05, | |
| "entropy": 0.1890778660774231, | |
| "num_tokens": 8934261.0, | |
| "mean_token_accuracy": 0.9492935538291931, | |
| "epoch": 1.912751677852349, | |
| "step": 570 | |
| }, | |
| { | |
| "loss": 0.1923, | |
| "grad_norm": 0.1902640014886856, | |
| "learning_rate": 6.836647970306894e-05, | |
| "entropy": 0.1931080285459757, | |
| "num_tokens": 9093573.0, | |
| "mean_token_accuracy": 0.9487350136041641, | |
| "epoch": 1.9463087248322148, | |
| "step": 580 | |
| }, | |
| { | |
| "loss": 0.1817, | |
| "grad_norm": 0.19666995108127594, | |
| "learning_rate": 6.736849502933452e-05, | |
| "entropy": 0.18354742750525474, | |
| "num_tokens": 9247665.0, | |
| "mean_token_accuracy": 0.9502300530672073, | |
| "epoch": 1.9798657718120807, | |
| "step": 590 | |
| }, | |
| { | |
| "loss": 0.1859, | |
| "grad_norm": 0.2059757262468338, | |
| "learning_rate": 6.636257784365584e-05, | |
| "entropy": 0.18999958783388138, | |
| "num_tokens": 9402433.0, | |
| "mean_token_accuracy": 0.9492688685655594, | |
| "epoch": 2.0134228187919465, | |
| "step": 600 | |
| }, | |
| { | |
| "loss": 0.1847, | |
| "grad_norm": 0.1790536344051361, | |
| "learning_rate": 6.53491875669601e-05, | |
| "entropy": 0.18768733143806457, | |
| "num_tokens": 9559144.0, | |
| "mean_token_accuracy": 0.949212983250618, | |
| "epoch": 2.046979865771812, | |
| "step": 610 | |
| }, | |
| { | |
| "loss": 0.1642, | |
| "grad_norm": 0.21339593827724457, | |
| "learning_rate": 6.432878703327298e-05, | |
| "entropy": 0.16429513394832612, | |
| "num_tokens": 9714865.0, | |
| "mean_token_accuracy": 0.9541611701250077, | |
| "epoch": 2.0805369127516777, | |
| "step": 620 | |
| }, | |
| { | |
| "loss": 0.1855, | |
| "grad_norm": 0.21740688383579254, | |
| "learning_rate": 6.330184227833376e-05, | |
| "entropy": 0.18604125529527665, | |
| "num_tokens": 9872250.0, | |
| "mean_token_accuracy": 0.9479648023843765, | |
| "epoch": 2.1140939597315436, | |
| "step": 630 | |
| }, | |
| { | |
| "loss": 0.1898, | |
| "grad_norm": 0.1908544898033142, | |
| "learning_rate": 6.226882232674825e-05, | |
| "entropy": 0.19123471677303314, | |
| "num_tokens": 10026954.0, | |
| "mean_token_accuracy": 0.9476721823215485, | |
| "epoch": 2.1476510067114094, | |
| "step": 640 | |
| }, | |
| { | |
| "loss": 0.1652, | |
| "grad_norm": 0.2003369778394699, | |
| "learning_rate": 6.123019897777657e-05, | |
| "entropy": 0.16700895316898823, | |
| "num_tokens": 10177873.0, | |
| "mean_token_accuracy": 0.9538896352052688, | |
| "epoch": 2.1812080536912752, | |
| "step": 650 | |
| }, | |
| { | |
| "loss": 0.2036, | |
| "grad_norm": 0.22211240231990814, | |
| "learning_rate": 6.0186446589853784e-05, | |
| "entropy": 0.20571780651807786, | |
| "num_tokens": 10338481.0, | |
| "mean_token_accuracy": 0.9439575403928757, | |
| "epoch": 2.214765100671141, | |
| "step": 660 | |
| }, | |
| { | |
| "loss": 0.2002, | |
| "grad_norm": 0.23921112716197968, | |
| "learning_rate": 5.9138041863941616e-05, | |
| "entropy": 0.19964271709322928, | |
| "num_tokens": 10495720.0, | |
| "mean_token_accuracy": 0.9459815502166748, | |
| "epoch": 2.248322147651007, | |
| "step": 670 | |
| }, | |
| { | |
| "loss": 0.1989, | |
| "grad_norm": 0.20520520210266113, | |
| "learning_rate": 5.808546362581032e-05, | |
| "entropy": 0.20052680410444737, | |
| "num_tokens": 10655224.0, | |
| "mean_token_accuracy": 0.9459189057350159, | |
| "epoch": 2.2818791946308723, | |
| "step": 680 | |
| }, | |
| { | |
| "loss": 0.1803, | |
| "grad_norm": 0.20859530568122864, | |
| "learning_rate": 5.7029192607350146e-05, | |
| "entropy": 0.18437107987701892, | |
| "num_tokens": 10812393.0, | |
| "mean_token_accuracy": 0.9512098997831344, | |
| "epoch": 2.315436241610738, | |
| "step": 690 | |
| }, | |
| { | |
| "loss": 0.1625, | |
| "grad_norm": 0.21114759147167206, | |
| "learning_rate": 5.596971122701221e-05, | |
| "entropy": 0.16424274519085885, | |
| "num_tokens": 10965653.0, | |
| "mean_token_accuracy": 0.9543471187353134, | |
| "epoch": 2.348993288590604, | |
| "step": 700 | |
| }, | |
| { | |
| "loss": 0.164, | |
| "grad_norm": 0.19510741531848907, | |
| "learning_rate": 5.4907503369479116e-05, | |
| "entropy": 0.16679177805781364, | |
| "num_tokens": 11120539.0, | |
| "mean_token_accuracy": 0.954514691233635, | |
| "epoch": 2.38255033557047, | |
| "step": 710 | |
| }, | |
| { | |
| "loss": 0.1772, | |
| "grad_norm": 0.244350403547287, | |
| "learning_rate": 5.384305416466584e-05, | |
| "entropy": 0.17797317542135715, | |
| "num_tokens": 11275790.0, | |
| "mean_token_accuracy": 0.950087907910347, | |
| "epoch": 2.4161073825503356, | |
| "step": 720 | |
| }, | |
| { | |
| "loss": 0.1835, | |
| "grad_norm": 0.19845031201839447, | |
| "learning_rate": 5.2776849766152e-05, | |
| "entropy": 0.18638909384608268, | |
| "num_tokens": 11430204.0, | |
| "mean_token_accuracy": 0.9490520358085632, | |
| "epoch": 2.4496644295302015, | |
| "step": 730 | |
| }, | |
| { | |
| "loss": 0.1883, | |
| "grad_norm": 0.20039933919906616, | |
| "learning_rate": 5.170937712914655e-05, | |
| "entropy": 0.18770763501524926, | |
| "num_tokens": 11592345.0, | |
| "mean_token_accuracy": 0.9483928889036178, | |
| "epoch": 2.4832214765100673, | |
| "step": 740 | |
| }, | |
| { | |
| "loss": 0.1528, | |
| "grad_norm": 0.20752805471420288, | |
| "learning_rate": 5.064112378808637e-05, | |
| "entropy": 0.15445428304374217, | |
| "num_tokens": 11744570.0, | |
| "mean_token_accuracy": 0.9564135998487473, | |
| "epoch": 2.5167785234899327, | |
| "step": 750 | |
| }, | |
| { | |
| "loss": 0.1793, | |
| "grad_norm": 0.2342364341020584, | |
| "learning_rate": 4.957257763397024e-05, | |
| "entropy": 0.18113909810781478, | |
| "num_tokens": 11901062.0, | |
| "mean_token_accuracy": 0.9501085758209229, | |
| "epoch": 2.5503355704697985, | |
| "step": 760 | |
| }, | |
| { | |
| "loss": 0.1572, | |
| "grad_norm": 0.20083309710025787, | |
| "learning_rate": 4.850422669153009e-05, | |
| "entropy": 0.16168876476585864, | |
| "num_tokens": 12053753.0, | |
| "mean_token_accuracy": 0.9557742238044739, | |
| "epoch": 2.5838926174496644, | |
| "step": 770 | |
| }, | |
| { | |
| "loss": 0.1625, | |
| "grad_norm": 0.20891757309436798, | |
| "learning_rate": 4.743655889634105e-05, | |
| "entropy": 0.16362526044249534, | |
| "num_tokens": 12208995.0, | |
| "mean_token_accuracy": 0.9552077889442444, | |
| "epoch": 2.61744966442953, | |
| "step": 780 | |
| }, | |
| { | |
| "loss": 0.1567, | |
| "grad_norm": 0.21302001178264618, | |
| "learning_rate": 4.6370061871972326e-05, | |
| "entropy": 0.15993254519999028, | |
| "num_tokens": 12364530.0, | |
| "mean_token_accuracy": 0.955506107211113, | |
| "epoch": 2.651006711409396, | |
| "step": 790 | |
| }, | |
| { | |
| "loss": 0.1705, | |
| "grad_norm": 0.19647525250911713, | |
| "learning_rate": 4.530522270728048e-05, | |
| "entropy": 0.1731419090181589, | |
| "num_tokens": 12522512.0, | |
| "mean_token_accuracy": 0.9523056238889694, | |
| "epoch": 2.684563758389262, | |
| "step": 800 | |
| }, | |
| { | |
| "loss": 0.1656, | |
| "grad_norm": 0.23564544320106506, | |
| "learning_rate": 4.424252773394704e-05, | |
| "entropy": 0.16778925359249114, | |
| "num_tokens": 12678022.0, | |
| "mean_token_accuracy": 0.9539547711610794, | |
| "epoch": 2.7181208053691277, | |
| "step": 810 | |
| }, | |
| { | |
| "loss": 0.1862, | |
| "grad_norm": 0.19549453258514404, | |
| "learning_rate": 4.318246230436174e-05, | |
| "entropy": 0.1878517519682646, | |
| "num_tokens": 12838063.0, | |
| "mean_token_accuracy": 0.949842843413353, | |
| "epoch": 2.751677852348993, | |
| "step": 820 | |
| }, | |
| { | |
| "loss": 0.1687, | |
| "grad_norm": 0.23449555039405823, | |
| "learning_rate": 4.212551056995323e-05, | |
| "entropy": 0.17150973305106162, | |
| "num_tokens": 12993073.0, | |
| "mean_token_accuracy": 0.953241491317749, | |
| "epoch": 2.785234899328859, | |
| "step": 830 | |
| }, | |
| { | |
| "loss": 0.181, | |
| "grad_norm": 0.19433574378490448, | |
| "learning_rate": 4.107215526006817e-05, | |
| "entropy": 0.1791513752192259, | |
| "num_tokens": 13151000.0, | |
| "mean_token_accuracy": 0.9498830765485764, | |
| "epoch": 2.8187919463087248, | |
| "step": 840 | |
| }, | |
| { | |
| "loss": 0.1782, | |
| "grad_norm": 0.31362298130989075, | |
| "learning_rate": 4.00228774614998e-05, | |
| "entropy": 0.18104747980833052, | |
| "num_tokens": 13306740.0, | |
| "mean_token_accuracy": 0.9511455327272416, | |
| "epoch": 2.8523489932885906, | |
| "step": 850 | |
| }, | |
| { | |
| "loss": 0.1841, | |
| "grad_norm": 0.19440777599811554, | |
| "learning_rate": 3.897815639876673e-05, | |
| "entropy": 0.183981429412961, | |
| "num_tokens": 13465413.0, | |
| "mean_token_accuracy": 0.9499198466539382, | |
| "epoch": 2.8859060402684564, | |
| "step": 860 | |
| }, | |
| { | |
| "loss": 0.1727, | |
| "grad_norm": 0.22845511138439178, | |
| "learning_rate": 3.793846921524237e-05, | |
| "entropy": 0.17417334467172624, | |
| "num_tokens": 13625300.0, | |
| "mean_token_accuracy": 0.9522460520267486, | |
| "epoch": 2.9194630872483223, | |
| "step": 870 | |
| }, | |
| { | |
| "loss": 0.1917, | |
| "grad_norm": 0.17185327410697937, | |
| "learning_rate": 3.6904290755234604e-05, | |
| "entropy": 0.19201257564127444, | |
| "num_tokens": 13787633.0, | |
| "mean_token_accuracy": 0.9470277667045593, | |
| "epoch": 2.953020134228188, | |
| "step": 880 | |
| }, | |
| { | |
| "loss": 0.181, | |
| "grad_norm": 0.22177894413471222, | |
| "learning_rate": 3.587609334711576e-05, | |
| "entropy": 0.18371602855622768, | |
| "num_tokens": 13946986.0, | |
| "mean_token_accuracy": 0.9502452671527862, | |
| "epoch": 2.9865771812080535, | |
| "step": 890 | |
| }, | |
| { | |
| "loss": 0.1639, | |
| "grad_norm": 0.20057617127895355, | |
| "learning_rate": 3.48543465876014e-05, | |
| "entropy": 0.16733385995030403, | |
| "num_tokens": 14102933.0, | |
| "mean_token_accuracy": 0.9548174262046814, | |
| "epoch": 3.0201342281879193, | |
| "step": 900 | |
| }, | |
| { | |
| "loss": 0.1657, | |
| "grad_norm": 0.234299898147583, | |
| "learning_rate": 3.383951712727701e-05, | |
| "entropy": 0.1675445105880499, | |
| "num_tokens": 14259753.0, | |
| "mean_token_accuracy": 0.9535206586122513, | |
| "epoch": 3.053691275167785, | |
| "step": 910 | |
| }, | |
| { | |
| "loss": 0.1583, | |
| "grad_norm": 0.23432078957557678, | |
| "learning_rate": 3.2832068457469945e-05, | |
| "entropy": 0.1605758022516966, | |
| "num_tokens": 14418531.0, | |
| "mean_token_accuracy": 0.955757224559784, | |
| "epoch": 3.087248322147651, | |
| "step": 920 | |
| }, | |
| { | |
| "loss": 0.1504, | |
| "grad_norm": 0.20643459260463715, | |
| "learning_rate": 3.183246069856443e-05, | |
| "entropy": 0.15479738190770148, | |
| "num_tokens": 14573081.0, | |
| "mean_token_accuracy": 0.9577652394771576, | |
| "epoch": 3.120805369127517, | |
| "step": 930 | |
| }, | |
| { | |
| "loss": 0.1616, | |
| "grad_norm": 0.2060076892375946, | |
| "learning_rate": 3.0841150389856125e-05, | |
| "entropy": 0.16488183476030827, | |
| "num_tokens": 14727805.0, | |
| "mean_token_accuracy": 0.9542657196521759, | |
| "epoch": 3.1543624161073827, | |
| "step": 940 | |
| }, | |
| { | |
| "loss": 0.1727, | |
| "grad_norm": 0.25925537943840027, | |
| "learning_rate": 2.9858590281042152e-05, | |
| "entropy": 0.17419612146914004, | |
| "num_tokens": 14883574.0, | |
| "mean_token_accuracy": 0.9522079944610595, | |
| "epoch": 3.1879194630872485, | |
| "step": 950 | |
| }, | |
| { | |
| "loss": 0.1444, | |
| "grad_norm": 0.22550174593925476, | |
| "learning_rate": 2.888522912544202e-05, | |
| "entropy": 0.1470055654644966, | |
| "num_tokens": 15038732.0, | |
| "mean_token_accuracy": 0.9591749131679534, | |
| "epoch": 3.221476510067114, | |
| "step": 960 | |
| }, | |
| { | |
| "loss": 0.1453, | |
| "grad_norm": 0.22862796485424042, | |
| "learning_rate": 2.792151147504366e-05, | |
| "entropy": 0.14832949005067347, | |
| "num_tokens": 15193067.0, | |
| "mean_token_accuracy": 0.9592784523963929, | |
| "epoch": 3.2550335570469797, | |
| "step": 970 | |
| }, | |
| { | |
| "loss": 0.1592, | |
| "grad_norm": 0.21343277394771576, | |
| "learning_rate": 2.6967877477468397e-05, | |
| "entropy": 0.16025195196270942, | |
| "num_tokens": 15348668.0, | |
| "mean_token_accuracy": 0.9546813875436783, | |
| "epoch": 3.2885906040268456, | |
| "step": 980 | |
| }, | |
| { | |
| "loss": 0.1842, | |
| "grad_norm": 0.27152112126350403, | |
| "learning_rate": 2.6024762674947313e-05, | |
| "entropy": 0.18718509674072265, | |
| "num_tokens": 15507728.0, | |
| "mean_token_accuracy": 0.9490089803934098, | |
| "epoch": 3.3221476510067114, | |
| "step": 990 | |
| }, | |
| { | |
| "loss": 0.1613, | |
| "grad_norm": 0.20685099065303802, | |
| "learning_rate": 2.509259780540118e-05, | |
| "entropy": 0.16506212055683137, | |
| "num_tokens": 15663068.0, | |
| "mean_token_accuracy": 0.9536582648754119, | |
| "epoch": 3.3557046979865772, | |
| "step": 1000 | |
| }, | |
| { | |
| "loss": 0.1557, | |
| "grad_norm": 0.24159930646419525, | |
| "learning_rate": 2.4171808605714504e-05, | |
| "entropy": 0.15720976293087005, | |
| "num_tokens": 15820651.0, | |
| "mean_token_accuracy": 0.9559206962585449, | |
| "epoch": 3.389261744966443, | |
| "step": 1010 | |
| }, | |
| { | |
| "loss": 0.1775, | |
| "grad_norm": 0.23764103651046753, | |
| "learning_rate": 2.3262815617293517e-05, | |
| "entropy": 0.1807922873646021, | |
| "num_tokens": 15979358.0, | |
| "mean_token_accuracy": 0.9511760205030442, | |
| "epoch": 3.422818791946309, | |
| "step": 1020 | |
| }, | |
| { | |
| "loss": 0.1713, | |
| "grad_norm": 0.22108638286590576, | |
| "learning_rate": 2.2366033993997344e-05, | |
| "entropy": 0.17095785699784755, | |
| "num_tokens": 16135558.0, | |
| "mean_token_accuracy": 0.95192691385746, | |
| "epoch": 3.4563758389261743, | |
| "step": 1030 | |
| }, | |
| { | |
| "loss": 0.1682, | |
| "grad_norm": 0.24025718867778778, | |
| "learning_rate": 2.1481873312529426e-05, | |
| "entropy": 0.17155840508639814, | |
| "num_tokens": 16289493.0, | |
| "mean_token_accuracy": 0.9537844061851501, | |
| "epoch": 3.48993288590604, | |
| "step": 1040 | |
| }, | |
| { | |
| "loss": 0.1645, | |
| "grad_norm": 0.22691352665424347, | |
| "learning_rate": 2.061073738537635e-05, | |
| "entropy": 0.1673688843846321, | |
| "num_tokens": 16444253.0, | |
| "mean_token_accuracy": 0.9540765851736068, | |
| "epoch": 3.523489932885906, | |
| "step": 1050 | |
| }, | |
| { | |
| "loss": 0.1711, | |
| "grad_norm": 0.260170042514801, | |
| "learning_rate": 1.975302407637929e-05, | |
| "entropy": 0.17190844528377056, | |
| "num_tokens": 16605899.0, | |
| "mean_token_accuracy": 0.9533313482999801, | |
| "epoch": 3.557046979865772, | |
| "step": 1060 | |
| }, | |
| { | |
| "loss": 0.1703, | |
| "grad_norm": 0.22882592678070068, | |
| "learning_rate": 1.8909125119022136e-05, | |
| "entropy": 0.17202796116471292, | |
| "num_tokens": 16762980.0, | |
| "mean_token_accuracy": 0.9525971114635468, | |
| "epoch": 3.5906040268456376, | |
| "step": 1070 | |
| }, | |
| { | |
| "loss": 0.1751, | |
| "grad_norm": 0.2285274714231491, | |
| "learning_rate": 1.807942593751973e-05, | |
| "entropy": 0.17774682715535164, | |
| "num_tokens": 16923345.0, | |
| "mean_token_accuracy": 0.9512902319431304, | |
| "epoch": 3.6241610738255035, | |
| "step": 1080 | |
| }, | |
| { | |
| "loss": 0.1577, | |
| "grad_norm": 0.24551482498645782, | |
| "learning_rate": 1.7264305470787363e-05, | |
| "entropy": 0.1609206810593605, | |
| "num_tokens": 17077331.0, | |
| "mean_token_accuracy": 0.955593678355217, | |
| "epoch": 3.6577181208053693, | |
| "step": 1090 | |
| }, | |
| { | |
| "loss": 0.1558, | |
| "grad_norm": 0.22661259770393372, | |
| "learning_rate": 1.6464135999372537e-05, | |
| "entropy": 0.1576618704944849, | |
| "num_tokens": 17231136.0, | |
| "mean_token_accuracy": 0.9559727132320404, | |
| "epoch": 3.6912751677852347, | |
| "step": 1100 | |
| }, | |
| { | |
| "loss": 0.187, | |
| "grad_norm": 0.23124344646930695, | |
| "learning_rate": 1.567928297542749e-05, | |
| "entropy": 0.18841546326875686, | |
| "num_tokens": 17390871.0, | |
| "mean_token_accuracy": 0.9487693756818771, | |
| "epoch": 3.7248322147651005, | |
| "step": 1110 | |
| }, | |
| { | |
| "loss": 0.1368, | |
| "grad_norm": 0.2734222710132599, | |
| "learning_rate": 1.4910104855800427e-05, | |
| "entropy": 0.14009847678244114, | |
| "num_tokens": 17544815.0, | |
| "mean_token_accuracy": 0.9597841829061509, | |
| "epoch": 3.7583892617449663, | |
| "step": 1120 | |
| }, | |
| { | |
| "loss": 0.1557, | |
| "grad_norm": 0.25066235661506653, | |
| "learning_rate": 1.4156952938321798e-05, | |
| "entropy": 0.1570753049105406, | |
| "num_tokens": 17701337.0, | |
| "mean_token_accuracy": 0.9568644523620605, | |
| "epoch": 3.791946308724832, | |
| "step": 1130 | |
| }, | |
| { | |
| "loss": 0.1582, | |
| "grad_norm": 0.24784211814403534, | |
| "learning_rate": 1.3420171201359933e-05, | |
| "entropy": 0.15978324972093105, | |
| "num_tokens": 17855471.0, | |
| "mean_token_accuracy": 0.9565310895442962, | |
| "epoch": 3.825503355704698, | |
| "step": 1140 | |
| }, | |
| { | |
| "loss": 0.1583, | |
| "grad_norm": 0.22498124837875366, | |
| "learning_rate": 1.2700096146719931e-05, | |
| "entropy": 0.16122703738510608, | |
| "num_tokens": 18017269.0, | |
| "mean_token_accuracy": 0.955768808722496, | |
| "epoch": 3.859060402684564, | |
| "step": 1150 | |
| }, | |
| { | |
| "loss": 0.1729, | |
| "grad_norm": 0.22251562774181366, | |
| "learning_rate": 1.1997056645956967e-05, | |
| "entropy": 0.17401745431125165, | |
| "num_tokens": 18176540.0, | |
| "mean_token_accuracy": 0.9513998061418534, | |
| "epoch": 3.8926174496644297, | |
| "step": 1160 | |
| }, | |
| { | |
| "loss": 0.1429, | |
| "grad_norm": 0.2645820677280426, | |
| "learning_rate": 1.1311373790174657e-05, | |
| "entropy": 0.14712858721613883, | |
| "num_tokens": 18331179.0, | |
| "mean_token_accuracy": 0.9599408686161042, | |
| "epoch": 3.926174496644295, | |
| "step": 1170 | |
| }, | |
| { | |
| "loss": 0.1778, | |
| "grad_norm": 0.22513671219348907, | |
| "learning_rate": 1.0643360743376829e-05, | |
| "entropy": 0.178417731449008, | |
| "num_tokens": 18491744.0, | |
| "mean_token_accuracy": 0.9512795448303223, | |
| "epoch": 3.959731543624161, | |
| "step": 1180 | |
| }, | |
| { | |
| "loss": 0.1751, | |
| "grad_norm": 0.23683220148086548, | |
| "learning_rate": 9.993322599439692e-06, | |
| "entropy": 0.1774978566914797, | |
| "num_tokens": 18647922.0, | |
| "mean_token_accuracy": 0.9514397591352463, | |
| "epoch": 3.9932885906040267, | |
| "step": 1190 | |
| }, | |
| { | |
| "loss": 0.1343, | |
| "grad_norm": 0.23946121335029602, | |
| "learning_rate": 9.36155624276987e-06, | |
| "entropy": 0.13949108868837357, | |
| "num_tokens": 18799741.0, | |
| "mean_token_accuracy": 0.961316853761673, | |
| "epoch": 4.026845637583893, | |
| "step": 1200 | |
| }, | |
| { | |
| "loss": 0.1448, | |
| "grad_norm": 0.26739007234573364, | |
| "learning_rate": 8.748350212711853e-06, | |
| "entropy": 0.14928585700690747, | |
| "num_tokens": 18954239.0, | |
| "mean_token_accuracy": 0.9586472451686859, | |
| "epoch": 4.060402684563758, | |
| "step": 1210 | |
| }, | |
| { | |
| "loss": 0.1589, | |
| "grad_norm": 0.2560782730579376, | |
| "learning_rate": 8.153984571766699e-06, | |
| "entropy": 0.1610208224505186, | |
| "num_tokens": 19113299.0, | |
| "mean_token_accuracy": 0.955417987704277, | |
| "epoch": 4.093959731543624, | |
| "step": 1220 | |
| }, | |
| { | |
| "loss": 0.1566, | |
| "grad_norm": 0.2503839135169983, | |
| "learning_rate": 7.578730777682386e-06, | |
| "entropy": 0.15893662311136722, | |
| "num_tokens": 19269704.0, | |
| "mean_token_accuracy": 0.9564489126205444, | |
| "epoch": 4.12751677852349, | |
| "step": 1230 | |
| }, | |
| { | |
| "loss": 0.1729, | |
| "grad_norm": 0.2269158661365509, | |
| "learning_rate": 7.022851559473964e-06, | |
| "entropy": 0.17561149559915065, | |
| "num_tokens": 19430349.0, | |
| "mean_token_accuracy": 0.95240318775177, | |
| "epoch": 4.1610738255033555, | |
| "step": 1240 | |
| }, | |
| { | |
| "loss": 0.168, | |
| "grad_norm": 0.2869897186756134, | |
| "learning_rate": 6.486600797430492e-06, | |
| "entropy": 0.1709251943975687, | |
| "num_tokens": 19585660.0, | |
| "mean_token_accuracy": 0.9534720987081527, | |
| "epoch": 4.194630872483222, | |
| "step": 1250 | |
| }, | |
| { | |
| "loss": 0.1457, | |
| "grad_norm": 0.25000014901161194, | |
| "learning_rate": 5.9702234071631e-06, | |
| "entropy": 0.14829469434916973, | |
| "num_tokens": 19744182.0, | |
| "mean_token_accuracy": 0.9593781590461731, | |
| "epoch": 4.228187919463087, | |
| "step": 1260 | |
| }, | |
| { | |
| "loss": 0.16, | |
| "grad_norm": 0.24967129528522491, | |
| "learning_rate": 5.473955227747623e-06, | |
| "entropy": 0.16518744193017482, | |
| "num_tokens": 19902824.0, | |
| "mean_token_accuracy": 0.9548085004091262, | |
| "epoch": 4.261744966442953, | |
| "step": 1270 | |
| }, | |
| { | |
| "loss": 0.1508, | |
| "grad_norm": 0.27587053179740906, | |
| "learning_rate": 4.9980229140124905e-06, | |
| "entropy": 0.15380522198975086, | |
| "num_tokens": 20057944.0, | |
| "mean_token_accuracy": 0.9581666350364685, | |
| "epoch": 4.295302013422819, | |
| "step": 1280 | |
| }, | |
| { | |
| "loss": 0.1497, | |
| "grad_norm": 0.2772783935070038, | |
| "learning_rate": 4.542643833021254e-06, | |
| "entropy": 0.15263673029839991, | |
| "num_tokens": 20215072.0, | |
| "mean_token_accuracy": 0.9581062227487565, | |
| "epoch": 4.328859060402684, | |
| "step": 1290 | |
| }, | |
| { | |
| "loss": 0.1313, | |
| "grad_norm": 0.25381705164909363, | |
| "learning_rate": 4.108025964797135e-06, | |
| "entropy": 0.13392407521605493, | |
| "num_tokens": 20371415.0, | |
| "mean_token_accuracy": 0.9618801653385163, | |
| "epoch": 4.3624161073825505, | |
| "step": 1300 | |
| }, | |
| { | |
| "loss": 0.1442, | |
| "grad_norm": 0.318499892950058, | |
| "learning_rate": 3.69436780733462e-06, | |
| "entropy": 0.1467492014169693, | |
| "num_tokens": 20525796.0, | |
| "mean_token_accuracy": 0.9589717149734497, | |
| "epoch": 4.395973154362416, | |
| "step": 1310 | |
| }, | |
| { | |
| "loss": 0.164, | |
| "grad_norm": 0.29794761538505554, | |
| "learning_rate": 3.3018582859418446e-06, | |
| "entropy": 0.16693559624254703, | |
| "num_tokens": 20683012.0, | |
| "mean_token_accuracy": 0.9544567495584488, | |
| "epoch": 4.429530201342282, | |
| "step": 1320 | |
| }, | |
| { | |
| "loss": 0.1585, | |
| "grad_norm": 0.26992982625961304, | |
| "learning_rate": 2.930676666954846e-06, | |
| "entropy": 0.16096150763332845, | |
| "num_tokens": 20836880.0, | |
| "mean_token_accuracy": 0.955687940120697, | |
| "epoch": 4.4630872483221475, | |
| "step": 1330 | |
| }, | |
| { | |
| "loss": 0.1545, | |
| "grad_norm": 0.2633298635482788, | |
| "learning_rate": 2.580992475863381e-06, | |
| "entropy": 0.15808030292391778, | |
| "num_tokens": 20991520.0, | |
| "mean_token_accuracy": 0.9564832031726838, | |
| "epoch": 4.496644295302014, | |
| "step": 1340 | |
| }, | |
| { | |
| "loss": 0.1574, | |
| "grad_norm": 0.2697800099849701, | |
| "learning_rate": 2.2529654198854835e-06, | |
| "entropy": 0.16020409800112248, | |
| "num_tokens": 21149049.0, | |
| "mean_token_accuracy": 0.9553818762302398, | |
| "epoch": 4.530201342281879, | |
| "step": 1350 | |
| }, | |
| { | |
| "loss": 0.1434, | |
| "grad_norm": 0.26713675260543823, | |
| "learning_rate": 1.9467453150262327e-06, | |
| "entropy": 0.1468834660947323, | |
| "num_tokens": 21303482.0, | |
| "mean_token_accuracy": 0.9584787577390671, | |
| "epoch": 4.563758389261745, | |
| "step": 1360 | |
| }, | |
| { | |
| "loss": 0.1676, | |
| "grad_norm": 0.25141027569770813, | |
| "learning_rate": 1.6624720176540265e-06, | |
| "entropy": 0.17059922069311143, | |
| "num_tokens": 21462214.0, | |
| "mean_token_accuracy": 0.9529773116111755, | |
| "epoch": 4.597315436241611, | |
| "step": 1370 | |
| }, | |
| { | |
| "loss": 0.1631, | |
| "grad_norm": 0.22781124711036682, | |
| "learning_rate": 1.400275360625608e-06, | |
| "entropy": 0.16469677537679672, | |
| "num_tokens": 21617768.0, | |
| "mean_token_accuracy": 0.9540845274925231, | |
| "epoch": 4.630872483221476, | |
| "step": 1380 | |
| }, | |
| { | |
| "loss": 0.1405, | |
| "grad_norm": 0.26300156116485596, | |
| "learning_rate": 1.1602750939889774e-06, | |
| "entropy": 0.14349478296935558, | |
| "num_tokens": 21775827.0, | |
| "mean_token_accuracy": 0.9608874082565307, | |
| "epoch": 4.6644295302013425, | |
| "step": 1390 | |
| }, | |
| { | |
| "loss": 0.1648, | |
| "grad_norm": 0.28018462657928467, | |
| "learning_rate": 9.42580830291373e-07, | |
| "entropy": 0.168378734216094, | |
| "num_tokens": 21933598.0, | |
| "mean_token_accuracy": 0.9548741400241851, | |
| "epoch": 4.697986577181208, | |
| "step": 1400 | |
| }, | |
| { | |
| "loss": 0.1579, | |
| "grad_norm": 0.23573631048202515, | |
| "learning_rate": 7.472919945171631e-07, | |
| "entropy": 0.16024879328906536, | |
| "num_tokens": 22092356.0, | |
| "mean_token_accuracy": 0.9563170611858368, | |
| "epoch": 4.731543624161074, | |
| "step": 1410 | |
| }, | |
| { | |
| "loss": 0.1486, | |
| "grad_norm": 0.26423606276512146, | |
| "learning_rate": 5.74497778678662e-07, | |
| "entropy": 0.153883695602417, | |
| "num_tokens": 22245245.0, | |
| "mean_token_accuracy": 0.9581821829080581, | |
| "epoch": 4.76510067114094, | |
| "step": 1420 | |
| }, | |
| { | |
| "loss": 0.1668, | |
| "grad_norm": 0.24437865614891052, | |
| "learning_rate": 4.242771010804558e-07, | |
| "entropy": 0.16868923045694828, | |
| "num_tokens": 22403545.0, | |
| "mean_token_accuracy": 0.9531532049179077, | |
| "epoch": 4.798657718120805, | |
| "step": 1430 | |
| }, | |
| { | |
| "loss": 0.1497, | |
| "grad_norm": 0.2679818868637085, | |
| "learning_rate": 2.966985702759828e-07, | |
| "entropy": 0.15332721956074238, | |
| "num_tokens": 22559658.0, | |
| "mean_token_accuracy": 0.9572977095842361, | |
| "epoch": 4.832214765100671, | |
| "step": 1440 | |
| }, | |
| { | |
| "loss": 0.1487, | |
| "grad_norm": 0.25875014066696167, | |
| "learning_rate": 1.9182045373273838e-07, | |
| "entropy": 0.15049569718539715, | |
| "num_tokens": 22718226.0, | |
| "mean_token_accuracy": 0.9580355733633041, | |
| "epoch": 4.865771812080537, | |
| "step": 1450 | |
| }, | |
| { | |
| "loss": 0.1444, | |
| "grad_norm": 0.25537535548210144, | |
| "learning_rate": 1.0969065122041766e-07, | |
| "entropy": 0.14743491858243943, | |
| "num_tokens": 22873841.0, | |
| "mean_token_accuracy": 0.9597749501466751, | |
| "epoch": 4.899328859060403, | |
| "step": 1460 | |
| }, | |
| { | |
| "loss": 0.1584, | |
| "grad_norm": 0.28133252263069153, | |
| "learning_rate": 5.0346672934270534e-08, | |
| "entropy": 0.16207893192768097, | |
| "num_tokens": 23032944.0, | |
| "mean_token_accuracy": 0.9557940989732743, | |
| "epoch": 4.932885906040268, | |
| "step": 1470 | |
| }, | |
| { | |
| "loss": 0.1577, | |
| "grad_norm": 0.25087353587150574, | |
| "learning_rate": 1.3815622363427815e-08, | |
| "entropy": 0.16037070676684378, | |
| "num_tokens": 23191641.0, | |
| "mean_token_accuracy": 0.9560932219028473, | |
| "epoch": 4.966442953020135, | |
| "step": 1480 | |
| }, | |
| { | |
| "loss": 0.1779, | |
| "grad_norm": 0.307130366563797, | |
| "learning_rate": 1.141839123142141e-10, | |
| "entropy": 0.18243111334741116, | |
| "num_tokens": 23348865.0, | |
| "mean_token_accuracy": 0.9507445961236953, | |
| "epoch": 5.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "train_runtime": 10055.6246, | |
| "train_samples_per_second": 4.74, | |
| "train_steps_per_second": 0.148, | |
| "total_flos": 1.0796462610038784e+19, | |
| "train_loss": 0.2056295836531876, | |
| "epoch": 5.0, | |
| "step": 1490 | |
| } | |
| ] |