Tags: PEFT · Safetensors · English · cybersecurity · malware-analysis · att&ck · threat-intelligence · mixtral · lora · expert-adapters · cape-sandbox · digital-forensics
Instructions to use umer07/fathom-mixtral with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use umer07/fathom-mixtral with PEFT (a fuller inference sketch follows the links below):

```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Load the Mixtral-8x7B base model, then attach the fathom-mixtral LoRA adapter.
base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
model = PeftModel.from_pretrained(base_model, "umer07/fathom-mixtral")
```

- Notebooks
- Google Colab
- Kaggle
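
Building on the loading snippet above, here is a minimal end-to-end inference sketch. It is illustrative, not taken from the model card: the dtype, device placement, generation settings, and the example prompt are all assumptions.

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(base_id)

# bfloat16 + device_map="auto" (requires accelerate) are assumptions made to fit
# the 8x7B MoE; quantized loading (e.g. 4-bit via bitsandbytes) is another option.
base_model = AutoModelForCausalLM.from_pretrained(
    base_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, "umer07/fathom-mixtral")
model.eval()

# Mixtral-Instruct uses the [INST] chat format; apply_chat_template builds it.
# The prompt is a hypothetical example in the model's tagged domain.
messages = [{
    "role": "user",
    "content": "A CAPE sandbox report shows process hollowing. "
               "Which ATT&CK techniques does that suggest?",
}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

with torch.no_grad():
    output = model.generate(input_ids, max_new_tokens=256, do_sample=False)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```

For deployment, `model.merge_and_unload()` folds the LoRA weights into the base model so it can be served without the PEFT wrapper.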
Training metrics (trainer state log; metrics were recorded every 10 steps over one epoch of 1,130 steps — selected steps shown):

| Step | Epoch | Loss | Grad norm | Learning rate | Entropy | Mean token accuracy | Tokens seen |
| ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
| 10 | 0.009 | 1.2771 | 0.4818 | 9.0e-06 | 1.0744 | 0.7265 | 384,772 |
| 100 | 0.088 | 0.6410 | 0.3473 | 9.9e-05 | 0.6442 | 0.8493 | 4,109,097 |
| 200 | 0.177 | 0.4121 | 0.4900 | 9.77e-05 | 0.4183 | 0.8982 | 8,275,452 |
| 300 | 0.265 | 0.3110 | 0.4894 | 9.11e-05 | 0.3162 | 0.9236 | 12,415,929 |
| 400 | 0.354 | 0.2634 | 0.3584 | 8.06e-05 | 0.2718 | 0.9348 | 16,558,293 |
| 500 | 0.442 | 0.2528 | 0.3543 | 6.73e-05 | 0.2598 | 0.9384 | 20,792,226 |
| 600 | 0.531 | 0.2479 | 0.2826 | 5.24e-05 | 0.2580 | 0.9391 | 24,971,253 |
| 700 | 0.619 | 0.2304 | 0.1965 | 3.73e-05 | 0.2337 | 0.9434 | 29,090,626 |
| 800 | 0.708 | 0.2100 | 0.1694 | 2.34e-05 | 0.2143 | 0.9489 | 33,241,388 |
| 900 | 0.796 | 0.1997 | 0.2387 | 1.19e-05 | 0.2073 | 0.9499 | 37,403,547 |
| 1000 | 0.885 | 0.2071 | 0.1777 | 3.94e-06 | 0.2126 | 0.9485 | 41,549,796 |
| 1100 | 0.973 | 0.2361 | 0.1976 | 2.23e-07 | 0.2397 | 0.9428 | 45,841,913 |
| 1130 | 1.000 | 0.2372 | 0.1785 | 2.33e-10 | 0.2382 | 0.9436 | 47,110,855 |

The learning rate warms up to roughly 1e-4 over the first ~110 steps, then decays on a cosine-like schedule to effectively zero by step 1130; over the run, loss falls from 1.28 to about 0.24 and mean token accuracy rises from 0.73 to about 0.94.

Final run summary:

| Metric | Value |
| --- | ---: |
| Train loss (mean) | 0.3341 |
| Train runtime | 18,643.4 s (~5.2 h) |
| Samples / second | 1.94 |
| Steps / second | 0.061 |
| Total FLOPs | 2.07e+19 |
| Epochs | 1.0 |
| Total steps | 1,130 |
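
The full log follows the `log_history` schema of a Hugging Face `trainer_state.json`. A minimal sketch for pulling the curves back out of such a file — the checkpoint path is hypothetical:

```python
import json

# Hypothetical path: point this at the checkpoint's trainer_state.json.
with open("checkpoint-1130/trainer_state.json") as f:
    state = json.load(f)

# Per-step entries carry "loss"; the final entry holds run-level totals instead.
curve = [(e["step"], e["loss"], e["mean_token_accuracy"])
         for e in state["log_history"] if "loss" in e]
for step, loss, acc in curve:
    print(f"step {step:>4}: loss={loss:.4f} acc={acc:.4f}")

final = state["log_history"][-1]
print(f"train_loss={final['train_loss']:.4f}, runtime={final['train_runtime']:.0f}s")
```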