Instructions to use elozeiri/Jais2-random with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use elozeiri/Jais2-random with PEFT:
from peft import PeftModel from transformers import AutoModelForCausalLM base_model = AutoModelForCausalLM.from_pretrained("/lustre/scratch/users/mohamed.anwar/ONEDRIVE/Projects/jais_plus/checkpoints/20251130_8B_DPO") model = PeftModel.from_pretrained(base_model, "elozeiri/Jais2-random") - Transformers
How to use elozeiri/Jais2-random with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("text-generation", model="elozeiri/Jais2-random") messages = [ {"role": "user", "content": "Who are you?"}, ] pipe(messages)# Load model directly from transformers import AutoModel model = AutoModel.from_pretrained("elozeiri/Jais2-random", dtype="auto") - Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use elozeiri/Jais2-random with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "elozeiri/Jais2-random" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "elozeiri/Jais2-random", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }'Use Docker
docker model run hf.co/elozeiri/Jais2-random
- SGLang
How to use elozeiri/Jais2-random with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "elozeiri/Jais2-random" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "elozeiri/Jais2-random", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "elozeiri/Jais2-random" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "elozeiri/Jais2-random", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }' - Docker Model Runner
How to use elozeiri/Jais2-random with Docker Model Runner:
docker model run hf.co/elozeiri/Jais2-random
Invalid JSON:Unexpected token 'N', ..."al_loss": NaN,
"... is not valid JSON
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9660792116732992, | |
| "eval_steps": 1000, | |
| "global_step": 41500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.2857219433784486, | |
| "epoch": 0.002368770134546144, | |
| "grad_norm": 9.544422149658203, | |
| "learning_rate": 1.1605873993368073e-06, | |
| "loss": 2.8961, | |
| "mean_token_accuracy": 0.6633071088790894, | |
| "num_tokens": 1213722.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.5439852488040924, | |
| "epoch": 0.004737540269092288, | |
| "grad_norm": 1.5218281745910645, | |
| "learning_rate": 2.3448602558029374e-06, | |
| "loss": 2.0745, | |
| "mean_token_accuracy": 0.6788832449913025, | |
| "num_tokens": 2440523.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.9061457157135009, | |
| "epoch": 0.007106310403638431, | |
| "grad_norm": 1.1667029857635498, | |
| "learning_rate": 3.529133112269067e-06, | |
| "loss": 1.5243, | |
| "mean_token_accuracy": 0.6922976732254028, | |
| "num_tokens": 3673044.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.8150238823890685, | |
| "epoch": 0.009475080538184575, | |
| "grad_norm": 1.093643307685852, | |
| "learning_rate": 4.713405968735197e-06, | |
| "loss": 1.4194, | |
| "mean_token_accuracy": 0.7095924293994904, | |
| "num_tokens": 4883924.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.7932313251495362, | |
| "epoch": 0.011843850672730718, | |
| "grad_norm": 0.9533292055130005, | |
| "learning_rate": 5.897678825201327e-06, | |
| "loss": 1.4224, | |
| "mean_token_accuracy": 0.7063059556484222, | |
| "num_tokens": 6115751.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.7181093657016755, | |
| "epoch": 0.014212620807276862, | |
| "grad_norm": 1.2956347465515137, | |
| "learning_rate": 7.0819516816674565e-06, | |
| "loss": 1.3637, | |
| "mean_token_accuracy": 0.7182639849185943, | |
| "num_tokens": 7344600.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.7070274019241334, | |
| "epoch": 0.016581390941823006, | |
| "grad_norm": 1.0166376829147339, | |
| "learning_rate": 8.266224538133587e-06, | |
| "loss": 1.3641, | |
| "mean_token_accuracy": 0.7153780800104141, | |
| "num_tokens": 8564072.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.7232308828830718, | |
| "epoch": 0.01895016107636915, | |
| "grad_norm": 0.9041787981987, | |
| "learning_rate": 9.450497394599716e-06, | |
| "loss": 1.4065, | |
| "mean_token_accuracy": 0.7088368773460388, | |
| "num_tokens": 9774641.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.691476699113846, | |
| "epoch": 0.02131893121091529, | |
| "grad_norm": 1.104785680770874, | |
| "learning_rate": 1.0634770251065847e-05, | |
| "loss": 1.3464, | |
| "mean_token_accuracy": 0.7208190321922302, | |
| "num_tokens": 11023876.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.7134545636177063, | |
| "epoch": 0.023687701345461436, | |
| "grad_norm": 1.029800295829773, | |
| "learning_rate": 1.1819043107531975e-05, | |
| "loss": 1.3758, | |
| "mean_token_accuracy": 0.7151617485284806, | |
| "num_tokens": 12267386.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 1.6694060420989991, | |
| "epoch": 0.02605647148000758, | |
| "grad_norm": 0.9847853183746338, | |
| "learning_rate": 1.3003315963998106e-05, | |
| "loss": 1.3374, | |
| "mean_token_accuracy": 0.7215989363193512, | |
| "num_tokens": 13530115.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 1.6969332695007324, | |
| "epoch": 0.028425241614553724, | |
| "grad_norm": 0.9018113017082214, | |
| "learning_rate": 1.4187588820464234e-05, | |
| "loss": 1.3446, | |
| "mean_token_accuracy": 0.720185512304306, | |
| "num_tokens": 14757305.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 1.7514292740821837, | |
| "epoch": 0.03079401174909987, | |
| "grad_norm": 0.803989827632904, | |
| "learning_rate": 1.5371861676930365e-05, | |
| "loss": 1.4127, | |
| "mean_token_accuracy": 0.7072590082883835, | |
| "num_tokens": 15958099.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 1.7325732719898224, | |
| "epoch": 0.03316278188364601, | |
| "grad_norm": 0.865963876247406, | |
| "learning_rate": 1.6556134533396493e-05, | |
| "loss": 1.3412, | |
| "mean_token_accuracy": 0.71946579515934, | |
| "num_tokens": 17188325.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 1.707287894487381, | |
| "epoch": 0.03553155201819216, | |
| "grad_norm": 0.8039044141769409, | |
| "learning_rate": 1.7740407389862628e-05, | |
| "loss": 1.3502, | |
| "mean_token_accuracy": 0.7187299233675003, | |
| "num_tokens": 18423224.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 1.708844404220581, | |
| "epoch": 0.0379003221527383, | |
| "grad_norm": 0.8659459948539734, | |
| "learning_rate": 1.8924680246328755e-05, | |
| "loss": 1.3596, | |
| "mean_token_accuracy": 0.7151121199131012, | |
| "num_tokens": 19656464.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 1.7051723492145539, | |
| "epoch": 0.04026909228728444, | |
| "grad_norm": 0.8470927476882935, | |
| "learning_rate": 2.0108953102794883e-05, | |
| "loss": 1.309, | |
| "mean_token_accuracy": 0.723027645945549, | |
| "num_tokens": 20908664.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 1.6959775292873382, | |
| "epoch": 0.04263786242183058, | |
| "grad_norm": 1.0128496885299683, | |
| "learning_rate": 2.1293225959261014e-05, | |
| "loss": 1.3337, | |
| "mean_token_accuracy": 0.720455265045166, | |
| "num_tokens": 22165141.0, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 1.7231877827644348, | |
| "epoch": 0.04500663255637673, | |
| "grad_norm": 0.9077188968658447, | |
| "learning_rate": 2.2477498815727142e-05, | |
| "loss": 1.3309, | |
| "mean_token_accuracy": 0.7221323251724243, | |
| "num_tokens": 23408996.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 1.7131755888462066, | |
| "epoch": 0.04737540269092287, | |
| "grad_norm": 0.9538567066192627, | |
| "learning_rate": 2.3661771672193277e-05, | |
| "loss": 1.3434, | |
| "mean_token_accuracy": 0.718843805193901, | |
| "num_tokens": 24626341.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.04737540269092287, | |
| "eval_entropy": 1.1688038776145298, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7489374687524091, | |
| "eval_num_tokens": 24626341.0, | |
| "eval_runtime": 739.6274, | |
| "eval_samples_per_second": 33.551, | |
| "eval_steps_per_second": 4.194, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 1.7510324358940124, | |
| "epoch": 0.049744172825469016, | |
| "grad_norm": 0.8725846409797668, | |
| "learning_rate": 2.4846044528659405e-05, | |
| "loss": 1.3756, | |
| "mean_token_accuracy": 0.714400834441185, | |
| "num_tokens": 25873918.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 1.7741026413440704, | |
| "epoch": 0.05211294296001516, | |
| "grad_norm": 0.9376819729804993, | |
| "learning_rate": 2.6030317385125536e-05, | |
| "loss": 1.3554, | |
| "mean_token_accuracy": 0.7169699442386627, | |
| "num_tokens": 27067052.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 1.7553015303611756, | |
| "epoch": 0.054481713094561304, | |
| "grad_norm": 0.7493100166320801, | |
| "learning_rate": 2.7214590241591663e-05, | |
| "loss": 1.3385, | |
| "mean_token_accuracy": 0.721844300031662, | |
| "num_tokens": 28308409.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 1.7428362345695496, | |
| "epoch": 0.05685048322910745, | |
| "grad_norm": 0.7889260053634644, | |
| "learning_rate": 2.8398863098057795e-05, | |
| "loss": 1.3293, | |
| "mean_token_accuracy": 0.7205719447135925, | |
| "num_tokens": 29542172.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 1.727198257446289, | |
| "epoch": 0.05921925336365359, | |
| "grad_norm": 0.8638942837715149, | |
| "learning_rate": 2.9583135954523922e-05, | |
| "loss": 1.3176, | |
| "mean_token_accuracy": 0.7266954278945923, | |
| "num_tokens": 30777592.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 1.7321817111968993, | |
| "epoch": 0.06158802349819974, | |
| "grad_norm": 0.8561661839485168, | |
| "learning_rate": 3.076740881099006e-05, | |
| "loss": 1.3116, | |
| "mean_token_accuracy": 0.7248632162809372, | |
| "num_tokens": 32008108.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 1.7352443253993988, | |
| "epoch": 0.06395679363274588, | |
| "grad_norm": 0.8668932914733887, | |
| "learning_rate": 3.1951681667456185e-05, | |
| "loss": 1.3422, | |
| "mean_token_accuracy": 0.7209743493795395, | |
| "num_tokens": 33257843.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 1.7172068011760713, | |
| "epoch": 0.06632556376729203, | |
| "grad_norm": 1.030638337135315, | |
| "learning_rate": 3.313595452392231e-05, | |
| "loss": 1.3161, | |
| "mean_token_accuracy": 0.7226567584276199, | |
| "num_tokens": 34486893.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 1.7621229577064514, | |
| "epoch": 0.06869433390183817, | |
| "grad_norm": 1.0166395902633667, | |
| "learning_rate": 3.432022738038844e-05, | |
| "loss": 1.3644, | |
| "mean_token_accuracy": 0.7157200646400451, | |
| "num_tokens": 35701555.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 1.739331885576248, | |
| "epoch": 0.07106310403638431, | |
| "grad_norm": 0.6904604434967041, | |
| "learning_rate": 3.550450023685457e-05, | |
| "loss": 1.3599, | |
| "mean_token_accuracy": 0.716941955089569, | |
| "num_tokens": 36936472.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 1.7242075634002685, | |
| "epoch": 0.07343187417093046, | |
| "grad_norm": 0.8110722303390503, | |
| "learning_rate": 3.66887730933207e-05, | |
| "loss": 1.3374, | |
| "mean_token_accuracy": 0.7217743951082229, | |
| "num_tokens": 38178198.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 1.7162011814117433, | |
| "epoch": 0.0758006443054766, | |
| "grad_norm": 0.8524773716926575, | |
| "learning_rate": 3.787304594978684e-05, | |
| "loss": 1.3228, | |
| "mean_token_accuracy": 0.7227801591157913, | |
| "num_tokens": 39412346.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 1.7332351410388946, | |
| "epoch": 0.07816941444002275, | |
| "grad_norm": 0.7344287037849426, | |
| "learning_rate": 3.9057318806252965e-05, | |
| "loss": 1.3343, | |
| "mean_token_accuracy": 0.721068668961525, | |
| "num_tokens": 40640112.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 1.7606168591976166, | |
| "epoch": 0.08053818457456888, | |
| "grad_norm": 0.8270729184150696, | |
| "learning_rate": 4.024159166271909e-05, | |
| "loss": 1.3587, | |
| "mean_token_accuracy": 0.7182448714971542, | |
| "num_tokens": 41866196.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 1.7047703182697296, | |
| "epoch": 0.08290695470911502, | |
| "grad_norm": 0.8977941870689392, | |
| "learning_rate": 4.142586451918522e-05, | |
| "loss": 1.3278, | |
| "mean_token_accuracy": 0.7222372907400131, | |
| "num_tokens": 43124550.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 1.7705276823043823, | |
| "epoch": 0.08527572484366117, | |
| "grad_norm": 0.8741844892501831, | |
| "learning_rate": 4.2610137375651355e-05, | |
| "loss": 1.3573, | |
| "mean_token_accuracy": 0.7164474505186081, | |
| "num_tokens": 44315237.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 1.7388102066516877, | |
| "epoch": 0.08764449497820731, | |
| "grad_norm": 0.8004917502403259, | |
| "learning_rate": 4.379441023211748e-05, | |
| "loss": 1.3342, | |
| "mean_token_accuracy": 0.7217385923862457, | |
| "num_tokens": 45536080.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 1.7502016520500183, | |
| "epoch": 0.09001326511275345, | |
| "grad_norm": 0.8822757005691528, | |
| "learning_rate": 4.497868308858361e-05, | |
| "loss": 1.336, | |
| "mean_token_accuracy": 0.7232905811071396, | |
| "num_tokens": 46770718.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 1.767259726524353, | |
| "epoch": 0.0923820352472996, | |
| "grad_norm": 0.7748751640319824, | |
| "learning_rate": 4.616295594504974e-05, | |
| "loss": 1.3553, | |
| "mean_token_accuracy": 0.7182161051034928, | |
| "num_tokens": 47996075.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 1.6973827588558197, | |
| "epoch": 0.09475080538184574, | |
| "grad_norm": 0.8286657333374023, | |
| "learning_rate": 4.7347228801515866e-05, | |
| "loss": 1.3257, | |
| "mean_token_accuracy": 0.7228487819433213, | |
| "num_tokens": 49257764.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.09475080538184574, | |
| "eval_entropy": 1.204862184174056, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7511153636268459, | |
| "eval_num_tokens": 49257764.0, | |
| "eval_runtime": 739.5936, | |
| "eval_samples_per_second": 33.552, | |
| "eval_steps_per_second": 4.194, | |
| "step": 2000 | |
| }, | |
| { | |
| "entropy": 1.7507979416847228, | |
| "epoch": 0.09711957551639189, | |
| "grad_norm": 0.8749801516532898, | |
| "learning_rate": 4.8531501657982e-05, | |
| "loss": 1.3637, | |
| "mean_token_accuracy": 0.7162546402215958, | |
| "num_tokens": 50488090.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "entropy": 1.7249326765537263, | |
| "epoch": 0.09948834565093803, | |
| "grad_norm": 0.9060792922973633, | |
| "learning_rate": 4.9715774514448135e-05, | |
| "loss": 1.325, | |
| "mean_token_accuracy": 0.7223574507236481, | |
| "num_tokens": 51760869.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "entropy": 1.7315819489955901, | |
| "epoch": 0.10185711578548418, | |
| "grad_norm": 0.7879400253295898, | |
| "learning_rate": 5.090004737091426e-05, | |
| "loss": 1.3279, | |
| "mean_token_accuracy": 0.7231736582517624, | |
| "num_tokens": 52971004.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "entropy": 1.721841138601303, | |
| "epoch": 0.10422588592003032, | |
| "grad_norm": 0.6798914074897766, | |
| "learning_rate": 5.208432022738039e-05, | |
| "loss": 1.3486, | |
| "mean_token_accuracy": 0.7193009465932846, | |
| "num_tokens": 54200559.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "entropy": 1.732561513185501, | |
| "epoch": 0.10659465605457646, | |
| "grad_norm": 0.9104458689689636, | |
| "learning_rate": 5.326859308384652e-05, | |
| "loss": 1.313, | |
| "mean_token_accuracy": 0.7249197036027908, | |
| "num_tokens": 55431492.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "entropy": 1.745149908065796, | |
| "epoch": 0.10896342618912261, | |
| "grad_norm": 1.0170321464538574, | |
| "learning_rate": 5.4452865940312646e-05, | |
| "loss": 1.3327, | |
| "mean_token_accuracy": 0.7201522195339203, | |
| "num_tokens": 56685982.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "entropy": 1.7817789494991303, | |
| "epoch": 0.11133219632366875, | |
| "grad_norm": 0.8275519013404846, | |
| "learning_rate": 5.5637138796778774e-05, | |
| "loss": 1.3902, | |
| "mean_token_accuracy": 0.7141750353574753, | |
| "num_tokens": 57930317.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "entropy": 1.7808838784694672, | |
| "epoch": 0.1137009664582149, | |
| "grad_norm": 0.8482922315597534, | |
| "learning_rate": 5.6821411653244915e-05, | |
| "loss": 1.3555, | |
| "mean_token_accuracy": 0.7165615385770798, | |
| "num_tokens": 59178908.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "entropy": 1.7483756732940674, | |
| "epoch": 0.11606973659276104, | |
| "grad_norm": 1.1124041080474854, | |
| "learning_rate": 5.800568450971104e-05, | |
| "loss": 1.3078, | |
| "mean_token_accuracy": 0.7251748180389405, | |
| "num_tokens": 60407544.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "entropy": 1.7590064382553101, | |
| "epoch": 0.11843850672730719, | |
| "grad_norm": 0.8734819889068604, | |
| "learning_rate": 5.918995736617717e-05, | |
| "loss": 1.3515, | |
| "mean_token_accuracy": 0.7170740348100663, | |
| "num_tokens": 61646580.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "entropy": 1.7470902466773988, | |
| "epoch": 0.12080727686185333, | |
| "grad_norm": 0.9874738454818726, | |
| "learning_rate": 6.03742302226433e-05, | |
| "loss": 1.3449, | |
| "mean_token_accuracy": 0.7202254205942153, | |
| "num_tokens": 62847914.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "entropy": 1.7077771651744842, | |
| "epoch": 0.12317604699639947, | |
| "grad_norm": 0.7741467952728271, | |
| "learning_rate": 6.155850307910943e-05, | |
| "loss": 1.3253, | |
| "mean_token_accuracy": 0.721799430847168, | |
| "num_tokens": 64110298.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "entropy": 1.7353104615211488, | |
| "epoch": 0.12554481713094562, | |
| "grad_norm": 0.878971517086029, | |
| "learning_rate": 6.274277593557556e-05, | |
| "loss": 1.3365, | |
| "mean_token_accuracy": 0.7212816894054412, | |
| "num_tokens": 65347419.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "entropy": 1.734018679857254, | |
| "epoch": 0.12791358726549176, | |
| "grad_norm": 0.9191023111343384, | |
| "learning_rate": 6.392704879204168e-05, | |
| "loss": 1.3317, | |
| "mean_token_accuracy": 0.7217869812250137, | |
| "num_tokens": 66593613.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "entropy": 1.7515969347953797, | |
| "epoch": 0.1302823574000379, | |
| "grad_norm": 0.9526401162147522, | |
| "learning_rate": 6.511132164850782e-05, | |
| "loss": 1.3381, | |
| "mean_token_accuracy": 0.7212713253498078, | |
| "num_tokens": 67818628.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "entropy": 1.732736051082611, | |
| "epoch": 0.13265112753458405, | |
| "grad_norm": 0.9250634908676147, | |
| "learning_rate": 6.629559450497395e-05, | |
| "loss": 1.3267, | |
| "mean_token_accuracy": 0.7237467241287231, | |
| "num_tokens": 69063328.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "entropy": 1.7254245734214784, | |
| "epoch": 0.1350198976691302, | |
| "grad_norm": 0.8667979836463928, | |
| "learning_rate": 6.747986736144007e-05, | |
| "loss": 1.3669, | |
| "mean_token_accuracy": 0.7158471101522446, | |
| "num_tokens": 70325294.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "entropy": 1.7342670309543609, | |
| "epoch": 0.13738866780367634, | |
| "grad_norm": 0.9424638748168945, | |
| "learning_rate": 6.866414021790622e-05, | |
| "loss": 1.352, | |
| "mean_token_accuracy": 0.7186112779378891, | |
| "num_tokens": 71578584.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "entropy": 1.717711169719696, | |
| "epoch": 0.13975743793822248, | |
| "grad_norm": 0.8827985525131226, | |
| "learning_rate": 6.984841307437234e-05, | |
| "loss": 1.3708, | |
| "mean_token_accuracy": 0.7152537268400192, | |
| "num_tokens": 72849472.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "entropy": 1.7510717618465423, | |
| "epoch": 0.14212620807276863, | |
| "grad_norm": 0.8640701174736023, | |
| "learning_rate": 7.103268593083848e-05, | |
| "loss": 1.3576, | |
| "mean_token_accuracy": 0.7165306961536407, | |
| "num_tokens": 74089751.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.14212620807276863, | |
| "eval_entropy": 1.1560545717993527, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7539014699553613, | |
| "eval_num_tokens": 74089751.0, | |
| "eval_runtime": 739.5182, | |
| "eval_samples_per_second": 33.556, | |
| "eval_steps_per_second": 4.195, | |
| "step": 3000 | |
| }, | |
| { | |
| "entropy": 1.7481833016872406, | |
| "epoch": 0.14449497820731477, | |
| "grad_norm": 0.8835089206695557, | |
| "learning_rate": 7.22169587873046e-05, | |
| "loss": 1.3735, | |
| "mean_token_accuracy": 0.7145194208621979, | |
| "num_tokens": 75300868.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "entropy": 1.7272928488254546, | |
| "epoch": 0.14686374834186092, | |
| "grad_norm": 0.858995258808136, | |
| "learning_rate": 7.340123164377073e-05, | |
| "loss": 1.3456, | |
| "mean_token_accuracy": 0.7180565488338471, | |
| "num_tokens": 76542456.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "entropy": 1.7239915192127229, | |
| "epoch": 0.14923251847640706, | |
| "grad_norm": 1.038167953491211, | |
| "learning_rate": 7.458550450023685e-05, | |
| "loss": 1.3364, | |
| "mean_token_accuracy": 0.7197621566057205, | |
| "num_tokens": 77797173.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "entropy": 1.7232668161392213, | |
| "epoch": 0.1516012886109532, | |
| "grad_norm": 0.7863021492958069, | |
| "learning_rate": 7.576977735670299e-05, | |
| "loss": 1.3252, | |
| "mean_token_accuracy": 0.7219555181264877, | |
| "num_tokens": 79036677.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "entropy": 1.7575169241428374, | |
| "epoch": 0.15397005874549935, | |
| "grad_norm": 1.122582197189331, | |
| "learning_rate": 7.695405021316912e-05, | |
| "loss": 1.3513, | |
| "mean_token_accuracy": 0.7171193498373032, | |
| "num_tokens": 80257412.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "entropy": 1.7478620946407317, | |
| "epoch": 0.1563388288800455, | |
| "grad_norm": 0.8442687392234802, | |
| "learning_rate": 7.813832306963524e-05, | |
| "loss": 1.3689, | |
| "mean_token_accuracy": 0.7146556586027145, | |
| "num_tokens": 81452995.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "entropy": 1.7156465804576875, | |
| "epoch": 0.1587075990145916, | |
| "grad_norm": 0.8353444337844849, | |
| "learning_rate": 7.932259592610138e-05, | |
| "loss": 1.341, | |
| "mean_token_accuracy": 0.7198166775703431, | |
| "num_tokens": 82720102.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "entropy": 1.6997444534301758, | |
| "epoch": 0.16107636914913775, | |
| "grad_norm": 1.0969985723495483, | |
| "learning_rate": 8.050686878256751e-05, | |
| "loss": 1.3462, | |
| "mean_token_accuracy": 0.7181236177682877, | |
| "num_tokens": 83970755.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "entropy": 1.7316451609134673, | |
| "epoch": 0.1634451392836839, | |
| "grad_norm": 1.048732876777649, | |
| "learning_rate": 8.169114163903365e-05, | |
| "loss": 1.3286, | |
| "mean_token_accuracy": 0.7228594154119492, | |
| "num_tokens": 85196313.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "entropy": 1.6936505138874054, | |
| "epoch": 0.16581390941823004, | |
| "grad_norm": 0.9473629593849182, | |
| "learning_rate": 8.287541449549977e-05, | |
| "loss": 1.3178, | |
| "mean_token_accuracy": 0.7219874155521393, | |
| "num_tokens": 86444399.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "entropy": 1.7614711892604829, | |
| "epoch": 0.1681826795527762, | |
| "grad_norm": 1.0644205808639526, | |
| "learning_rate": 8.40596873519659e-05, | |
| "loss": 1.3485, | |
| "mean_token_accuracy": 0.7177842026948928, | |
| "num_tokens": 87668028.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "entropy": 1.760385752916336, | |
| "epoch": 0.17055144968732233, | |
| "grad_norm": 0.8554447293281555, | |
| "learning_rate": 8.524396020843202e-05, | |
| "loss": 1.3655, | |
| "mean_token_accuracy": 0.7148052769899368, | |
| "num_tokens": 88876622.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "entropy": 1.7391455006599426, | |
| "epoch": 0.17292021982186848, | |
| "grad_norm": 0.8997156023979187, | |
| "learning_rate": 8.642823306489816e-05, | |
| "loss": 1.3729, | |
| "mean_token_accuracy": 0.7139494162797928, | |
| "num_tokens": 90122997.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "entropy": 1.6978785967826844, | |
| "epoch": 0.17528898995641462, | |
| "grad_norm": 0.9306835532188416, | |
| "learning_rate": 8.761250592136429e-05, | |
| "loss": 1.3101, | |
| "mean_token_accuracy": 0.7250325381755829, | |
| "num_tokens": 91386702.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "entropy": 1.7522969400882722, | |
| "epoch": 0.17765776009096076, | |
| "grad_norm": 0.8477308750152588, | |
| "learning_rate": 8.879677877783041e-05, | |
| "loss": 1.3688, | |
| "mean_token_accuracy": 0.715588583946228, | |
| "num_tokens": 92610301.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "entropy": 1.7447042429447175, | |
| "epoch": 0.1800265302255069, | |
| "grad_norm": 1.0386239290237427, | |
| "learning_rate": 8.998105163429655e-05, | |
| "loss": 1.3431, | |
| "mean_token_accuracy": 0.718235713839531, | |
| "num_tokens": 93824209.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "entropy": 1.7764518535137177, | |
| "epoch": 0.18239530036005305, | |
| "grad_norm": 1.1934313774108887, | |
| "learning_rate": 9.116532449076267e-05, | |
| "loss": 1.377, | |
| "mean_token_accuracy": 0.7121944260597229, | |
| "num_tokens": 95073205.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "entropy": 1.7176523733139037, | |
| "epoch": 0.1847640704945992, | |
| "grad_norm": 0.9109567403793335, | |
| "learning_rate": 9.234959734722882e-05, | |
| "loss": 1.3269, | |
| "mean_token_accuracy": 0.7219234961271286, | |
| "num_tokens": 96282593.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "entropy": 1.7826895797252655, | |
| "epoch": 0.18713284062914534, | |
| "grad_norm": 0.8581134080886841, | |
| "learning_rate": 9.353387020369494e-05, | |
| "loss": 1.4049, | |
| "mean_token_accuracy": 0.7105602127313614, | |
| "num_tokens": 97491980.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "entropy": 1.7406088852882384, | |
| "epoch": 0.18950161076369149, | |
| "grad_norm": 0.8592116236686707, | |
| "learning_rate": 9.471814306016107e-05, | |
| "loss": 1.3514, | |
| "mean_token_accuracy": 0.7164700603485108, | |
| "num_tokens": 98726298.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.18950161076369149, | |
| "eval_entropy": 1.1647965725124612, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7535035212354303, | |
| "eval_num_tokens": 98726298.0, | |
| "eval_runtime": 741.2483, | |
| "eval_samples_per_second": 33.477, | |
| "eval_steps_per_second": 4.185, | |
| "step": 4000 | |
| }, | |
| { | |
| "entropy": 1.7602094197273255, | |
| "epoch": 0.19187038089823763, | |
| "grad_norm": 1.0990040302276611, | |
| "learning_rate": 9.590241591662719e-05, | |
| "loss": 1.3787, | |
| "mean_token_accuracy": 0.7126823592185975, | |
| "num_tokens": 99961531.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "entropy": 1.736720016002655, | |
| "epoch": 0.19423915103278377, | |
| "grad_norm": 0.886349081993103, | |
| "learning_rate": 9.708668877309333e-05, | |
| "loss": 1.3486, | |
| "mean_token_accuracy": 0.7173503488302231, | |
| "num_tokens": 101187732.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "entropy": 1.772811095714569, | |
| "epoch": 0.19660792116732992, | |
| "grad_norm": 1.147083044052124, | |
| "learning_rate": 9.827096162955945e-05, | |
| "loss": 1.4016, | |
| "mean_token_accuracy": 0.7078080683946609, | |
| "num_tokens": 102372338.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "entropy": 1.737178726196289, | |
| "epoch": 0.19897669130187606, | |
| "grad_norm": 1.1906094551086426, | |
| "learning_rate": 9.945523448602558e-05, | |
| "loss": 1.3823, | |
| "mean_token_accuracy": 0.7129582542181016, | |
| "num_tokens": 103594020.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "entropy": 1.739368189573288, | |
| "epoch": 0.2013454614364222, | |
| "grad_norm": 0.8465049862861633, | |
| "learning_rate": 9.999987539454218e-05, | |
| "loss": 1.384, | |
| "mean_token_accuracy": 0.7121477049589157, | |
| "num_tokens": 104839897.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "entropy": 1.728913918733597, | |
| "epoch": 0.20371423157096835, | |
| "grad_norm": 1.0396977663040161, | |
| "learning_rate": 9.999898657946416e-05, | |
| "loss": 1.4049, | |
| "mean_token_accuracy": 0.709805850982666, | |
| "num_tokens": 106084752.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "entropy": 1.7548469495773316, | |
| "epoch": 0.2060830017055145, | |
| "grad_norm": 1.1842293739318848, | |
| "learning_rate": 9.999724314980077e-05, | |
| "loss": 1.3883, | |
| "mean_token_accuracy": 0.7103092032670975, | |
| "num_tokens": 107308027.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "entropy": 1.7609304535388945, | |
| "epoch": 0.20845177184006064, | |
| "grad_norm": 0.8410583138465881, | |
| "learning_rate": 9.999464513535188e-05, | |
| "loss": 1.3632, | |
| "mean_token_accuracy": 0.7141008460521698, | |
| "num_tokens": 108532695.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "entropy": 1.7612316942214965, | |
| "epoch": 0.21082054197460678, | |
| "grad_norm": 0.9074947237968445, | |
| "learning_rate": 9.999119258052436e-05, | |
| "loss": 1.3728, | |
| "mean_token_accuracy": 0.7128197175264358, | |
| "num_tokens": 109768914.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "entropy": 1.7695635759830475, | |
| "epoch": 0.21318931210915293, | |
| "grad_norm": 0.9042698740959167, | |
| "learning_rate": 9.99868855443315e-05, | |
| "loss": 1.3519, | |
| "mean_token_accuracy": 0.7166950708627701, | |
| "num_tokens": 110984584.0, | |
| "step": 4500 | |
| }, | |
| { | |
| "entropy": 1.7432436084747314, | |
| "epoch": 0.21555808224369907, | |
| "grad_norm": 1.2357442378997803, | |
| "learning_rate": 9.99817241003919e-05, | |
| "loss": 1.334, | |
| "mean_token_accuracy": 0.7201163339614868, | |
| "num_tokens": 112235932.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "entropy": 1.7642862284183503, | |
| "epoch": 0.21792685237824522, | |
| "grad_norm": 1.0687198638916016, | |
| "learning_rate": 9.997570833692829e-05, | |
| "loss": 1.3798, | |
| "mean_token_accuracy": 0.7113319665193558, | |
| "num_tokens": 113455353.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "entropy": 1.7590344095230102, | |
| "epoch": 0.22029562251279136, | |
| "grad_norm": 1.1026127338409424, | |
| "learning_rate": 9.996883835676589e-05, | |
| "loss": 1.3825, | |
| "mean_token_accuracy": 0.7098899132013321, | |
| "num_tokens": 114694421.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "entropy": 1.7447860455513, | |
| "epoch": 0.2226643926473375, | |
| "grad_norm": 1.0826524496078491, | |
| "learning_rate": 9.99611142773308e-05, | |
| "loss": 1.3484, | |
| "mean_token_accuracy": 0.7184046697616577, | |
| "num_tokens": 115913968.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "entropy": 1.7905651438236236, | |
| "epoch": 0.22503316278188365, | |
| "grad_norm": 1.1828806400299072, | |
| "learning_rate": 9.995253623064793e-05, | |
| "loss": 1.4072, | |
| "mean_token_accuracy": 0.7065826892852783, | |
| "num_tokens": 117100168.0, | |
| "step": 4750 | |
| }, | |
| { | |
| "entropy": 1.7732587778568267, | |
| "epoch": 0.2274019329164298, | |
| "grad_norm": 0.8388417959213257, | |
| "learning_rate": 9.994310436333872e-05, | |
| "loss": 1.3876, | |
| "mean_token_accuracy": 0.7099131292104721, | |
| "num_tokens": 118323063.0, | |
| "step": 4800 | |
| }, | |
| { | |
| "entropy": 1.7498207116127014, | |
| "epoch": 0.22977070305097594, | |
| "grad_norm": 0.9928333759307861, | |
| "learning_rate": 9.993281883661866e-05, | |
| "loss": 1.3248, | |
| "mean_token_accuracy": 0.7209601724147796, | |
| "num_tokens": 119542247.0, | |
| "step": 4850 | |
| }, | |
| { | |
| "entropy": 1.7807526588439941, | |
| "epoch": 0.23213947318552208, | |
| "grad_norm": 1.180126428604126, | |
| "learning_rate": 9.992167982629455e-05, | |
| "loss": 1.3807, | |
| "mean_token_accuracy": 0.7116306042671203, | |
| "num_tokens": 120783656.0, | |
| "step": 4900 | |
| }, | |
| { | |
| "entropy": 1.7660968756675721, | |
| "epoch": 0.23450824332006823, | |
| "grad_norm": 1.035225749015808, | |
| "learning_rate": 9.990968752276143e-05, | |
| "loss": 1.3906, | |
| "mean_token_accuracy": 0.7096653944253921, | |
| "num_tokens": 122014053.0, | |
| "step": 4950 | |
| }, | |
| { | |
| "entropy": 1.7683662581443786, | |
| "epoch": 0.23687701345461437, | |
| "grad_norm": 0.8732820153236389, | |
| "learning_rate": 9.989684213099944e-05, | |
| "loss": 1.363, | |
| "mean_token_accuracy": 0.7147561728954315, | |
| "num_tokens": 123247491.0, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.23687701345461437, | |
| "eval_entropy": 1.1902209509963915, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7525513527433308, | |
| "eval_num_tokens": 123247491.0, | |
| "eval_runtime": 749.4439, | |
| "eval_samples_per_second": 33.111, | |
| "eval_steps_per_second": 4.139, | |
| "step": 5000 | |
| }, | |
| { | |
| "entropy": 1.7724631798267365, | |
| "epoch": 0.23924578358916052, | |
| "grad_norm": 1.2394686937332153, | |
| "learning_rate": 9.988314387057021e-05, | |
| "loss": 1.4029, | |
| "mean_token_accuracy": 0.7083960479497909, | |
| "num_tokens": 124486744.0, | |
| "step": 5050 | |
| }, | |
| { | |
| "entropy": 1.7794454956054688, | |
| "epoch": 0.24161455372370666, | |
| "grad_norm": 1.031551718711853, | |
| "learning_rate": 9.986859297561312e-05, | |
| "loss": 1.3872, | |
| "mean_token_accuracy": 0.7082083231210708, | |
| "num_tokens": 125689651.0, | |
| "step": 5100 | |
| }, | |
| { | |
| "entropy": 1.8115082442760468, | |
| "epoch": 0.2439833238582528, | |
| "grad_norm": 1.238067388534546, | |
| "learning_rate": 9.985318969484139e-05, | |
| "loss": 1.4075, | |
| "mean_token_accuracy": 0.7077406024932862, | |
| "num_tokens": 126912476.0, | |
| "step": 5150 | |
| }, | |
| { | |
| "entropy": 1.7362813007831575, | |
| "epoch": 0.24635209399279895, | |
| "grad_norm": 0.9080651998519897, | |
| "learning_rate": 9.983693429153769e-05, | |
| "loss": 1.3715, | |
| "mean_token_accuracy": 0.7125364172458649, | |
| "num_tokens": 128141273.0, | |
| "step": 5200 | |
| }, | |
| { | |
| "entropy": 1.7462396609783173, | |
| "epoch": 0.2487208641273451, | |
| "grad_norm": 0.9258147478103638, | |
| "learning_rate": 9.981982704354978e-05, | |
| "loss": 1.3539, | |
| "mean_token_accuracy": 0.7153694558143616, | |
| "num_tokens": 129367296.0, | |
| "step": 5250 | |
| }, | |
| { | |
| "entropy": 1.7526134848594666, | |
| "epoch": 0.25108963426189124, | |
| "grad_norm": 1.0741764307022095, | |
| "learning_rate": 9.980186824328563e-05, | |
| "loss": 1.3639, | |
| "mean_token_accuracy": 0.7122530096769333, | |
| "num_tokens": 130622992.0, | |
| "step": 5300 | |
| }, | |
| { | |
| "entropy": 1.8136487221717834, | |
| "epoch": 0.25345840439643735, | |
| "grad_norm": 1.079744815826416, | |
| "learning_rate": 9.978305819770852e-05, | |
| "loss": 1.3934, | |
| "mean_token_accuracy": 0.7090709501504898, | |
| "num_tokens": 131844647.0, | |
| "step": 5350 | |
| }, | |
| { | |
| "entropy": 1.7428915858268739, | |
| "epoch": 0.2558271745309835, | |
| "grad_norm": 1.0281189680099487, | |
| "learning_rate": 9.976339722833178e-05, | |
| "loss": 1.357, | |
| "mean_token_accuracy": 0.7154221564531327, | |
| "num_tokens": 133100147.0, | |
| "step": 5400 | |
| }, | |
| { | |
| "entropy": 1.8012803518772125, | |
| "epoch": 0.25819594466552964, | |
| "grad_norm": 1.2619256973266602, | |
| "learning_rate": 9.974288567121322e-05, | |
| "loss": 1.4075, | |
| "mean_token_accuracy": 0.7054576027393341, | |
| "num_tokens": 134303236.0, | |
| "step": 5450 | |
| }, | |
| { | |
| "entropy": 1.7738253235816956, | |
| "epoch": 0.2605647148000758, | |
| "grad_norm": 1.0344356298446655, | |
| "learning_rate": 9.972152387694946e-05, | |
| "loss": 1.3516, | |
| "mean_token_accuracy": 0.7141925716400146, | |
| "num_tokens": 135527480.0, | |
| "step": 5500 | |
| }, | |
| { | |
| "entropy": 1.7168458807468414, | |
| "epoch": 0.26293348493462193, | |
| "grad_norm": 1.062092661857605, | |
| "learning_rate": 9.969931221066992e-05, | |
| "loss": 1.3439, | |
| "mean_token_accuracy": 0.7171407097578049, | |
| "num_tokens": 136777268.0, | |
| "step": 5550 | |
| }, | |
| { | |
| "entropy": 1.7599689650535584, | |
| "epoch": 0.2653022550691681, | |
| "grad_norm": 0.9637967348098755, | |
| "learning_rate": 9.96762510520306e-05, | |
| "loss": 1.3794, | |
| "mean_token_accuracy": 0.7112497627735138, | |
| "num_tokens": 137993796.0, | |
| "step": 5600 | |
| }, | |
| { | |
| "entropy": 1.7565060186386108, | |
| "epoch": 0.2676710252037142, | |
| "grad_norm": 0.9759653806686401, | |
| "learning_rate": 9.965234079520751e-05, | |
| "loss": 1.3797, | |
| "mean_token_accuracy": 0.7126868903636933, | |
| "num_tokens": 139236029.0, | |
| "step": 5650 | |
| }, | |
| { | |
| "entropy": 1.7332323002815246, | |
| "epoch": 0.2700397953382604, | |
| "grad_norm": 1.1588467359542847, | |
| "learning_rate": 9.962758184889003e-05, | |
| "loss": 1.3803, | |
| "mean_token_accuracy": 0.710934864282608, | |
| "num_tokens": 140453476.0, | |
| "step": 5700 | |
| }, | |
| { | |
| "entropy": 1.763832380771637, | |
| "epoch": 0.2724085654728065, | |
| "grad_norm": 0.8555989861488342, | |
| "learning_rate": 9.960197463627388e-05, | |
| "loss": 1.3641, | |
| "mean_token_accuracy": 0.7138992995023727, | |
| "num_tokens": 141647360.0, | |
| "step": 5750 | |
| }, | |
| { | |
| "entropy": 1.7501352691650391, | |
| "epoch": 0.2747773356073527, | |
| "grad_norm": 0.9515321850776672, | |
| "learning_rate": 9.957551959505387e-05, | |
| "loss": 1.4013, | |
| "mean_token_accuracy": 0.7074063158035279, | |
| "num_tokens": 142881658.0, | |
| "step": 5800 | |
| }, | |
| { | |
| "entropy": 1.7732372057437897, | |
| "epoch": 0.2771461057418988, | |
| "grad_norm": 1.0687644481658936, | |
| "learning_rate": 9.954821717741643e-05, | |
| "loss": 1.3726, | |
| "mean_token_accuracy": 0.7110266560316085, | |
| "num_tokens": 144097656.0, | |
| "step": 5850 | |
| }, | |
| { | |
| "entropy": 1.8280004715919496, | |
| "epoch": 0.27951487587644497, | |
| "grad_norm": 0.9914586544036865, | |
| "learning_rate": 9.952006785003194e-05, | |
| "loss": 1.4141, | |
| "mean_token_accuracy": 0.7043382048606872, | |
| "num_tokens": 145304660.0, | |
| "step": 5900 | |
| }, | |
| { | |
| "entropy": 1.7428024232387542, | |
| "epoch": 0.2818836460109911, | |
| "grad_norm": 0.9063569903373718, | |
| "learning_rate": 9.949107209404665e-05, | |
| "loss": 1.3871, | |
| "mean_token_accuracy": 0.7085927510261536, | |
| "num_tokens": 146556399.0, | |
| "step": 5950 | |
| }, | |
| { | |
| "entropy": 1.7348086619377137, | |
| "epoch": 0.28425241614553726, | |
| "grad_norm": 1.1388063430786133, | |
| "learning_rate": 9.946123040507451e-05, | |
| "loss": 1.4059, | |
| "mean_token_accuracy": 0.7068395394086838, | |
| "num_tokens": 147781528.0, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.28425241614553726, | |
| "eval_entropy": 1.1620629866956358, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7536570436719308, | |
| "eval_num_tokens": 147781528.0, | |
| "eval_runtime": 742.238, | |
| "eval_samples_per_second": 33.433, | |
| "eval_steps_per_second": 4.179, | |
| "step": 6000 | |
| }, | |
| { | |
| "entropy": 1.7318350422382354, | |
| "epoch": 0.2866211862800834, | |
| "grad_norm": 1.0696161985397339, | |
| "learning_rate": 9.943054329318873e-05, | |
| "loss": 1.3689, | |
| "mean_token_accuracy": 0.7137463581562042, | |
| "num_tokens": 148993131.0, | |
| "step": 6050 | |
| }, | |
| { | |
| "entropy": 1.7341815280914306, | |
| "epoch": 0.28898995641462955, | |
| "grad_norm": 1.211084246635437, | |
| "learning_rate": 9.9399011282913e-05, | |
| "loss": 1.3396, | |
| "mean_token_accuracy": 0.7190863400697708, | |
| "num_tokens": 150231439.0, | |
| "step": 6100 | |
| }, | |
| { | |
| "entropy": 1.8086679303646087, | |
| "epoch": 0.29135872654917566, | |
| "grad_norm": 0.997982919216156, | |
| "learning_rate": 9.936663491321256e-05, | |
| "loss": 1.3991, | |
| "mean_token_accuracy": 0.7076171565055848, | |
| "num_tokens": 151425872.0, | |
| "step": 6150 | |
| }, | |
| { | |
| "entropy": 1.7646045112609863, | |
| "epoch": 0.29372749668372183, | |
| "grad_norm": 1.0052849054336548, | |
| "learning_rate": 9.9333414737485e-05, | |
| "loss": 1.3833, | |
| "mean_token_accuracy": 0.7115501266717911, | |
| "num_tokens": 152649154.0, | |
| "step": 6200 | |
| }, | |
| { | |
| "entropy": 1.7603888380527497, | |
| "epoch": 0.29609626681826795, | |
| "grad_norm": 1.1485621929168701, | |
| "learning_rate": 9.929935132355075e-05, | |
| "loss": 1.3774, | |
| "mean_token_accuracy": 0.7107777494192123, | |
| "num_tokens": 153909546.0, | |
| "step": 6250 | |
| }, | |
| { | |
| "entropy": 1.810437490940094, | |
| "epoch": 0.2984650369528141, | |
| "grad_norm": 1.1413508653640747, | |
| "learning_rate": 9.926444525364341e-05, | |
| "loss": 1.378, | |
| "mean_token_accuracy": 0.711902762055397, | |
| "num_tokens": 155120315.0, | |
| "step": 6300 | |
| }, | |
| { | |
| "entropy": 1.7656940996646882, | |
| "epoch": 0.30083380708736024, | |
| "grad_norm": 0.8839899897575378, | |
| "learning_rate": 9.922869712439981e-05, | |
| "loss": 1.3904, | |
| "mean_token_accuracy": 0.7087905770540237, | |
| "num_tokens": 156368001.0, | |
| "step": 6350 | |
| }, | |
| { | |
| "entropy": 1.7679949700832367, | |
| "epoch": 0.3032025772219064, | |
| "grad_norm": 1.285138726234436, | |
| "learning_rate": 9.91921075468498e-05, | |
| "loss": 1.3891, | |
| "mean_token_accuracy": 0.7098447853326797, | |
| "num_tokens": 157568259.0, | |
| "step": 6400 | |
| }, | |
| { | |
| "entropy": 1.775840550661087, | |
| "epoch": 0.3055713473564525, | |
| "grad_norm": 1.10303795337677, | |
| "learning_rate": 9.915467714640578e-05, | |
| "loss": 1.3918, | |
| "mean_token_accuracy": 0.7079905581474304, | |
| "num_tokens": 158791523.0, | |
| "step": 6450 | |
| }, | |
| { | |
| "entropy": 1.7338063383102418, | |
| "epoch": 0.3079401174909987, | |
| "grad_norm": 1.0604420900344849, | |
| "learning_rate": 9.911640656285203e-05, | |
| "loss": 1.3554, | |
| "mean_token_accuracy": 0.714795948266983, | |
| "num_tokens": 160073528.0, | |
| "step": 6500 | |
| }, | |
| { | |
| "entropy": 1.7426982474327088, | |
| "epoch": 0.3103088876255448, | |
| "grad_norm": 0.9847440123558044, | |
| "learning_rate": 9.907729645033379e-05, | |
| "loss": 1.3512, | |
| "mean_token_accuracy": 0.7151961398124694, | |
| "num_tokens": 161312761.0, | |
| "step": 6550 | |
| }, | |
| { | |
| "entropy": 1.8005949878692626, | |
| "epoch": 0.312677657760091, | |
| "grad_norm": 1.2713630199432373, | |
| "learning_rate": 9.903734747734607e-05, | |
| "loss": 1.3597, | |
| "mean_token_accuracy": 0.7128104782104492, | |
| "num_tokens": 162512008.0, | |
| "step": 6600 | |
| }, | |
| { | |
| "entropy": 1.8041615283489227, | |
| "epoch": 0.3150464278946371, | |
| "grad_norm": 0.99453204870224, | |
| "learning_rate": 9.899656032672221e-05, | |
| "loss": 1.3642, | |
| "mean_token_accuracy": 0.7122291630506515, | |
| "num_tokens": 163702726.0, | |
| "step": 6650 | |
| }, | |
| { | |
| "entropy": 1.7597569704055787, | |
| "epoch": 0.3174151980291832, | |
| "grad_norm": 1.2227306365966797, | |
| "learning_rate": 9.895493569562221e-05, | |
| "loss": 1.3276, | |
| "mean_token_accuracy": 0.7197510945796967, | |
| "num_tokens": 164943131.0, | |
| "step": 6700 | |
| }, | |
| { | |
| "entropy": 1.7358013463020325, | |
| "epoch": 0.3197839681637294, | |
| "grad_norm": 1.1400933265686035, | |
| "learning_rate": 9.891247429552082e-05, | |
| "loss": 1.384, | |
| "mean_token_accuracy": 0.7089168894290924, | |
| "num_tokens": 166167321.0, | |
| "step": 6750 | |
| }, | |
| { | |
| "entropy": 1.7530862140655517, | |
| "epoch": 0.3221527382982755, | |
| "grad_norm": 1.2036629915237427, | |
| "learning_rate": 9.886917685219541e-05, | |
| "loss": 1.3398, | |
| "mean_token_accuracy": 0.7184527868032455, | |
| "num_tokens": 167397732.0, | |
| "step": 6800 | |
| }, | |
| { | |
| "entropy": 1.7854076647758483, | |
| "epoch": 0.3245215084328217, | |
| "grad_norm": 1.2987496852874756, | |
| "learning_rate": 9.88250441057135e-05, | |
| "loss": 1.394, | |
| "mean_token_accuracy": 0.7086141872406005, | |
| "num_tokens": 168618527.0, | |
| "step": 6850 | |
| }, | |
| { | |
| "entropy": 1.7509974801540376, | |
| "epoch": 0.3268902785673678, | |
| "grad_norm": 1.056751012802124, | |
| "learning_rate": 9.878007681042014e-05, | |
| "loss": 1.3389, | |
| "mean_token_accuracy": 0.7182145416736603, | |
| "num_tokens": 169856441.0, | |
| "step": 6900 | |
| }, | |
| { | |
| "entropy": 1.740920853614807, | |
| "epoch": 0.32925904870191397, | |
| "grad_norm": 1.1730940341949463, | |
| "learning_rate": 9.873427573492507e-05, | |
| "loss": 1.3387, | |
| "mean_token_accuracy": 0.718420038819313, | |
| "num_tokens": 171123051.0, | |
| "step": 6950 | |
| }, | |
| { | |
| "entropy": 1.7686040151119231, | |
| "epoch": 0.3316278188364601, | |
| "grad_norm": 1.139112949371338, | |
| "learning_rate": 9.868764166208946e-05, | |
| "loss": 1.373, | |
| "mean_token_accuracy": 0.7120095008611679, | |
| "num_tokens": 172342540.0, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.3316278188364601, | |
| "eval_entropy": 1.1930562926445525, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7542100738587493, | |
| "eval_num_tokens": 172342540.0, | |
| "eval_runtime": 741.3646, | |
| "eval_samples_per_second": 33.472, | |
| "eval_steps_per_second": 4.184, | |
| "step": 7000 | |
| }, | |
| { | |
| "entropy": 1.7598109781742095, | |
| "epoch": 0.33399658897100626, | |
| "grad_norm": 1.2180997133255005, | |
| "learning_rate": 9.864017538901267e-05, | |
| "loss": 1.4032, | |
| "mean_token_accuracy": 0.7083274441957473, | |
| "num_tokens": 173589426.0, | |
| "step": 7050 | |
| }, | |
| { | |
| "entropy": 1.7652622890472411, | |
| "epoch": 0.3363653591055524, | |
| "grad_norm": 1.3037455081939697, | |
| "learning_rate": 9.859187772701853e-05, | |
| "loss": 1.369, | |
| "mean_token_accuracy": 0.7140497547388077, | |
| "num_tokens": 174848839.0, | |
| "step": 7100 | |
| }, | |
| { | |
| "entropy": 1.7913592505455016, | |
| "epoch": 0.33873412924009855, | |
| "grad_norm": 1.1562169790267944, | |
| "learning_rate": 9.854274950164149e-05, | |
| "loss": 1.3837, | |
| "mean_token_accuracy": 0.7107916122674942, | |
| "num_tokens": 176055919.0, | |
| "step": 7150 | |
| }, | |
| { | |
| "entropy": 1.7908745443820953, | |
| "epoch": 0.34110289937464466, | |
| "grad_norm": 1.2559897899627686, | |
| "learning_rate": 9.849279155261252e-05, | |
| "loss": 1.3907, | |
| "mean_token_accuracy": 0.7087368202209473, | |
| "num_tokens": 177277309.0, | |
| "step": 7200 | |
| }, | |
| { | |
| "entropy": 1.753930516242981, | |
| "epoch": 0.34347166950919084, | |
| "grad_norm": 0.9901047348976135, | |
| "learning_rate": 9.844200473384479e-05, | |
| "loss": 1.3527, | |
| "mean_token_accuracy": 0.716761229634285, | |
| "num_tokens": 178518563.0, | |
| "step": 7250 | |
| }, | |
| { | |
| "entropy": 1.739516668319702, | |
| "epoch": 0.34584043964373695, | |
| "grad_norm": 1.2106683254241943, | |
| "learning_rate": 9.8390389913419e-05, | |
| "loss": 1.3725, | |
| "mean_token_accuracy": 0.7121469175815582, | |
| "num_tokens": 179742683.0, | |
| "step": 7300 | |
| }, | |
| { | |
| "entropy": 1.756061052083969, | |
| "epoch": 0.3482092097782831, | |
| "grad_norm": 1.0457638502120972, | |
| "learning_rate": 9.833794797356861e-05, | |
| "loss": 1.3701, | |
| "mean_token_accuracy": 0.7125989294052124, | |
| "num_tokens": 180940666.0, | |
| "step": 7350 | |
| }, | |
| { | |
| "entropy": 1.7689040386676789, | |
| "epoch": 0.35057797991282924, | |
| "grad_norm": 0.9141308069229126, | |
| "learning_rate": 9.828467981066472e-05, | |
| "loss": 1.3718, | |
| "mean_token_accuracy": 0.7115379917621613, | |
| "num_tokens": 182184090.0, | |
| "step": 7400 | |
| }, | |
| { | |
| "entropy": 1.7089093339443207, | |
| "epoch": 0.3529467500473754, | |
| "grad_norm": 0.8629412055015564, | |
| "learning_rate": 9.823058633520074e-05, | |
| "loss": 1.3324, | |
| "mean_token_accuracy": 0.7187563890218734, | |
| "num_tokens": 183446222.0, | |
| "step": 7450 | |
| }, | |
| { | |
| "entropy": 1.776807938814163, | |
| "epoch": 0.35531552018192153, | |
| "grad_norm": 0.9498484134674072, | |
| "learning_rate": 9.817566847177689e-05, | |
| "loss": 1.375, | |
| "mean_token_accuracy": 0.7121974611282349, | |
| "num_tokens": 184676077.0, | |
| "step": 7500 | |
| }, | |
| { | |
| "entropy": 1.8064971625804902, | |
| "epoch": 0.3576842903164677, | |
| "grad_norm": 1.0395594835281372, | |
| "learning_rate": 9.811992715908434e-05, | |
| "loss": 1.3748, | |
| "mean_token_accuracy": 0.7101496076583862, | |
| "num_tokens": 185903667.0, | |
| "step": 7550 | |
| }, | |
| { | |
| "entropy": 1.756836792230606, | |
| "epoch": 0.3600530604510138, | |
| "grad_norm": 0.9577502608299255, | |
| "learning_rate": 9.806336334988918e-05, | |
| "loss": 1.3556, | |
| "mean_token_accuracy": 0.7159949284791947, | |
| "num_tokens": 187154538.0, | |
| "step": 7600 | |
| }, | |
| { | |
| "entropy": 1.7699120783805846, | |
| "epoch": 0.36242183058556, | |
| "grad_norm": 1.4034383296966553, | |
| "learning_rate": 9.800597801101612e-05, | |
| "loss": 1.3911, | |
| "mean_token_accuracy": 0.7097045290470123, | |
| "num_tokens": 188378482.0, | |
| "step": 7650 | |
| }, | |
| { | |
| "entropy": 1.787111645936966, | |
| "epoch": 0.3647906007201061, | |
| "grad_norm": 1.0781787633895874, | |
| "learning_rate": 9.794777212333202e-05, | |
| "loss": 1.3937, | |
| "mean_token_accuracy": 0.7096772521734238, | |
| "num_tokens": 189611171.0, | |
| "step": 7700 | |
| }, | |
| { | |
| "entropy": 1.777391802072525, | |
| "epoch": 0.3671593708546523, | |
| "grad_norm": 1.1259112358093262, | |
| "learning_rate": 9.7888746681729e-05, | |
| "loss": 1.3448, | |
| "mean_token_accuracy": 0.7169349992275238, | |
| "num_tokens": 190834562.0, | |
| "step": 7750 | |
| }, | |
| { | |
| "entropy": 1.7815845644474029, | |
| "epoch": 0.3695281409891984, | |
| "grad_norm": 1.1464273929595947, | |
| "learning_rate": 9.782890269510765e-05, | |
| "loss": 1.4057, | |
| "mean_token_accuracy": 0.7066523498296737, | |
| "num_tokens": 192054556.0, | |
| "step": 7800 | |
| }, | |
| { | |
| "entropy": 1.7850996911525727, | |
| "epoch": 0.37189691112374457, | |
| "grad_norm": 1.0448256731033325, | |
| "learning_rate": 9.776824118635952e-05, | |
| "loss": 1.3829, | |
| "mean_token_accuracy": 0.7095517975091934, | |
| "num_tokens": 193268475.0, | |
| "step": 7850 | |
| }, | |
| { | |
| "entropy": 1.8061986804008483, | |
| "epoch": 0.3742656812582907, | |
| "grad_norm": 0.9750792384147644, | |
| "learning_rate": 9.770676319234984e-05, | |
| "loss": 1.3863, | |
| "mean_token_accuracy": 0.7090413582324981, | |
| "num_tokens": 194477246.0, | |
| "step": 7900 | |
| }, | |
| { | |
| "entropy": 1.7675806987285614, | |
| "epoch": 0.37663445139283686, | |
| "grad_norm": 1.0662715435028076, | |
| "learning_rate": 9.764446976389974e-05, | |
| "loss": 1.3617, | |
| "mean_token_accuracy": 0.712408259510994, | |
| "num_tokens": 195727604.0, | |
| "step": 7950 | |
| }, | |
| { | |
| "entropy": 1.7661338579654693, | |
| "epoch": 0.37900322152738297, | |
| "grad_norm": 1.0620079040527344, | |
| "learning_rate": 9.758136196576822e-05, | |
| "loss": 1.3594, | |
| "mean_token_accuracy": 0.7141281938552857, | |
| "num_tokens": 196957775.0, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.37900322152738297, | |
| "eval_entropy": 1.1821323013705334, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7540838839660376, | |
| "eval_num_tokens": 196957775.0, | |
| "eval_runtime": 746.7244, | |
| "eval_samples_per_second": 33.232, | |
| "eval_steps_per_second": 4.154, | |
| "step": 8000 | |
| }, | |
| { | |
| "entropy": 1.8103419041633606, | |
| "epoch": 0.38137199166192914, | |
| "grad_norm": 0.9513231515884399, | |
| "learning_rate": 9.751744087663406e-05, | |
| "loss": 1.3912, | |
| "mean_token_accuracy": 0.7097796177864075, | |
| "num_tokens": 198135404.0, | |
| "step": 8050 | |
| }, | |
| { | |
| "entropy": 1.7960492491722106, | |
| "epoch": 0.38374076179647526, | |
| "grad_norm": 1.0502028465270996, | |
| "learning_rate": 9.74527075890773e-05, | |
| "loss": 1.4075, | |
| "mean_token_accuracy": 0.7074997735023498, | |
| "num_tokens": 199322966.0, | |
| "step": 8100 | |
| }, | |
| { | |
| "entropy": 1.8010617554187776, | |
| "epoch": 0.38610953193102143, | |
| "grad_norm": 1.0754374265670776, | |
| "learning_rate": 9.73871632095606e-05, | |
| "loss": 1.3893, | |
| "mean_token_accuracy": 0.7116775345802308, | |
| "num_tokens": 200538368.0, | |
| "step": 8150 | |
| }, | |
| { | |
| "entropy": 1.7480302667617797, | |
| "epoch": 0.38847830206556755, | |
| "grad_norm": 1.074485421180725, | |
| "learning_rate": 9.732080885841031e-05, | |
| "loss": 1.3824, | |
| "mean_token_accuracy": 0.7114830583333969, | |
| "num_tokens": 201768017.0, | |
| "step": 8200 | |
| }, | |
| { | |
| "entropy": 1.7346595871448516, | |
| "epoch": 0.3908470722001137, | |
| "grad_norm": 1.2857214212417603, | |
| "learning_rate": 9.725364566979737e-05, | |
| "loss": 1.3483, | |
| "mean_token_accuracy": 0.7171267950534821, | |
| "num_tokens": 203001309.0, | |
| "step": 8250 | |
| }, | |
| { | |
| "entropy": 1.7614091503620148, | |
| "epoch": 0.39321584233465984, | |
| "grad_norm": 0.9842163324356079, | |
| "learning_rate": 9.718567479171784e-05, | |
| "loss": 1.3712, | |
| "mean_token_accuracy": 0.7125260305404663, | |
| "num_tokens": 204234311.0, | |
| "step": 8300 | |
| }, | |
| { | |
| "entropy": 1.7672381138801574, | |
| "epoch": 0.395584612469206, | |
| "grad_norm": 1.098926067352295, | |
| "learning_rate": 9.711689738597335e-05, | |
| "loss": 1.4068, | |
| "mean_token_accuracy": 0.7051201003789902, | |
| "num_tokens": 205440916.0, | |
| "step": 8350 | |
| }, | |
| { | |
| "entropy": 1.7645212149620055, | |
| "epoch": 0.3979533826037521, | |
| "grad_norm": 1.0630714893341064, | |
| "learning_rate": 9.70473146281512e-05, | |
| "loss": 1.3971, | |
| "mean_token_accuracy": 0.7092112845182419, | |
| "num_tokens": 206679396.0, | |
| "step": 8400 | |
| }, | |
| { | |
| "entropy": 1.7202996456623076, | |
| "epoch": 0.4003221527382983, | |
| "grad_norm": 0.9493738412857056, | |
| "learning_rate": 9.697692770760431e-05, | |
| "loss": 1.349, | |
| "mean_token_accuracy": 0.7158361315727234, | |
| "num_tokens": 207946846.0, | |
| "step": 8450 | |
| }, | |
| { | |
| "entropy": 1.7327898812294007, | |
| "epoch": 0.4026909228728444, | |
| "grad_norm": 0.8810617327690125, | |
| "learning_rate": 9.690573782743082e-05, | |
| "loss": 1.3631, | |
| "mean_token_accuracy": 0.7150676685571671, | |
| "num_tokens": 209162939.0, | |
| "step": 8500 | |
| }, | |
| { | |
| "entropy": 1.7277316284179687, | |
| "epoch": 0.4050596930073906, | |
| "grad_norm": 1.0136702060699463, | |
| "learning_rate": 9.683374620445361e-05, | |
| "loss": 1.3714, | |
| "mean_token_accuracy": 0.7120017749071121, | |
| "num_tokens": 210427784.0, | |
| "step": 8550 | |
| }, | |
| { | |
| "entropy": 1.7886472380161285, | |
| "epoch": 0.4074284631419367, | |
| "grad_norm": 1.0549664497375488, | |
| "learning_rate": 9.676095406919943e-05, | |
| "loss": 1.3664, | |
| "mean_token_accuracy": 0.7133744984865189, | |
| "num_tokens": 211638614.0, | |
| "step": 8600 | |
| }, | |
| { | |
| "entropy": 1.747572809457779, | |
| "epoch": 0.4097972332764829, | |
| "grad_norm": 1.1670211553573608, | |
| "learning_rate": 9.668736266587792e-05, | |
| "loss": 1.3495, | |
| "mean_token_accuracy": 0.7146046167612076, | |
| "num_tokens": 212839094.0, | |
| "step": 8650 | |
| }, | |
| { | |
| "entropy": 1.7699014341831207, | |
| "epoch": 0.412166003411029, | |
| "grad_norm": 1.0434460639953613, | |
| "learning_rate": 9.66129732523603e-05, | |
| "loss": 1.3686, | |
| "mean_token_accuracy": 0.713732448220253, | |
| "num_tokens": 214078185.0, | |
| "step": 8700 | |
| }, | |
| { | |
| "entropy": 1.7851570510864259, | |
| "epoch": 0.41453477354557516, | |
| "grad_norm": 1.0788432359695435, | |
| "learning_rate": 9.653778710015788e-05, | |
| "loss": 1.3735, | |
| "mean_token_accuracy": 0.7115869015455246, | |
| "num_tokens": 215291596.0, | |
| "step": 8750 | |
| }, | |
| { | |
| "entropy": 1.771958166360855, | |
| "epoch": 0.4169035436801213, | |
| "grad_norm": 0.9727463722229004, | |
| "learning_rate": 9.646180549440038e-05, | |
| "loss": 1.3858, | |
| "mean_token_accuracy": 0.7092594999074936, | |
| "num_tokens": 216522630.0, | |
| "step": 8800 | |
| }, | |
| { | |
| "entropy": 1.7790643846988679, | |
| "epoch": 0.41927231381466745, | |
| "grad_norm": 1.125771403312683, | |
| "learning_rate": 9.638502973381389e-05, | |
| "loss": 1.3779, | |
| "mean_token_accuracy": 0.7110064566135407, | |
| "num_tokens": 217765170.0, | |
| "step": 8850 | |
| }, | |
| { | |
| "entropy": 1.7827233350276948, | |
| "epoch": 0.42164108394921357, | |
| "grad_norm": 1.0116534233093262, | |
| "learning_rate": 9.63074611306987e-05, | |
| "loss": 1.3625, | |
| "mean_token_accuracy": 0.7156530952453614, | |
| "num_tokens": 218976869.0, | |
| "step": 8900 | |
| }, | |
| { | |
| "entropy": 1.75447958111763, | |
| "epoch": 0.4240098540837597, | |
| "grad_norm": 1.3180460929870605, | |
| "learning_rate": 9.622910101090686e-05, | |
| "loss": 1.3936, | |
| "mean_token_accuracy": 0.7107756125926972, | |
| "num_tokens": 220217849.0, | |
| "step": 8950 | |
| }, | |
| { | |
| "entropy": 1.748335200548172, | |
| "epoch": 0.42637862421830586, | |
| "grad_norm": 0.986765444278717, | |
| "learning_rate": 9.614995071381956e-05, | |
| "loss": 1.3734, | |
| "mean_token_accuracy": 0.7136638331413269, | |
| "num_tokens": 221451171.0, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.42637862421830586, | |
| "eval_entropy": 1.1782315272926747, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7542952842323493, | |
| "eval_num_tokens": 221451171.0, | |
| "eval_runtime": 744.3949, | |
| "eval_samples_per_second": 33.336, | |
| "eval_steps_per_second": 4.167, | |
| "step": 9000 | |
| }, | |
| { | |
| "entropy": 1.762521461248398, | |
| "epoch": 0.428747394352852, | |
| "grad_norm": 1.1056315898895264, | |
| "learning_rate": 9.607001159232418e-05, | |
| "loss": 1.3411, | |
| "mean_token_accuracy": 0.7177901411056519, | |
| "num_tokens": 222644153.0, | |
| "step": 9050 | |
| }, | |
| { | |
| "entropy": 1.7698546504974366, | |
| "epoch": 0.43111616448739815, | |
| "grad_norm": 1.0218158960342407, | |
| "learning_rate": 9.59892850127912e-05, | |
| "loss": 1.3568, | |
| "mean_token_accuracy": 0.7160427170991898, | |
| "num_tokens": 223885271.0, | |
| "step": 9100 | |
| }, | |
| { | |
| "entropy": 1.7873007321357728, | |
| "epoch": 0.43348493462194426, | |
| "grad_norm": 1.0137804746627808, | |
| "learning_rate": 9.590777235505085e-05, | |
| "loss": 1.3578, | |
| "mean_token_accuracy": 0.7130710703134536, | |
| "num_tokens": 225093029.0, | |
| "step": 9150 | |
| }, | |
| { | |
| "entropy": 1.7597880065441132, | |
| "epoch": 0.43585370475649043, | |
| "grad_norm": 1.0279192924499512, | |
| "learning_rate": 9.582547501236947e-05, | |
| "loss": 1.3552, | |
| "mean_token_accuracy": 0.7151528036594391, | |
| "num_tokens": 226339608.0, | |
| "step": 9200 | |
| }, | |
| { | |
| "entropy": 1.7488136601448059, | |
| "epoch": 0.43822247489103655, | |
| "grad_norm": 1.2627191543579102, | |
| "learning_rate": 9.574239439142576e-05, | |
| "loss": 1.3368, | |
| "mean_token_accuracy": 0.7172259968519211, | |
| "num_tokens": 227578157.0, | |
| "step": 9250 | |
| }, | |
| { | |
| "entropy": 1.8161335122585296, | |
| "epoch": 0.4405912450255827, | |
| "grad_norm": 1.4642895460128784, | |
| "learning_rate": 9.56585319122867e-05, | |
| "loss": 1.3891, | |
| "mean_token_accuracy": 0.7093940156698227, | |
| "num_tokens": 228810604.0, | |
| "step": 9300 | |
| }, | |
| { | |
| "entropy": 1.7846631932258605, | |
| "epoch": 0.44296001516012884, | |
| "grad_norm": 1.0811119079589844, | |
| "learning_rate": 9.557388900838334e-05, | |
| "loss": 1.3681, | |
| "mean_token_accuracy": 0.7125671052932739, | |
| "num_tokens": 230055004.0, | |
| "step": 9350 | |
| }, | |
| { | |
| "entropy": 1.748952749967575, | |
| "epoch": 0.445328785294675, | |
| "grad_norm": 1.0202217102050781, | |
| "learning_rate": 9.548846712648616e-05, | |
| "loss": 1.355, | |
| "mean_token_accuracy": 0.7164496505260467, | |
| "num_tokens": 231284769.0, | |
| "step": 9400 | |
| }, | |
| { | |
| "entropy": 1.754820455312729, | |
| "epoch": 0.44769755542922113, | |
| "grad_norm": 1.2328052520751953, | |
| "learning_rate": 9.540226772668053e-05, | |
| "loss": 1.3402, | |
| "mean_token_accuracy": 0.7169833314418793, | |
| "num_tokens": 232505637.0, | |
| "step": 9450 | |
| }, | |
| { | |
| "entropy": 1.7367425131797791, | |
| "epoch": 0.4500663255637673, | |
| "grad_norm": 1.0527913570404053, | |
| "learning_rate": 9.531529228234155e-05, | |
| "loss": 1.3576, | |
| "mean_token_accuracy": 0.7145136260986328, | |
| "num_tokens": 233725437.0, | |
| "step": 9500 | |
| }, | |
| { | |
| "entropy": 1.733099582195282, | |
| "epoch": 0.4524350956983134, | |
| "grad_norm": 0.8144567608833313, | |
| "learning_rate": 9.522754228010906e-05, | |
| "loss": 1.3282, | |
| "mean_token_accuracy": 0.720543931722641, | |
| "num_tokens": 234955358.0, | |
| "step": 9550 | |
| }, | |
| { | |
| "entropy": 1.733365514278412, | |
| "epoch": 0.4548038658328596, | |
| "grad_norm": 1.0677859783172607, | |
| "learning_rate": 9.513901921986206e-05, | |
| "loss": 1.3275, | |
| "mean_token_accuracy": 0.7202348792552948, | |
| "num_tokens": 236197729.0, | |
| "step": 9600 | |
| }, | |
| { | |
| "entropy": 1.7879818844795228, | |
| "epoch": 0.4571726359674057, | |
| "grad_norm": 1.0054843425750732, | |
| "learning_rate": 9.504972461469319e-05, | |
| "loss": 1.3617, | |
| "mean_token_accuracy": 0.7137482041120529, | |
| "num_tokens": 237418727.0, | |
| "step": 9650 | |
| }, | |
| { | |
| "entropy": 1.7609144997596742, | |
| "epoch": 0.4595414061019519, | |
| "grad_norm": 1.252611756324768, | |
| "learning_rate": 9.495965999088285e-05, | |
| "loss": 1.3773, | |
| "mean_token_accuracy": 0.7108764094114304, | |
| "num_tokens": 238640440.0, | |
| "step": 9700 | |
| }, | |
| { | |
| "entropy": 1.7785017716884612, | |
| "epoch": 0.461910176236498, | |
| "grad_norm": 1.1619056463241577, | |
| "learning_rate": 9.486882688787305e-05, | |
| "loss": 1.3769, | |
| "mean_token_accuracy": 0.7111158293485641, | |
| "num_tokens": 239845699.0, | |
| "step": 9750 | |
| }, | |
| { | |
| "entropy": 1.7622488391399385, | |
| "epoch": 0.46427894637104417, | |
| "grad_norm": 1.2110604047775269, | |
| "learning_rate": 9.477722685824114e-05, | |
| "loss": 1.3853, | |
| "mean_token_accuracy": 0.7111801999807358, | |
| "num_tokens": 241057039.0, | |
| "step": 9800 | |
| }, | |
| { | |
| "entropy": 1.771475486755371, | |
| "epoch": 0.4666477165055903, | |
| "grad_norm": 0.9056064486503601, | |
| "learning_rate": 9.46848614676733e-05, | |
| "loss": 1.3612, | |
| "mean_token_accuracy": 0.7140835148096084, | |
| "num_tokens": 242271603.0, | |
| "step": 9850 | |
| }, | |
| { | |
| "entropy": 1.7718469250202178, | |
| "epoch": 0.46901648664013645, | |
| "grad_norm": 1.2525917291641235, | |
| "learning_rate": 9.459173229493772e-05, | |
| "loss": 1.3937, | |
| "mean_token_accuracy": 0.7090546947717666, | |
| "num_tokens": 243506199.0, | |
| "step": 9900 | |
| }, | |
| { | |
| "entropy": 1.7770234513282777, | |
| "epoch": 0.47138525677468257, | |
| "grad_norm": 1.0945196151733398, | |
| "learning_rate": 9.449784093185765e-05, | |
| "loss": 1.3913, | |
| "mean_token_accuracy": 0.7097006791830063, | |
| "num_tokens": 244728720.0, | |
| "step": 9950 | |
| }, | |
| { | |
| "entropy": 1.7675727343559264, | |
| "epoch": 0.47375402690922874, | |
| "grad_norm": 0.9690260291099548, | |
| "learning_rate": 9.440318898328419e-05, | |
| "loss": 1.3915, | |
| "mean_token_accuracy": 0.7102323162555695, | |
| "num_tokens": 245938116.0, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.47375402690922874, | |
| "eval_entropy": 1.1980976138977295, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7557047149064385, | |
| "eval_num_tokens": 245938116.0, | |
| "eval_runtime": 744.8001, | |
| "eval_samples_per_second": 33.318, | |
| "eval_steps_per_second": 4.165, | |
| "step": 10000 | |
| }, | |
| { | |
| "entropy": 1.8029465341567994, | |
| "epoch": 0.47612279704377486, | |
| "grad_norm": 1.1126075983047485, | |
| "learning_rate": 9.430777806706885e-05, | |
| "loss": 1.425, | |
| "mean_token_accuracy": 0.7034233027696609, | |
| "num_tokens": 247144026.0, | |
| "step": 10050 | |
| }, | |
| { | |
| "entropy": 1.7569603097438813, | |
| "epoch": 0.47849156717832103, | |
| "grad_norm": 1.0091259479522705, | |
| "learning_rate": 9.421160981403587e-05, | |
| "loss": 1.3778, | |
| "mean_token_accuracy": 0.7116102015972138, | |
| "num_tokens": 248387083.0, | |
| "step": 10100 | |
| }, | |
| { | |
| "entropy": 1.730289832353592, | |
| "epoch": 0.48086033731286715, | |
| "grad_norm": 0.9621230959892273, | |
| "learning_rate": 9.411468586795443e-05, | |
| "loss": 1.3592, | |
| "mean_token_accuracy": 0.7129039680957794, | |
| "num_tokens": 249644502.0, | |
| "step": 10150 | |
| }, | |
| { | |
| "entropy": 1.7547185254096984, | |
| "epoch": 0.4832291074474133, | |
| "grad_norm": 1.0875402688980103, | |
| "learning_rate": 9.401700788551047e-05, | |
| "loss": 1.3664, | |
| "mean_token_accuracy": 0.7126635414361954, | |
| "num_tokens": 250876166.0, | |
| "step": 10200 | |
| }, | |
| { | |
| "entropy": 1.7428116750717164, | |
| "epoch": 0.48559787758195944, | |
| "grad_norm": 1.006138563156128, | |
| "learning_rate": 9.391857753627837e-05, | |
| "loss": 1.3673, | |
| "mean_token_accuracy": 0.7143008214235306, | |
| "num_tokens": 252091179.0, | |
| "step": 10250 | |
| }, | |
| { | |
| "entropy": 1.7458369052410125, | |
| "epoch": 0.4879666477165056, | |
| "grad_norm": 1.001531720161438, | |
| "learning_rate": 9.381939650269249e-05, | |
| "loss": 1.3674, | |
| "mean_token_accuracy": 0.7141269159317016, | |
| "num_tokens": 253307058.0, | |
| "step": 10300 | |
| }, | |
| { | |
| "entropy": 1.7744270980358123, | |
| "epoch": 0.4903354178510517, | |
| "grad_norm": 1.0080331563949585, | |
| "learning_rate": 9.371946648001835e-05, | |
| "loss": 1.383, | |
| "mean_token_accuracy": 0.7098779672384262, | |
| "num_tokens": 254550553.0, | |
| "step": 10350 | |
| }, | |
| { | |
| "entropy": 1.7707626497745514, | |
| "epoch": 0.4927041879855979, | |
| "grad_norm": 1.0779789686203003, | |
| "learning_rate": 9.361878917632365e-05, | |
| "loss": 1.3529, | |
| "mean_token_accuracy": 0.7156933480501175, | |
| "num_tokens": 255800272.0, | |
| "step": 10400 | |
| }, | |
| { | |
| "entropy": 1.7735213398933412, | |
| "epoch": 0.495072958120144, | |
| "grad_norm": 0.9861488342285156, | |
| "learning_rate": 9.351736631244914e-05, | |
| "loss": 1.352, | |
| "mean_token_accuracy": 0.7177729392051697, | |
| "num_tokens": 257029917.0, | |
| "step": 10450 | |
| }, | |
| { | |
| "entropy": 1.7585961294174195, | |
| "epoch": 0.4974417282546902, | |
| "grad_norm": 1.0564011335372925, | |
| "learning_rate": 9.341519962197912e-05, | |
| "loss": 1.3421, | |
| "mean_token_accuracy": 0.7166235017776489, | |
| "num_tokens": 258269464.0, | |
| "step": 10500 | |
| }, | |
| { | |
| "entropy": 1.745290095806122, | |
| "epoch": 0.4998104983892363, | |
| "grad_norm": 1.0212265253067017, | |
| "learning_rate": 9.331229085121185e-05, | |
| "loss": 1.3898, | |
| "mean_token_accuracy": 0.710235812664032, | |
| "num_tokens": 259531127.0, | |
| "step": 10550 | |
| }, | |
| { | |
| "entropy": 1.7529479134082795, | |
| "epoch": 0.5021792685237825, | |
| "grad_norm": 1.391863226890564, | |
| "learning_rate": 9.320864175912972e-05, | |
| "loss": 1.378, | |
| "mean_token_accuracy": 0.7103132110834122, | |
| "num_tokens": 260797490.0, | |
| "step": 10600 | |
| }, | |
| { | |
| "entropy": 1.75710902094841, | |
| "epoch": 0.5045480386583286, | |
| "grad_norm": 1.0978041887283325, | |
| "learning_rate": 9.310425411736916e-05, | |
| "loss": 1.3386, | |
| "mean_token_accuracy": 0.7186200088262558, | |
| "num_tokens": 262023377.0, | |
| "step": 10650 | |
| }, | |
| { | |
| "entropy": 1.7746818363666534, | |
| "epoch": 0.5069168087928747, | |
| "grad_norm": 1.0323866605758667, | |
| "learning_rate": 9.299912971019036e-05, | |
| "loss": 1.3641, | |
| "mean_token_accuracy": 0.7137188649177552, | |
| "num_tokens": 263266765.0, | |
| "step": 10700 | |
| }, | |
| { | |
| "entropy": 1.754684933423996, | |
| "epoch": 0.5092855789274209, | |
| "grad_norm": 0.9584967494010925, | |
| "learning_rate": 9.289327033444674e-05, | |
| "loss": 1.3668, | |
| "mean_token_accuracy": 0.7127582091093063, | |
| "num_tokens": 264493871.0, | |
| "step": 10750 | |
| }, | |
| { | |
| "entropy": 1.7670053398609162, | |
| "epoch": 0.511654349061967, | |
| "grad_norm": 1.0315459966659546, | |
| "learning_rate": 9.278667779955437e-05, | |
| "loss": 1.3966, | |
| "mean_token_accuracy": 0.7078602635860443, | |
| "num_tokens": 265716107.0, | |
| "step": 10800 | |
| }, | |
| { | |
| "entropy": 1.7422236442565917, | |
| "epoch": 0.5140231191965132, | |
| "grad_norm": 1.066741943359375, | |
| "learning_rate": 9.267935392746081e-05, | |
| "loss": 1.3224, | |
| "mean_token_accuracy": 0.7229005527496338, | |
| "num_tokens": 266969953.0, | |
| "step": 10850 | |
| }, | |
| { | |
| "entropy": 1.7432917177677154, | |
| "epoch": 0.5163918893310593, | |
| "grad_norm": 1.0382195711135864, | |
| "learning_rate": 9.25713005526142e-05, | |
| "loss": 1.3466, | |
| "mean_token_accuracy": 0.7158039021492004, | |
| "num_tokens": 268225977.0, | |
| "step": 10900 | |
| }, | |
| { | |
| "entropy": 1.7296686470508575, | |
| "epoch": 0.5187606594656055, | |
| "grad_norm": 1.1235915422439575, | |
| "learning_rate": 9.246251952193176e-05, | |
| "loss": 1.3222, | |
| "mean_token_accuracy": 0.7224133855104446, | |
| "num_tokens": 269466793.0, | |
| "step": 10950 | |
| }, | |
| { | |
| "entropy": 1.735136388540268, | |
| "epoch": 0.5211294296001516, | |
| "grad_norm": 0.990793764591217, | |
| "learning_rate": 9.235301269476832e-05, | |
| "loss": 1.3191, | |
| "mean_token_accuracy": 0.7210667967796326, | |
| "num_tokens": 270708159.0, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.5211294296001516, | |
| "eval_entropy": 1.181336676109844, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7557077165440849, | |
| "eval_num_tokens": 270708159.0, | |
| "eval_runtime": 747.4423, | |
| "eval_samples_per_second": 33.2, | |
| "eval_steps_per_second": 4.15, | |
| "step": 11000 | |
| }, | |
| { | |
| "entropy": 1.751279581785202, | |
| "epoch": 0.5234981997346978, | |
| "grad_norm": 1.3285608291625977, | |
| "learning_rate": 9.224278194288444e-05, | |
| "loss": 1.3385, | |
| "mean_token_accuracy": 0.7199172627925873, | |
| "num_tokens": 271927990.0, | |
| "step": 11050 | |
| }, | |
| { | |
| "entropy": 1.7399055349826813, | |
| "epoch": 0.5258669698692439, | |
| "grad_norm": 1.395338535308838, | |
| "learning_rate": 9.213182915041445e-05, | |
| "loss": 1.3314, | |
| "mean_token_accuracy": 0.7194273501634598, | |
| "num_tokens": 273153187.0, | |
| "step": 11100 | |
| }, | |
| { | |
| "entropy": 1.73090322971344, | |
| "epoch": 0.52823574000379, | |
| "grad_norm": 1.6059190034866333, | |
| "learning_rate": 9.202015621383431e-05, | |
| "loss": 1.3223, | |
| "mean_token_accuracy": 0.7193130904436111, | |
| "num_tokens": 274381622.0, | |
| "step": 11150 | |
| }, | |
| { | |
| "entropy": 1.7387698328495025, | |
| "epoch": 0.5306045101383362, | |
| "grad_norm": 0.9370666742324829, | |
| "learning_rate": 9.190776504192909e-05, | |
| "loss": 1.3606, | |
| "mean_token_accuracy": 0.7134118205308915, | |
| "num_tokens": 275611193.0, | |
| "step": 11200 | |
| }, | |
| { | |
| "entropy": 1.7551235890388488, | |
| "epoch": 0.5329732802728823, | |
| "grad_norm": 0.972176730632782, | |
| "learning_rate": 9.179465755576045e-05, | |
| "loss": 1.4027, | |
| "mean_token_accuracy": 0.7102609771490097, | |
| "num_tokens": 276860392.0, | |
| "step": 11250 | |
| }, | |
| { | |
| "entropy": 1.742105484008789, | |
| "epoch": 0.5353420504074284, | |
| "grad_norm": 1.3585799932479858, | |
| "learning_rate": 9.16808356886337e-05, | |
| "loss": 1.3869, | |
| "mean_token_accuracy": 0.7101844340562821, | |
| "num_tokens": 278102635.0, | |
| "step": 11300 | |
| }, | |
| { | |
| "entropy": 1.7393697941303252, | |
| "epoch": 0.5377108205419746, | |
| "grad_norm": 0.9401509165763855, | |
| "learning_rate": 9.156630138606484e-05, | |
| "loss": 1.3764, | |
| "mean_token_accuracy": 0.7136105120182037, | |
| "num_tokens": 279342491.0, | |
| "step": 11350 | |
| }, | |
| { | |
| "entropy": 1.7620924258232116, | |
| "epoch": 0.5400795906765208, | |
| "grad_norm": 1.03669273853302, | |
| "learning_rate": 9.145105660574725e-05, | |
| "loss": 1.3836, | |
| "mean_token_accuracy": 0.7112589359283448, | |
| "num_tokens": 280562523.0, | |
| "step": 11400 | |
| }, | |
| { | |
| "entropy": 1.7693402111530303, | |
| "epoch": 0.5424483608110668, | |
| "grad_norm": 1.0556858777999878, | |
| "learning_rate": 9.133510331751828e-05, | |
| "loss": 1.3543, | |
| "mean_token_accuracy": 0.7159368151426315, | |
| "num_tokens": 281804551.0, | |
| "step": 11450 | |
| }, | |
| { | |
| "entropy": 1.7435523355007172, | |
| "epoch": 0.544817130945613, | |
| "grad_norm": 1.36162531375885, | |
| "learning_rate": 9.121844350332549e-05, | |
| "loss": 1.3505, | |
| "mean_token_accuracy": 0.7172021287679672, | |
| "num_tokens": 283039847.0, | |
| "step": 11500 | |
| }, | |
| { | |
| "entropy": 1.7690045988559724, | |
| "epoch": 0.5471859010801592, | |
| "grad_norm": 1.1119062900543213, | |
| "learning_rate": 9.110107915719292e-05, | |
| "loss": 1.3536, | |
| "mean_token_accuracy": 0.7164638632535935, | |
| "num_tokens": 284295808.0, | |
| "step": 11550 | |
| }, | |
| { | |
| "entropy": 1.7988950431346893, | |
| "epoch": 0.5495546712147054, | |
| "grad_norm": 1.2980992794036865, | |
| "learning_rate": 9.098301228518683e-05, | |
| "loss": 1.387, | |
| "mean_token_accuracy": 0.7079293090105057, | |
| "num_tokens": 285481962.0, | |
| "step": 11600 | |
| }, | |
| { | |
| "entropy": 1.7422871506214141, | |
| "epoch": 0.5519234413492514, | |
| "grad_norm": 1.0130205154418945, | |
| "learning_rate": 9.086424490538157e-05, | |
| "loss": 1.3488, | |
| "mean_token_accuracy": 0.7166511958837509, | |
| "num_tokens": 286739692.0, | |
| "step": 11650 | |
| }, | |
| { | |
| "entropy": 1.7421106839179992, | |
| "epoch": 0.5542922114837976, | |
| "grad_norm": 1.0390921831130981, | |
| "learning_rate": 9.074477904782495e-05, | |
| "loss": 1.3213, | |
| "mean_token_accuracy": 0.7222142660617829, | |
| "num_tokens": 287953436.0, | |
| "step": 11700 | |
| }, | |
| { | |
| "entropy": 1.7164568746089934, | |
| "epoch": 0.5566609816183438, | |
| "grad_norm": 0.9376536011695862, | |
| "learning_rate": 9.062461675450366e-05, | |
| "loss": 1.3204, | |
| "mean_token_accuracy": 0.7219431722164154, | |
| "num_tokens": 289187059.0, | |
| "step": 11750 | |
| }, | |
| { | |
| "entropy": 1.7607939064502716, | |
| "epoch": 0.5590297517528899, | |
| "grad_norm": 1.1221693754196167, | |
| "learning_rate": 9.050376007930831e-05, | |
| "loss": 1.358, | |
| "mean_token_accuracy": 0.7148712009191514, | |
| "num_tokens": 290395472.0, | |
| "step": 11800 | |
| }, | |
| { | |
| "entropy": 1.7365293169021607, | |
| "epoch": 0.561398521887436, | |
| "grad_norm": 1.2102606296539307, | |
| "learning_rate": 9.038221108799832e-05, | |
| "loss": 1.3362, | |
| "mean_token_accuracy": 0.7193614053726196, | |
| "num_tokens": 291650032.0, | |
| "step": 11850 | |
| }, | |
| { | |
| "entropy": 1.7262990617752074, | |
| "epoch": 0.5637672920219822, | |
| "grad_norm": 1.1103631258010864, | |
| "learning_rate": 9.025997185816662e-05, | |
| "loss": 1.3304, | |
| "mean_token_accuracy": 0.7197805154323578, | |
| "num_tokens": 292891757.0, | |
| "step": 11900 | |
| }, | |
| { | |
| "entropy": 1.7870515859127045, | |
| "epoch": 0.5661360621565283, | |
| "grad_norm": 1.2359330654144287, | |
| "learning_rate": 9.013704447920407e-05, | |
| "loss": 1.3947, | |
| "mean_token_accuracy": 0.7112246352434158, | |
| "num_tokens": 294108078.0, | |
| "step": 11950 | |
| }, | |
| { | |
| "entropy": 1.7402713179588318, | |
| "epoch": 0.5685048322910745, | |
| "grad_norm": 1.2696958780288696, | |
| "learning_rate": 9.001343105226397e-05, | |
| "loss": 1.3456, | |
| "mean_token_accuracy": 0.7186821699142456, | |
| "num_tokens": 295347523.0, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.5685048322910745, | |
| "eval_entropy": 1.1782386238578055, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.755168300715283, | |
| "eval_num_tokens": 295347523.0, | |
| "eval_runtime": 747.9571, | |
| "eval_samples_per_second": 33.177, | |
| "eval_steps_per_second": 4.147, | |
| "step": 12000 | |
| }, | |
| { | |
| "entropy": 1.7823381924629211, | |
| "epoch": 0.5708736024256206, | |
| "grad_norm": 1.090854287147522, | |
| "learning_rate": 8.988913369022585e-05, | |
| "loss": 1.3752, | |
| "mean_token_accuracy": 0.7125837200880051, | |
| "num_tokens": 296596547.0, | |
| "step": 12050 | |
| }, | |
| { | |
| "entropy": 1.7829466736316681, | |
| "epoch": 0.5732423725601667, | |
| "grad_norm": 0.9252607226371765, | |
| "learning_rate": 8.976415451765952e-05, | |
| "loss": 1.3646, | |
| "mean_token_accuracy": 0.7142701143026352, | |
| "num_tokens": 297794903.0, | |
| "step": 12100 | |
| }, | |
| { | |
| "entropy": 1.7680902397632599, | |
| "epoch": 0.5756111426947129, | |
| "grad_norm": 1.0627940893173218, | |
| "learning_rate": 8.96384956707888e-05, | |
| "loss": 1.3628, | |
| "mean_token_accuracy": 0.7150187093019486, | |
| "num_tokens": 299029935.0, | |
| "step": 12150 | |
| }, | |
| { | |
| "entropy": 1.7492178344726563, | |
| "epoch": 0.5779799128292591, | |
| "grad_norm": 1.2822635173797607, | |
| "learning_rate": 8.951215929745486e-05, | |
| "loss": 1.3594, | |
| "mean_token_accuracy": 0.71372334420681, | |
| "num_tokens": 300256525.0, | |
| "step": 12200 | |
| }, | |
| { | |
| "entropy": 1.7705177330970765, | |
| "epoch": 0.5803486829638052, | |
| "grad_norm": 1.1303389072418213, | |
| "learning_rate": 8.93851475570796e-05, | |
| "loss": 1.3498, | |
| "mean_token_accuracy": 0.7154391181468963, | |
| "num_tokens": 301466189.0, | |
| "step": 12250 | |
| }, | |
| { | |
| "entropy": 1.7693954205513, | |
| "epoch": 0.5827174530983513, | |
| "grad_norm": 1.0360733270645142, | |
| "learning_rate": 8.925746262062879e-05, | |
| "loss": 1.3523, | |
| "mean_token_accuracy": 0.71549709379673, | |
| "num_tokens": 302677547.0, | |
| "step": 12300 | |
| }, | |
| { | |
| "entropy": 1.7499237847328186, | |
| "epoch": 0.5850862232328975, | |
| "grad_norm": 1.2163889408111572, | |
| "learning_rate": 8.912910667057482e-05, | |
| "loss": 1.3219, | |
| "mean_token_accuracy": 0.7233135092258454, | |
| "num_tokens": 303884552.0, | |
| "step": 12350 | |
| }, | |
| { | |
| "entropy": 1.7528709161281586, | |
| "epoch": 0.5874549933674437, | |
| "grad_norm": 1.0694142580032349, | |
| "learning_rate": 8.900008190085946e-05, | |
| "loss": 1.3695, | |
| "mean_token_accuracy": 0.7140274894237518, | |
| "num_tokens": 305112064.0, | |
| "step": 12400 | |
| }, | |
| { | |
| "entropy": 1.7593362927436829, | |
| "epoch": 0.5898237635019897, | |
| "grad_norm": 0.9559013247489929, | |
| "learning_rate": 8.887039051685646e-05, | |
| "loss": 1.3538, | |
| "mean_token_accuracy": 0.7164691358804702, | |
| "num_tokens": 306349750.0, | |
| "step": 12450 | |
| }, | |
| { | |
| "entropy": 1.7250176286697387, | |
| "epoch": 0.5921925336365359, | |
| "grad_norm": 1.0856672525405884, | |
| "learning_rate": 8.874003473533372e-05, | |
| "loss": 1.3617, | |
| "mean_token_accuracy": 0.7142321610450745, | |
| "num_tokens": 307589875.0, | |
| "step": 12500 | |
| }, | |
| { | |
| "entropy": 1.7204779553413392, | |
| "epoch": 0.5945613037710821, | |
| "grad_norm": 1.0638339519500732, | |
| "learning_rate": 8.860901678441542e-05, | |
| "loss": 1.3523, | |
| "mean_token_accuracy": 0.7161801540851593, | |
| "num_tokens": 308844739.0, | |
| "step": 12550 | |
| }, | |
| { | |
| "entropy": 1.7241905891895295, | |
| "epoch": 0.5969300739056282, | |
| "grad_norm": 1.0859177112579346, | |
| "learning_rate": 8.847733890354397e-05, | |
| "loss": 1.3558, | |
| "mean_token_accuracy": 0.714522579908371, | |
| "num_tokens": 310070098.0, | |
| "step": 12600 | |
| }, | |
| { | |
| "entropy": 1.735662100315094, | |
| "epoch": 0.5992988440401743, | |
| "grad_norm": 1.1100165843963623, | |
| "learning_rate": 8.834500334344178e-05, | |
| "loss": 1.363, | |
| "mean_token_accuracy": 0.7140331470966339, | |
| "num_tokens": 311292251.0, | |
| "step": 12650 | |
| }, | |
| { | |
| "entropy": 1.7426686155796052, | |
| "epoch": 0.6016676141747205, | |
| "grad_norm": 1.109788179397583, | |
| "learning_rate": 8.821201236607266e-05, | |
| "loss": 1.3491, | |
| "mean_token_accuracy": 0.7144311499595642, | |
| "num_tokens": 312573175.0, | |
| "step": 12700 | |
| }, | |
| { | |
| "entropy": 1.7310996508598329, | |
| "epoch": 0.6040363843092666, | |
| "grad_norm": 1.4260696172714233, | |
| "learning_rate": 8.807836824460329e-05, | |
| "loss": 1.3352, | |
| "mean_token_accuracy": 0.7185973340272903, | |
| "num_tokens": 313821355.0, | |
| "step": 12750 | |
| }, | |
| { | |
| "entropy": 1.7413757181167602, | |
| "epoch": 0.6064051544438128, | |
| "grad_norm": 0.9746555685997009, | |
| "learning_rate": 8.794407326336427e-05, | |
| "loss": 1.3168, | |
| "mean_token_accuracy": 0.7220592141151428, | |
| "num_tokens": 315041303.0, | |
| "step": 12800 | |
| }, | |
| { | |
| "entropy": 1.7303865098953246, | |
| "epoch": 0.6087739245783589, | |
| "grad_norm": 0.892135739326477, | |
| "learning_rate": 8.780912971781112e-05, | |
| "loss": 1.3201, | |
| "mean_token_accuracy": 0.7211132681369782, | |
| "num_tokens": 316288409.0, | |
| "step": 12850 | |
| }, | |
| { | |
| "entropy": 1.7497126710414888, | |
| "epoch": 0.611142694712905, | |
| "grad_norm": 1.199959397315979, | |
| "learning_rate": 8.767353991448503e-05, | |
| "loss": 1.3052, | |
| "mean_token_accuracy": 0.7245729100704194, | |
| "num_tokens": 317526338.0, | |
| "step": 12900 | |
| }, | |
| { | |
| "entropy": 1.7545914590358733, | |
| "epoch": 0.6135114648474512, | |
| "grad_norm": 0.9794778227806091, | |
| "learning_rate": 8.753730617097342e-05, | |
| "loss": 1.3417, | |
| "mean_token_accuracy": 0.7178518337011337, | |
| "num_tokens": 318776423.0, | |
| "step": 12950 | |
| }, | |
| { | |
| "entropy": 1.761199436187744, | |
| "epoch": 0.6158802349819974, | |
| "grad_norm": 1.115660548210144, | |
| "learning_rate": 8.740043081587043e-05, | |
| "loss": 1.3428, | |
| "mean_token_accuracy": 0.71872696518898, | |
| "num_tokens": 319970665.0, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.6158802349819974, | |
| "eval_entropy": 1.173806337542414, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.755712005311562, | |
| "eval_num_tokens": 319970665.0, | |
| "eval_runtime": 729.7637, | |
| "eval_samples_per_second": 34.004, | |
| "eval_steps_per_second": 4.251, | |
| "step": 13000 | |
| }, | |
| { | |
| "entropy": 1.7386520493030548, | |
| "epoch": 0.6182490051165435, | |
| "grad_norm": 1.0832492113113403, | |
| "learning_rate": 8.726291618873692e-05, | |
| "loss": 1.3185, | |
| "mean_token_accuracy": 0.7225498640537262, | |
| "num_tokens": 321195496.0, | |
| "step": 13050 | |
| }, | |
| { | |
| "entropy": 1.788149139881134, | |
| "epoch": 0.6206177752510896, | |
| "grad_norm": 1.0728507041931152, | |
| "learning_rate": 8.712476464006069e-05, | |
| "loss": 1.3687, | |
| "mean_token_accuracy": 0.7138838738203048, | |
| "num_tokens": 322394051.0, | |
| "step": 13100 | |
| }, | |
| { | |
| "entropy": 1.7250337314605713, | |
| "epoch": 0.6229865453856358, | |
| "grad_norm": 0.9454106688499451, | |
| "learning_rate": 8.698597853121613e-05, | |
| "loss": 1.3206, | |
| "mean_token_accuracy": 0.7232500827312469, | |
| "num_tokens": 323646049.0, | |
| "step": 13150 | |
| }, | |
| { | |
| "entropy": 1.7228785872459411, | |
| "epoch": 0.625355315520182, | |
| "grad_norm": 1.074063777923584, | |
| "learning_rate": 8.684656023442404e-05, | |
| "loss": 1.3416, | |
| "mean_token_accuracy": 0.7188435053825378, | |
| "num_tokens": 324901290.0, | |
| "step": 13200 | |
| }, | |
| { | |
| "entropy": 1.7498575222492219, | |
| "epoch": 0.627724085654728, | |
| "grad_norm": 1.3152785301208496, | |
| "learning_rate": 8.670651213271087e-05, | |
| "loss": 1.3495, | |
| "mean_token_accuracy": 0.7163092708587646, | |
| "num_tokens": 326143599.0, | |
| "step": 13250 | |
| }, | |
| { | |
| "entropy": 1.773080164194107, | |
| "epoch": 0.6300928557892742, | |
| "grad_norm": 1.117574691772461, | |
| "learning_rate": 8.656583661986815e-05, | |
| "loss": 1.3716, | |
| "mean_token_accuracy": 0.7143875294923783, | |
| "num_tokens": 327369948.0, | |
| "step": 13300 | |
| }, | |
| { | |
| "entropy": 1.756659119129181, | |
| "epoch": 0.6324616259238204, | |
| "grad_norm": 1.0091075897216797, | |
| "learning_rate": 8.642453610041152e-05, | |
| "loss": 1.3815, | |
| "mean_token_accuracy": 0.7113278949260712, | |
| "num_tokens": 328609411.0, | |
| "step": 13350 | |
| }, | |
| { | |
| "entropy": 1.7753915119171142, | |
| "epoch": 0.6348303960583664, | |
| "grad_norm": 0.9333689212799072, | |
| "learning_rate": 8.628261298953963e-05, | |
| "loss": 1.3478, | |
| "mean_token_accuracy": 0.7168344795703888, | |
| "num_tokens": 329812629.0, | |
| "step": 13400 | |
| }, | |
| { | |
| "entropy": 1.7191705119609832, | |
| "epoch": 0.6371991661929126, | |
| "grad_norm": 0.9254161715507507, | |
| "learning_rate": 8.614006971309287e-05, | |
| "loss": 1.32, | |
| "mean_token_accuracy": 0.7235176879167556, | |
| "num_tokens": 331045306.0, | |
| "step": 13450 | |
| }, | |
| { | |
| "entropy": 1.7539090728759765, | |
| "epoch": 0.6395679363274588, | |
| "grad_norm": 1.135908842086792, | |
| "learning_rate": 8.599690870751189e-05, | |
| "loss": 1.2991, | |
| "mean_token_accuracy": 0.7238886666297912, | |
| "num_tokens": 332265198.0, | |
| "step": 13500 | |
| }, | |
| { | |
| "entropy": 1.7759091782569885, | |
| "epoch": 0.641936706462005, | |
| "grad_norm": 0.9939352869987488, | |
| "learning_rate": 8.585313241979593e-05, | |
| "loss": 1.3446, | |
| "mean_token_accuracy": 0.7167073094844818, | |
| "num_tokens": 333478621.0, | |
| "step": 13550 | |
| }, | |
| { | |
| "entropy": 1.8198255062103272, | |
| "epoch": 0.644305476596551, | |
| "grad_norm": 1.1110658645629883, | |
| "learning_rate": 8.570874330746109e-05, | |
| "loss": 1.3429, | |
| "mean_token_accuracy": 0.7163071328401566, | |
| "num_tokens": 334679776.0, | |
| "step": 13600 | |
| }, | |
| { | |
| "entropy": 1.7491872441768646, | |
| "epoch": 0.6466742467310972, | |
| "grad_norm": 1.102397084236145, | |
| "learning_rate": 8.556374383849815e-05, | |
| "loss": 1.3429, | |
| "mean_token_accuracy": 0.7170016378164291, | |
| "num_tokens": 335924366.0, | |
| "step": 13650 | |
| }, | |
| { | |
| "entropy": 1.7320797193050383, | |
| "epoch": 0.6490430168656434, | |
| "grad_norm": 0.9770281910896301, | |
| "learning_rate": 8.541813649133064e-05, | |
| "loss": 1.3012, | |
| "mean_token_accuracy": 0.7252740359306336, | |
| "num_tokens": 337177387.0, | |
| "step": 13700 | |
| }, | |
| { | |
| "entropy": 1.7851051843166352, | |
| "epoch": 0.6514117870001895, | |
| "grad_norm": 1.2061119079589844, | |
| "learning_rate": 8.52719237547722e-05, | |
| "loss": 1.3423, | |
| "mean_token_accuracy": 0.7174914568662644, | |
| "num_tokens": 338406873.0, | |
| "step": 13750 | |
| }, | |
| { | |
| "entropy": 1.7197813856601716, | |
| "epoch": 0.6537805571347356, | |
| "grad_norm": 1.0583444833755493, | |
| "learning_rate": 8.512510812798426e-05, | |
| "loss": 1.3451, | |
| "mean_token_accuracy": 0.7177790975570679, | |
| "num_tokens": 339627417.0, | |
| "step": 13800 | |
| }, | |
| { | |
| "entropy": 1.7355473148822784, | |
| "epoch": 0.6561493272692818, | |
| "grad_norm": 1.1621958017349243, | |
| "learning_rate": 8.49776921204332e-05, | |
| "loss": 1.3587, | |
| "mean_token_accuracy": 0.7146015846729279, | |
| "num_tokens": 340857014.0, | |
| "step": 13850 | |
| }, | |
| { | |
| "entropy": 1.7453387939929963, | |
| "epoch": 0.6585180974038279, | |
| "grad_norm": 1.0361634492874146, | |
| "learning_rate": 8.48296782518475e-05, | |
| "loss": 1.3769, | |
| "mean_token_accuracy": 0.7130674320459366, | |
| "num_tokens": 342093808.0, | |
| "step": 13900 | |
| }, | |
| { | |
| "entropy": 1.7622779953479766, | |
| "epoch": 0.6608868675383741, | |
| "grad_norm": 1.2546138763427734, | |
| "learning_rate": 8.468106905217465e-05, | |
| "loss": 1.348, | |
| "mean_token_accuracy": 0.7160887461900711, | |
| "num_tokens": 343326476.0, | |
| "step": 13950 | |
| }, | |
| { | |
| "entropy": 1.7349292349815368, | |
| "epoch": 0.6632556376729202, | |
| "grad_norm": 0.9984197020530701, | |
| "learning_rate": 8.453186706153789e-05, | |
| "loss": 1.301, | |
| "mean_token_accuracy": 0.7255065280199051, | |
| "num_tokens": 344557978.0, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.6632556376729202, | |
| "eval_entropy": 1.1754964998965876, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7568369234841843, | |
| "eval_num_tokens": 344557978.0, | |
| "eval_runtime": 728.4569, | |
| "eval_samples_per_second": 34.065, | |
| "eval_steps_per_second": 4.258, | |
| "step": 14000 | |
| }, | |
| { | |
| "entropy": 1.786487684249878, | |
| "epoch": 0.6656244078074663, | |
| "grad_norm": 1.1214771270751953, | |
| "learning_rate": 8.438207483019291e-05, | |
| "loss": 1.3981, | |
| "mean_token_accuracy": 0.7088551700115204, | |
| "num_tokens": 345789604.0, | |
| "step": 14050 | |
| }, | |
| { | |
| "entropy": 1.7951791512966155, | |
| "epoch": 0.6679931779420125, | |
| "grad_norm": 1.0142500400543213, | |
| "learning_rate": 8.42316949184841e-05, | |
| "loss": 1.3948, | |
| "mean_token_accuracy": 0.7093900120258332, | |
| "num_tokens": 347013721.0, | |
| "step": 14100 | |
| }, | |
| { | |
| "entropy": 1.7639971029758454, | |
| "epoch": 0.6703619480765587, | |
| "grad_norm": 1.0026280879974365, | |
| "learning_rate": 8.408072989680087e-05, | |
| "loss": 1.3031, | |
| "mean_token_accuracy": 0.7246174013614655, | |
| "num_tokens": 348211806.0, | |
| "step": 14150 | |
| }, | |
| { | |
| "entropy": 1.7443763566017152, | |
| "epoch": 0.6727307182111048, | |
| "grad_norm": 1.0735307931900024, | |
| "learning_rate": 8.39291823455337e-05, | |
| "loss": 1.3052, | |
| "mean_token_accuracy": 0.7236789721250534, | |
| "num_tokens": 349491496.0, | |
| "step": 14200 | |
| }, | |
| { | |
| "entropy": 1.7840288174152374, | |
| "epoch": 0.6750994883456509, | |
| "grad_norm": 1.1240233182907104, | |
| "learning_rate": 8.377705485503007e-05, | |
| "loss": 1.3545, | |
| "mean_token_accuracy": 0.7152435338497162, | |
| "num_tokens": 350709829.0, | |
| "step": 14250 | |
| }, | |
| { | |
| "entropy": 1.7674752044677735, | |
| "epoch": 0.6774682584801971, | |
| "grad_norm": 0.9507238864898682, | |
| "learning_rate": 8.36243500255501e-05, | |
| "loss": 1.3193, | |
| "mean_token_accuracy": 0.7238988935947418, | |
| "num_tokens": 351910781.0, | |
| "step": 14300 | |
| }, | |
| { | |
| "entropy": 1.7877480947971345, | |
| "epoch": 0.6798370286147433, | |
| "grad_norm": 0.9499313831329346, | |
| "learning_rate": 8.34710704672222e-05, | |
| "loss": 1.3392, | |
| "mean_token_accuracy": 0.7172271001338959, | |
| "num_tokens": 353134751.0, | |
| "step": 14350 | |
| }, | |
| { | |
| "entropy": 1.7766230964660645, | |
| "epoch": 0.6822057987492893, | |
| "grad_norm": 1.2760356664657593, | |
| "learning_rate": 8.331721879999841e-05, | |
| "loss": 1.3595, | |
| "mean_token_accuracy": 0.7147215807437897, | |
| "num_tokens": 354350033.0, | |
| "step": 14400 | |
| }, | |
| { | |
| "entropy": 1.7614100205898284, | |
| "epoch": 0.6845745688838355, | |
| "grad_norm": 1.043785572052002, | |
| "learning_rate": 8.316279765360957e-05, | |
| "loss": 1.3879, | |
| "mean_token_accuracy": 0.7108203011751175, | |
| "num_tokens": 355573758.0, | |
| "step": 14450 | |
| }, | |
| { | |
| "entropy": 1.7683514368534088, | |
| "epoch": 0.6869433390183817, | |
| "grad_norm": 1.1136603355407715, | |
| "learning_rate": 8.300780966752049e-05, | |
| "loss": 1.3451, | |
| "mean_token_accuracy": 0.7161549615859986, | |
| "num_tokens": 356822721.0, | |
| "step": 14500 | |
| }, | |
| { | |
| "entropy": 1.7504412484169007, | |
| "epoch": 0.6893121091529278, | |
| "grad_norm": 1.132605791091919, | |
| "learning_rate": 8.28522574908847e-05, | |
| "loss": 1.3433, | |
| "mean_token_accuracy": 0.7193030816316605, | |
| "num_tokens": 358090609.0, | |
| "step": 14550 | |
| }, | |
| { | |
| "entropy": 1.7415921115875244, | |
| "epoch": 0.6916808792874739, | |
| "grad_norm": 0.901418924331665, | |
| "learning_rate": 8.269614378249932e-05, | |
| "loss": 1.3098, | |
| "mean_token_accuracy": 0.7223669987916946, | |
| "num_tokens": 359334849.0, | |
| "step": 14600 | |
| }, | |
| { | |
| "entropy": 1.72568878531456, | |
| "epoch": 0.6940496494220201, | |
| "grad_norm": 1.2013583183288574, | |
| "learning_rate": 8.253947121075942e-05, | |
| "loss": 1.3413, | |
| "mean_token_accuracy": 0.7166631370782852, | |
| "num_tokens": 360565890.0, | |
| "step": 14650 | |
| }, | |
| { | |
| "entropy": 1.7351550233364106, | |
| "epoch": 0.6964184195565662, | |
| "grad_norm": 0.9248843193054199, | |
| "learning_rate": 8.238224245361262e-05, | |
| "loss": 1.3269, | |
| "mean_token_accuracy": 0.7205180561542511, | |
| "num_tokens": 361780402.0, | |
| "step": 14700 | |
| }, | |
| { | |
| "entropy": 1.7353229641914367, | |
| "epoch": 0.6987871896911124, | |
| "grad_norm": 0.9147818088531494, | |
| "learning_rate": 8.222446019851314e-05, | |
| "loss": 1.3239, | |
| "mean_token_accuracy": 0.7209709006547927, | |
| "num_tokens": 362998310.0, | |
| "step": 14750 | |
| }, | |
| { | |
| "entropy": 1.7628222048282622, | |
| "epoch": 0.7011559598256585, | |
| "grad_norm": 1.0660256147384644, | |
| "learning_rate": 8.206612714237601e-05, | |
| "loss": 1.3736, | |
| "mean_token_accuracy": 0.7127251303195954, | |
| "num_tokens": 364192705.0, | |
| "step": 14800 | |
| }, | |
| { | |
| "entropy": 1.7622958242893219, | |
| "epoch": 0.7035247299602047, | |
| "grad_norm": 1.133527398109436, | |
| "learning_rate": 8.190724599153083e-05, | |
| "loss": 1.3252, | |
| "mean_token_accuracy": 0.7197421258687973, | |
| "num_tokens": 365419544.0, | |
| "step": 14850 | |
| }, | |
| { | |
| "entropy": 1.7733166551589965, | |
| "epoch": 0.7058935000947508, | |
| "grad_norm": 1.0449475049972534, | |
| "learning_rate": 8.174781946167563e-05, | |
| "loss": 1.3422, | |
| "mean_token_accuracy": 0.7184215635061264, | |
| "num_tokens": 366668472.0, | |
| "step": 14900 | |
| }, | |
| { | |
| "entropy": 1.7824318826198577, | |
| "epoch": 0.708262270229297, | |
| "grad_norm": 0.9425482749938965, | |
| "learning_rate": 8.158785027783038e-05, | |
| "loss": 1.351, | |
| "mean_token_accuracy": 0.7144128715991974, | |
| "num_tokens": 367883921.0, | |
| "step": 14950 | |
| }, | |
| { | |
| "entropy": 1.7383503484725953, | |
| "epoch": 0.7106310403638431, | |
| "grad_norm": 1.0266870260238647, | |
| "learning_rate": 8.14273411742905e-05, | |
| "loss": 1.3003, | |
| "mean_token_accuracy": 0.7255125510692596, | |
| "num_tokens": 369125857.0, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.7106310403638431, | |
| "eval_entropy": 1.1784183711370755, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7566592082102632, | |
| "eval_num_tokens": 369125857.0, | |
| "eval_runtime": 728.4679, | |
| "eval_samples_per_second": 34.065, | |
| "eval_steps_per_second": 4.258, | |
| "step": 15000 | |
| }, | |
| { | |
| "entropy": 1.7569481348991394, | |
| "epoch": 0.7129998104983892, | |
| "grad_norm": 1.0710499286651611, | |
| "learning_rate": 8.126629489457998e-05, | |
| "loss": 1.3493, | |
| "mean_token_accuracy": 0.7171416920423508, | |
| "num_tokens": 370360655.0, | |
| "step": 15050 | |
| }, | |
| { | |
| "entropy": 1.7639206099510192, | |
| "epoch": 0.7153685806329354, | |
| "grad_norm": 1.023205041885376, | |
| "learning_rate": 8.110471419140461e-05, | |
| "loss": 1.3816, | |
| "mean_token_accuracy": 0.7107687264680862, | |
| "num_tokens": 371611998.0, | |
| "step": 15100 | |
| }, | |
| { | |
| "entropy": 1.7124925446510315, | |
| "epoch": 0.7177373507674816, | |
| "grad_norm": 1.4796391725540161, | |
| "learning_rate": 8.094260182660491e-05, | |
| "loss": 1.3103, | |
| "mean_token_accuracy": 0.7245303303003311, | |
| "num_tokens": 372852886.0, | |
| "step": 15150 | |
| }, | |
| { | |
| "entropy": 1.7330104005336762, | |
| "epoch": 0.7201061209020276, | |
| "grad_norm": 1.1054223775863647, | |
| "learning_rate": 8.077996057110881e-05, | |
| "loss": 1.3446, | |
| "mean_token_accuracy": 0.7186214071512222, | |
| "num_tokens": 374060791.0, | |
| "step": 15200 | |
| }, | |
| { | |
| "entropy": 1.781588876247406, | |
| "epoch": 0.7224748910365738, | |
| "grad_norm": 1.2375303506851196, | |
| "learning_rate": 8.06167932048845e-05, | |
| "loss": 1.3815, | |
| "mean_token_accuracy": 0.7106660062074661, | |
| "num_tokens": 375260162.0, | |
| "step": 15250 | |
| }, | |
| { | |
| "entropy": 1.7753887116909026, | |
| "epoch": 0.72484366117112, | |
| "grad_norm": 1.0260518789291382, | |
| "learning_rate": 8.045310251689269e-05, | |
| "loss": 1.3782, | |
| "mean_token_accuracy": 0.7120629328489304, | |
| "num_tokens": 376480540.0, | |
| "step": 15300 | |
| }, | |
| { | |
| "entropy": 1.7605856931209565, | |
| "epoch": 0.7272124313056662, | |
| "grad_norm": 1.0135972499847412, | |
| "learning_rate": 8.028889130503908e-05, | |
| "loss": 1.3664, | |
| "mean_token_accuracy": 0.714390983581543, | |
| "num_tokens": 377707870.0, | |
| "step": 15350 | |
| }, | |
| { | |
| "entropy": 1.7473648416996002, | |
| "epoch": 0.7295812014402122, | |
| "grad_norm": 1.5319159030914307, | |
| "learning_rate": 8.012416237612651e-05, | |
| "loss": 1.3251, | |
| "mean_token_accuracy": 0.7199866360425949, | |
| "num_tokens": 378945180.0, | |
| "step": 15400 | |
| }, | |
| { | |
| "entropy": 1.7613112390041352, | |
| "epoch": 0.7319499715747584, | |
| "grad_norm": 1.1516921520233154, | |
| "learning_rate": 7.995891854580694e-05, | |
| "loss": 1.3398, | |
| "mean_token_accuracy": 0.7185401087999344, | |
| "num_tokens": 380202318.0, | |
| "step": 15450 | |
| }, | |
| { | |
| "entropy": 1.7631869399547577, | |
| "epoch": 0.7343187417093046, | |
| "grad_norm": 1.2842717170715332, | |
| "learning_rate": 7.979316263853338e-05, | |
| "loss": 1.3246, | |
| "mean_token_accuracy": 0.7208184325695037, | |
| "num_tokens": 381422244.0, | |
| "step": 15500 | |
| }, | |
| { | |
| "entropy": 1.7426608395576477, | |
| "epoch": 0.7366875118438506, | |
| "grad_norm": 1.2845314741134644, | |
| "learning_rate": 7.962689748751158e-05, | |
| "loss": 1.3073, | |
| "mean_token_accuracy": 0.7258092379570007, | |
| "num_tokens": 382656317.0, | |
| "step": 15550 | |
| }, | |
| { | |
| "entropy": 1.7413418543338777, | |
| "epoch": 0.7390562819783968, | |
| "grad_norm": 1.1051653623580933, | |
| "learning_rate": 7.94601259346516e-05, | |
| "loss": 1.3248, | |
| "mean_token_accuracy": 0.7227061313390731, | |
| "num_tokens": 383887770.0, | |
| "step": 15600 | |
| }, | |
| { | |
| "entropy": 1.7754042732715607, | |
| "epoch": 0.741425052112943, | |
| "grad_norm": 1.0099495649337769, | |
| "learning_rate": 7.929285083051921e-05, | |
| "loss": 1.3818, | |
| "mean_token_accuracy": 0.713128559589386, | |
| "num_tokens": 385130213.0, | |
| "step": 15650 | |
| }, | |
| { | |
| "entropy": 1.7583998191356658, | |
| "epoch": 0.7437938222474891, | |
| "grad_norm": 1.0357869863510132, | |
| "learning_rate": 7.912507503428728e-05, | |
| "loss": 1.3513, | |
| "mean_token_accuracy": 0.716811910867691, | |
| "num_tokens": 386352005.0, | |
| "step": 15700 | |
| }, | |
| { | |
| "entropy": 1.7743099415302277, | |
| "epoch": 0.7461625923820352, | |
| "grad_norm": 1.10836660861969, | |
| "learning_rate": 7.895680141368678e-05, | |
| "loss": 1.3314, | |
| "mean_token_accuracy": 0.7205884575843811, | |
| "num_tokens": 387565047.0, | |
| "step": 15750 | |
| }, | |
| { | |
| "entropy": 1.7909239864349364, | |
| "epoch": 0.7485313625165814, | |
| "grad_norm": 1.0026726722717285, | |
| "learning_rate": 7.87880328449578e-05, | |
| "loss": 1.3547, | |
| "mean_token_accuracy": 0.7177285236120224, | |
| "num_tokens": 388787505.0, | |
| "step": 15800 | |
| }, | |
| { | |
| "entropy": 1.7829215788841248, | |
| "epoch": 0.7509001326511275, | |
| "grad_norm": 1.3079992532730103, | |
| "learning_rate": 7.86187722128004e-05, | |
| "loss": 1.329, | |
| "mean_token_accuracy": 0.720749350786209, | |
| "num_tokens": 390046573.0, | |
| "step": 15850 | |
| }, | |
| { | |
| "entropy": 1.7563760423660277, | |
| "epoch": 0.7532689027856737, | |
| "grad_norm": 1.1663581132888794, | |
| "learning_rate": 7.844902241032535e-05, | |
| "loss": 1.3364, | |
| "mean_token_accuracy": 0.7199984455108642, | |
| "num_tokens": 391284239.0, | |
| "step": 15900 | |
| }, | |
| { | |
| "entropy": 1.7583712506294251, | |
| "epoch": 0.7556376729202198, | |
| "grad_norm": 1.0669708251953125, | |
| "learning_rate": 7.827878633900461e-05, | |
| "loss": 1.3233, | |
| "mean_token_accuracy": 0.7232204431295395, | |
| "num_tokens": 392511286.0, | |
| "step": 15950 | |
| }, | |
| { | |
| "entropy": 1.7710711109638213, | |
| "epoch": 0.7580064430547659, | |
| "grad_norm": 1.1993380784988403, | |
| "learning_rate": 7.81080669086217e-05, | |
| "loss": 1.3633, | |
| "mean_token_accuracy": 0.7153352189064026, | |
| "num_tokens": 393762386.0, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.7580064430547659, | |
| "eval_entropy": 1.205138478718751, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7560521679240454, | |
| "eval_num_tokens": 393762386.0, | |
| "eval_runtime": 728.478, | |
| "eval_samples_per_second": 34.064, | |
| "eval_steps_per_second": 4.258, | |
| "step": 16000 | |
| }, | |
| { | |
| "entropy": 1.7863057303428649, | |
| "epoch": 0.7603752131893121, | |
| "grad_norm": 0.9311954975128174, | |
| "learning_rate": 7.793686703722212e-05, | |
| "loss": 1.3477, | |
| "mean_token_accuracy": 0.7189880239963532, | |
| "num_tokens": 394971495.0, | |
| "step": 16050 | |
| }, | |
| { | |
| "entropy": 1.747820656299591, | |
| "epoch": 0.7627439833238583, | |
| "grad_norm": 1.0288971662521362, | |
| "learning_rate": 7.776518965106327e-05, | |
| "loss": 1.3034, | |
| "mean_token_accuracy": 0.7258507144451142, | |
| "num_tokens": 396221548.0, | |
| "step": 16100 | |
| }, | |
| { | |
| "entropy": 1.7468533515930176, | |
| "epoch": 0.7651127534584043, | |
| "grad_norm": 1.0182217359542847, | |
| "learning_rate": 7.759303768456463e-05, | |
| "loss": 1.3123, | |
| "mean_token_accuracy": 0.7229688459634781, | |
| "num_tokens": 397439687.0, | |
| "step": 16150 | |
| }, | |
| { | |
| "entropy": 1.7685637962818146, | |
| "epoch": 0.7674815235929505, | |
| "grad_norm": 1.1658631563186646, | |
| "learning_rate": 7.742041408025747e-05, | |
| "loss": 1.3163, | |
| "mean_token_accuracy": 0.7229421508312225, | |
| "num_tokens": 398648499.0, | |
| "step": 16200 | |
| }, | |
| { | |
| "entropy": 1.74519140958786, | |
| "epoch": 0.7698502937274967, | |
| "grad_norm": 1.0480293035507202, | |
| "learning_rate": 7.724732178873456e-05, | |
| "loss": 1.3396, | |
| "mean_token_accuracy": 0.7191978305578232, | |
| "num_tokens": 399900933.0, | |
| "step": 16250 | |
| }, | |
| { | |
| "entropy": 1.73216095328331, | |
| "epoch": 0.7722190638620429, | |
| "grad_norm": 1.105089783668518, | |
| "learning_rate": 7.707376376859984e-05, | |
| "loss": 1.3092, | |
| "mean_token_accuracy": 0.7250679528713226, | |
| "num_tokens": 401117830.0, | |
| "step": 16300 | |
| }, | |
| { | |
| "entropy": 1.76406853556633, | |
| "epoch": 0.7745878339965889, | |
| "grad_norm": 1.2632781267166138, | |
| "learning_rate": 7.689974298641773e-05, | |
| "loss": 1.3509, | |
| "mean_token_accuracy": 0.7167004567384719, | |
| "num_tokens": 402347744.0, | |
| "step": 16350 | |
| }, | |
| { | |
| "entropy": 1.7978839790821075, | |
| "epoch": 0.7769566041311351, | |
| "grad_norm": 1.0637677907943726, | |
| "learning_rate": 7.672526241666248e-05, | |
| "loss": 1.3469, | |
| "mean_token_accuracy": 0.71729552090168, | |
| "num_tokens": 403549647.0, | |
| "step": 16400 | |
| }, | |
| { | |
| "entropy": 1.764045135974884, | |
| "epoch": 0.7793253742656813, | |
| "grad_norm": 0.9130464196205139, | |
| "learning_rate": 7.655032504166735e-05, | |
| "loss": 1.3204, | |
| "mean_token_accuracy": 0.7207730168104172, | |
| "num_tokens": 404774771.0, | |
| "step": 16450 | |
| }, | |
| { | |
| "entropy": 1.7537085354328155, | |
| "epoch": 0.7816941444002274, | |
| "grad_norm": 1.1020361185073853, | |
| "learning_rate": 7.637493385157358e-05, | |
| "loss": 1.327, | |
| "mean_token_accuracy": 0.7206742608547211, | |
| "num_tokens": 406011265.0, | |
| "step": 16500 | |
| }, | |
| { | |
| "entropy": 1.755173259973526, | |
| "epoch": 0.7840629145347735, | |
| "grad_norm": 0.9496687650680542, | |
| "learning_rate": 7.619909184427934e-05, | |
| "loss": 1.3013, | |
| "mean_token_accuracy": 0.7237769782543182, | |
| "num_tokens": 407262276.0, | |
| "step": 16550 | |
| }, | |
| { | |
| "entropy": 1.8000263261795044, | |
| "epoch": 0.7864316846693197, | |
| "grad_norm": 1.295494556427002, | |
| "learning_rate": 7.602280202538839e-05, | |
| "loss": 1.3753, | |
| "mean_token_accuracy": 0.7130093973875046, | |
| "num_tokens": 408508718.0, | |
| "step": 16600 | |
| }, | |
| { | |
| "entropy": 1.746513249874115, | |
| "epoch": 0.7888004548038658, | |
| "grad_norm": 1.1544225215911865, | |
| "learning_rate": 7.584606740815885e-05, | |
| "loss": 1.3246, | |
| "mean_token_accuracy": 0.7214300912618637, | |
| "num_tokens": 409745538.0, | |
| "step": 16650 | |
| }, | |
| { | |
| "entropy": 1.8098858451843263, | |
| "epoch": 0.791169224938412, | |
| "grad_norm": 0.9912792444229126, | |
| "learning_rate": 7.566889101345156e-05, | |
| "loss": 1.3452, | |
| "mean_token_accuracy": 0.7167094177007676, | |
| "num_tokens": 410988780.0, | |
| "step": 16700 | |
| }, | |
| { | |
| "entropy": 1.735760669708252, | |
| "epoch": 0.7935379950729581, | |
| "grad_norm": 0.9103946685791016, | |
| "learning_rate": 7.549127586967853e-05, | |
| "loss": 1.3319, | |
| "mean_token_accuracy": 0.7208295828104019, | |
| "num_tokens": 412261045.0, | |
| "step": 16750 | |
| }, | |
| { | |
| "entropy": 1.7235812985897063, | |
| "epoch": 0.7959067652075043, | |
| "grad_norm": 0.9902112483978271, | |
| "learning_rate": 7.531322501275114e-05, | |
| "loss": 1.3523, | |
| "mean_token_accuracy": 0.7184983837604523, | |
| "num_tokens": 413490577.0, | |
| "step": 16800 | |
| }, | |
| { | |
| "entropy": 1.7275058662891387, | |
| "epoch": 0.7982755353420504, | |
| "grad_norm": 0.857623279094696, | |
| "learning_rate": 7.513474148602826e-05, | |
| "loss": 1.3474, | |
| "mean_token_accuracy": 0.71783855676651, | |
| "num_tokens": 414734324.0, | |
| "step": 16850 | |
| }, | |
| { | |
| "entropy": 1.712952392101288, | |
| "epoch": 0.8006443054765966, | |
| "grad_norm": 0.8611600399017334, | |
| "learning_rate": 7.495582834026421e-05, | |
| "loss": 1.3284, | |
| "mean_token_accuracy": 0.7218652653694153, | |
| "num_tokens": 415979550.0, | |
| "step": 16900 | |
| }, | |
| { | |
| "entropy": 1.7426096272468568, | |
| "epoch": 0.8030130756111427, | |
| "grad_norm": 1.153913140296936, | |
| "learning_rate": 7.47764886335567e-05, | |
| "loss": 1.3673, | |
| "mean_token_accuracy": 0.7147996026277542, | |
| "num_tokens": 417172690.0, | |
| "step": 16950 | |
| }, | |
| { | |
| "entropy": 1.7134468042850495, | |
| "epoch": 0.8053818457456888, | |
| "grad_norm": 0.9624414443969727, | |
| "learning_rate": 7.459672543129438e-05, | |
| "loss": 1.3301, | |
| "mean_token_accuracy": 0.7208444583415985, | |
| "num_tokens": 418396867.0, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.8053818457456888, | |
| "eval_entropy": 1.1603839400696954, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7568193746719877, | |
| "eval_num_tokens": 418396867.0, | |
| "eval_runtime": 730.007, | |
| "eval_samples_per_second": 33.993, | |
| "eval_steps_per_second": 4.249, | |
| "step": 17000 | |
| }, | |
| { | |
| "entropy": 1.7083112740516662, | |
| "epoch": 0.807750615880235, | |
| "grad_norm": 1.0578949451446533, | |
| "learning_rate": 7.441654180610466e-05, | |
| "loss": 1.3116, | |
| "mean_token_accuracy": 0.7241713929176331, | |
| "num_tokens": 419620242.0, | |
| "step": 17050 | |
| }, | |
| { | |
| "entropy": 1.7101111936569213, | |
| "epoch": 0.8101193860147812, | |
| "grad_norm": 1.4528478384017944, | |
| "learning_rate": 7.423594083780106e-05, | |
| "loss": 1.2894, | |
| "mean_token_accuracy": 0.7281060153245926, | |
| "num_tokens": 420840365.0, | |
| "step": 17100 | |
| }, | |
| { | |
| "entropy": 1.7118054771423339, | |
| "epoch": 0.8124881561493272, | |
| "grad_norm": 0.9906657338142395, | |
| "learning_rate": 7.405492561333052e-05, | |
| "loss": 1.3313, | |
| "mean_token_accuracy": 0.7208691501617431, | |
| "num_tokens": 422103109.0, | |
| "step": 17150 | |
| }, | |
| { | |
| "entropy": 1.743634682893753, | |
| "epoch": 0.8148569262838734, | |
| "grad_norm": 1.1013976335525513, | |
| "learning_rate": 7.387349922672082e-05, | |
| "loss": 1.3435, | |
| "mean_token_accuracy": 0.7182679337263107, | |
| "num_tokens": 423336510.0, | |
| "step": 17200 | |
| }, | |
| { | |
| "entropy": 1.7483525812625884, | |
| "epoch": 0.8172256964184196, | |
| "grad_norm": 1.1249130964279175, | |
| "learning_rate": 7.369166477902753e-05, | |
| "loss": 1.3356, | |
| "mean_token_accuracy": 0.7192718476057053, | |
| "num_tokens": 424558544.0, | |
| "step": 17250 | |
| }, | |
| { | |
| "entropy": 1.7446792232990265, | |
| "epoch": 0.8195944665529658, | |
| "grad_norm": 0.9279561042785645, | |
| "learning_rate": 7.350942537828105e-05, | |
| "loss": 1.357, | |
| "mean_token_accuracy": 0.7167576777935029, | |
| "num_tokens": 425791336.0, | |
| "step": 17300 | |
| }, | |
| { | |
| "entropy": 1.7697773826122285, | |
| "epoch": 0.8219632366875118, | |
| "grad_norm": 1.0355454683303833, | |
| "learning_rate": 7.332678413943352e-05, | |
| "loss": 1.3279, | |
| "mean_token_accuracy": 0.7211151129007339, | |
| "num_tokens": 427017026.0, | |
| "step": 17350 | |
| }, | |
| { | |
| "entropy": 1.769633387327194, | |
| "epoch": 0.824332006822058, | |
| "grad_norm": 1.1847100257873535, | |
| "learning_rate": 7.314374418430554e-05, | |
| "loss": 1.3239, | |
| "mean_token_accuracy": 0.7223846167325974, | |
| "num_tokens": 428272562.0, | |
| "step": 17400 | |
| }, | |
| { | |
| "entropy": 1.754239571094513, | |
| "epoch": 0.8267007769566042, | |
| "grad_norm": 1.0456291437149048, | |
| "learning_rate": 7.296030864153286e-05, | |
| "loss": 1.3136, | |
| "mean_token_accuracy": 0.7257154327630997, | |
| "num_tokens": 429502230.0, | |
| "step": 17450 | |
| }, | |
| { | |
| "entropy": 1.727893146276474, | |
| "epoch": 0.8290695470911503, | |
| "grad_norm": 1.0988260507583618, | |
| "learning_rate": 7.277648064651281e-05, | |
| "loss": 1.3325, | |
| "mean_token_accuracy": 0.7202855634689331, | |
| "num_tokens": 430738126.0, | |
| "step": 17500 | |
| }, | |
| { | |
| "entropy": 1.7441512525081635, | |
| "epoch": 0.8314383172256964, | |
| "grad_norm": 1.4698035717010498, | |
| "learning_rate": 7.259226334135079e-05, | |
| "loss": 1.303, | |
| "mean_token_accuracy": 0.7249046045541764, | |
| "num_tokens": 431957649.0, | |
| "step": 17550 | |
| }, | |
| { | |
| "entropy": 1.7701557087898254, | |
| "epoch": 0.8338070873602426, | |
| "grad_norm": 0.8762974143028259, | |
| "learning_rate": 7.240765987480654e-05, | |
| "loss": 1.3501, | |
| "mean_token_accuracy": 0.7148396277427673, | |
| "num_tokens": 433171928.0, | |
| "step": 17600 | |
| }, | |
| { | |
| "entropy": 1.771160396337509, | |
| "epoch": 0.8361758574947887, | |
| "grad_norm": 0.9736217260360718, | |
| "learning_rate": 7.222267340224034e-05, | |
| "loss": 1.324, | |
| "mean_token_accuracy": 0.7225921380519867, | |
| "num_tokens": 434354105.0, | |
| "step": 17650 | |
| }, | |
| { | |
| "entropy": 1.7395762205123901, | |
| "epoch": 0.8385446276293349, | |
| "grad_norm": 1.1522939205169678, | |
| "learning_rate": 7.203730708555897e-05, | |
| "loss": 1.3243, | |
| "mean_token_accuracy": 0.7200587207078933, | |
| "num_tokens": 435556247.0, | |
| "step": 17700 | |
| }, | |
| { | |
| "entropy": 1.7756438231468201, | |
| "epoch": 0.840913397763881, | |
| "grad_norm": 1.2958649396896362, | |
| "learning_rate": 7.185156409316186e-05, | |
| "loss": 1.374, | |
| "mean_token_accuracy": 0.7119175827503205, | |
| "num_tokens": 436745432.0, | |
| "step": 17750 | |
| }, | |
| { | |
| "entropy": 1.7542549967765808, | |
| "epoch": 0.8432821678984271, | |
| "grad_norm": 1.1340820789337158, | |
| "learning_rate": 7.166544759988676e-05, | |
| "loss": 1.3066, | |
| "mean_token_accuracy": 0.7246117842197418, | |
| "num_tokens": 437965467.0, | |
| "step": 17800 | |
| }, | |
| { | |
| "entropy": 1.7582876706123352, | |
| "epoch": 0.8456509380329733, | |
| "grad_norm": 1.3607231378555298, | |
| "learning_rate": 7.147896078695551e-05, | |
| "loss": 1.304, | |
| "mean_token_accuracy": 0.724840202331543, | |
| "num_tokens": 439190298.0, | |
| "step": 17850 | |
| }, | |
| { | |
| "entropy": 1.7604767334461213, | |
| "epoch": 0.8480197081675194, | |
| "grad_norm": 1.2566254138946533, | |
| "learning_rate": 7.129210684191973e-05, | |
| "loss": 1.3237, | |
| "mean_token_accuracy": 0.7207924181222916, | |
| "num_tokens": 440410573.0, | |
| "step": 17900 | |
| }, | |
| { | |
| "entropy": 1.7759632766246796, | |
| "epoch": 0.8503884783020655, | |
| "grad_norm": 1.3075144290924072, | |
| "learning_rate": 7.110488895860633e-05, | |
| "loss": 1.3476, | |
| "mean_token_accuracy": 0.7191230463981628, | |
| "num_tokens": 441635031.0, | |
| "step": 17950 | |
| }, | |
| { | |
| "entropy": 1.7343137776851654, | |
| "epoch": 0.8527572484366117, | |
| "grad_norm": 0.9776571393013, | |
| "learning_rate": 7.091731033706281e-05, | |
| "loss": 1.3101, | |
| "mean_token_accuracy": 0.7254330676794052, | |
| "num_tokens": 442870901.0, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.8527572484366117, | |
| "eval_entropy": 1.1681382538694325, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7576727671230478, | |
| "eval_num_tokens": 442870901.0, | |
| "eval_runtime": 728.745, | |
| "eval_samples_per_second": 34.052, | |
| "eval_steps_per_second": 4.257, | |
| "step": 18000 | |
| }, | |
| { | |
| "entropy": 1.7468896472454072, | |
| "epoch": 0.8551260185711579, | |
| "grad_norm": 1.1819489002227783, | |
| "learning_rate": 7.072937418350267e-05, | |
| "loss": 1.3424, | |
| "mean_token_accuracy": 0.7205572354793549, | |
| "num_tokens": 444113226.0, | |
| "step": 18050 | |
| }, | |
| { | |
| "entropy": 1.752768530845642, | |
| "epoch": 0.857494788705704, | |
| "grad_norm": 1.1274313926696777, | |
| "learning_rate": 7.05410837102506e-05, | |
| "loss": 1.3276, | |
| "mean_token_accuracy": 0.7209989041090011, | |
| "num_tokens": 445334171.0, | |
| "step": 18100 | |
| }, | |
| { | |
| "entropy": 1.7303791618347169, | |
| "epoch": 0.8598635588402501, | |
| "grad_norm": 1.3661671876907349, | |
| "learning_rate": 7.035244213568752e-05, | |
| "loss": 1.2946, | |
| "mean_token_accuracy": 0.7279473000764847, | |
| "num_tokens": 446563863.0, | |
| "step": 18150 | |
| }, | |
| { | |
| "entropy": 1.7404825651645661, | |
| "epoch": 0.8622323289747963, | |
| "grad_norm": 1.2854536771774292, | |
| "learning_rate": 7.016345268419559e-05, | |
| "loss": 1.3414, | |
| "mean_token_accuracy": 0.7202253836393356, | |
| "num_tokens": 447805772.0, | |
| "step": 18200 | |
| }, | |
| { | |
| "entropy": 1.7380101013183593, | |
| "epoch": 0.8646010991093425, | |
| "grad_norm": 1.1535879373550415, | |
| "learning_rate": 6.997411858610311e-05, | |
| "loss": 1.3059, | |
| "mean_token_accuracy": 0.7250276601314545, | |
| "num_tokens": 449010921.0, | |
| "step": 18250 | |
| }, | |
| { | |
| "entropy": 1.7006947088241577, | |
| "epoch": 0.8669698692438885, | |
| "grad_norm": 1.076830506324768, | |
| "learning_rate": 6.978444307762932e-05, | |
| "loss": 1.2936, | |
| "mean_token_accuracy": 0.7278600412607193, | |
| "num_tokens": 450222021.0, | |
| "step": 18300 | |
| }, | |
| { | |
| "entropy": 1.7165203237533568, | |
| "epoch": 0.8693386393784347, | |
| "grad_norm": 1.1687458753585815, | |
| "learning_rate": 6.959442940082907e-05, | |
| "loss": 1.3093, | |
| "mean_token_accuracy": 0.7266046351194382, | |
| "num_tokens": 451437320.0, | |
| "step": 18350 | |
| }, | |
| { | |
| "entropy": 1.7525631844997407, | |
| "epoch": 0.8717074095129809, | |
| "grad_norm": 1.0274993181228638, | |
| "learning_rate": 6.940408080353737e-05, | |
| "loss": 1.3405, | |
| "mean_token_accuracy": 0.7197101265192032, | |
| "num_tokens": 452637999.0, | |
| "step": 18400 | |
| }, | |
| { | |
| "entropy": 1.6950951242446899, | |
| "epoch": 0.874076179647527, | |
| "grad_norm": 1.170281171798706, | |
| "learning_rate": 6.921340053931389e-05, | |
| "loss": 1.322, | |
| "mean_token_accuracy": 0.7230970364809036, | |
| "num_tokens": 453872583.0, | |
| "step": 18450 | |
| }, | |
| { | |
| "entropy": 1.7251943707466126, | |
| "epoch": 0.8764449497820731, | |
| "grad_norm": 1.0783965587615967, | |
| "learning_rate": 6.902239186738742e-05, | |
| "loss": 1.3077, | |
| "mean_token_accuracy": 0.7254717952013016, | |
| "num_tokens": 455115487.0, | |
| "step": 18500 | |
| }, | |
| { | |
| "entropy": 1.748286772966385, | |
| "epoch": 0.8788137199166193, | |
| "grad_norm": 0.9898918867111206, | |
| "learning_rate": 6.883105805260006e-05, | |
| "loss": 1.336, | |
| "mean_token_accuracy": 0.7198050141334533, | |
| "num_tokens": 456357289.0, | |
| "step": 18550 | |
| }, | |
| { | |
| "entropy": 1.7271603178977966, | |
| "epoch": 0.8811824900511654, | |
| "grad_norm": 1.0102683305740356, | |
| "learning_rate": 6.863940236535146e-05, | |
| "loss": 1.2972, | |
| "mean_token_accuracy": 0.7267592811584472, | |
| "num_tokens": 457601954.0, | |
| "step": 18600 | |
| }, | |
| { | |
| "entropy": 1.7312583494186402, | |
| "epoch": 0.8835512601857116, | |
| "grad_norm": 1.1015334129333496, | |
| "learning_rate": 6.844742808154297e-05, | |
| "loss": 1.3264, | |
| "mean_token_accuracy": 0.7216020065546036, | |
| "num_tokens": 458836210.0, | |
| "step": 18650 | |
| }, | |
| { | |
| "entropy": 1.7203202879428863, | |
| "epoch": 0.8859200303202577, | |
| "grad_norm": 1.2608678340911865, | |
| "learning_rate": 6.82551384825215e-05, | |
| "loss": 1.3112, | |
| "mean_token_accuracy": 0.7242467325925827, | |
| "num_tokens": 460059995.0, | |
| "step": 18700 | |
| }, | |
| { | |
| "entropy": 1.73845316529274, | |
| "epoch": 0.8882888004548039, | |
| "grad_norm": 1.2071726322174072, | |
| "learning_rate": 6.806253685502361e-05, | |
| "loss": 1.3422, | |
| "mean_token_accuracy": 0.7193791323900223, | |
| "num_tokens": 461321260.0, | |
| "step": 18750 | |
| }, | |
| { | |
| "entropy": 1.7286129772663117, | |
| "epoch": 0.89065757058935, | |
| "grad_norm": 0.9281275868415833, | |
| "learning_rate": 6.786962649111926e-05, | |
| "loss": 1.3346, | |
| "mean_token_accuracy": 0.7215994411706924, | |
| "num_tokens": 462547797.0, | |
| "step": 18800 | |
| }, | |
| { | |
| "entropy": 1.7290022671222687, | |
| "epoch": 0.8930263407238962, | |
| "grad_norm": 1.4838134050369263, | |
| "learning_rate": 6.767641068815546e-05, | |
| "loss": 1.2936, | |
| "mean_token_accuracy": 0.7260348951816559, | |
| "num_tokens": 463769872.0, | |
| "step": 18850 | |
| }, | |
| { | |
| "entropy": 1.7164359045028688, | |
| "epoch": 0.8953951108584423, | |
| "grad_norm": 0.9822611808776855, | |
| "learning_rate": 6.748289274870001e-05, | |
| "loss": 1.2841, | |
| "mean_token_accuracy": 0.7294727778434753, | |
| "num_tokens": 465012929.0, | |
| "step": 18900 | |
| }, | |
| { | |
| "entropy": 1.778987593650818, | |
| "epoch": 0.8977638809929884, | |
| "grad_norm": 1.056518793106079, | |
| "learning_rate": 6.728907598048503e-05, | |
| "loss": 1.3276, | |
| "mean_token_accuracy": 0.7213660079240799, | |
| "num_tokens": 466199667.0, | |
| "step": 18950 | |
| }, | |
| { | |
| "entropy": 1.7454373347759247, | |
| "epoch": 0.9001326511275346, | |
| "grad_norm": 1.1590458154678345, | |
| "learning_rate": 6.709496369635043e-05, | |
| "loss": 1.3057, | |
| "mean_token_accuracy": 0.7262363374233246, | |
| "num_tokens": 467441804.0, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.9001326511275346, | |
| "eval_entropy": 1.1678229505634554, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7581798996773941, | |
| "eval_num_tokens": 467441804.0, | |
| "eval_runtime": 729.0934, | |
| "eval_samples_per_second": 34.035, | |
| "eval_steps_per_second": 4.255, | |
| "step": 19000 | |
| }, | |
| { | |
| "entropy": 1.7560745322704314, | |
| "epoch": 0.9025014212620808, | |
| "grad_norm": 1.1119104623794556, | |
| "learning_rate": 6.69005592141872e-05, | |
| "loss": 1.3103, | |
| "mean_token_accuracy": 0.7248196619749069, | |
| "num_tokens": 468672442.0, | |
| "step": 19050 | |
| }, | |
| { | |
| "entropy": 1.7368404233455659, | |
| "epoch": 0.9048701913966268, | |
| "grad_norm": 1.0841938257217407, | |
| "learning_rate": 6.670586585688086e-05, | |
| "loss": 1.3168, | |
| "mean_token_accuracy": 0.723661498427391, | |
| "num_tokens": 469911642.0, | |
| "step": 19100 | |
| }, | |
| { | |
| "entropy": 1.7659292578697205, | |
| "epoch": 0.907238961531173, | |
| "grad_norm": 1.3782135248184204, | |
| "learning_rate": 6.651088695225447e-05, | |
| "loss": 1.3044, | |
| "mean_token_accuracy": 0.7251561576128006, | |
| "num_tokens": 471110856.0, | |
| "step": 19150 | |
| }, | |
| { | |
| "entropy": 1.7336669373512268, | |
| "epoch": 0.9096077316657192, | |
| "grad_norm": 1.1556999683380127, | |
| "learning_rate": 6.631562583301191e-05, | |
| "loss": 1.297, | |
| "mean_token_accuracy": 0.7274081045389176, | |
| "num_tokens": 472320155.0, | |
| "step": 19200 | |
| }, | |
| { | |
| "entropy": 1.684252212047577, | |
| "epoch": 0.9119765018002653, | |
| "grad_norm": 0.9659352898597717, | |
| "learning_rate": 6.612008583668082e-05, | |
| "loss": 1.3105, | |
| "mean_token_accuracy": 0.7258839225769043, | |
| "num_tokens": 473560540.0, | |
| "step": 19250 | |
| }, | |
| { | |
| "entropy": 1.736447709798813, | |
| "epoch": 0.9143452719348114, | |
| "grad_norm": 1.231652855873108, | |
| "learning_rate": 6.592427030555565e-05, | |
| "loss": 1.3364, | |
| "mean_token_accuracy": 0.7204890990257263, | |
| "num_tokens": 474795749.0, | |
| "step": 19300 | |
| }, | |
| { | |
| "entropy": 1.6979431188106537, | |
| "epoch": 0.9167140420693576, | |
| "grad_norm": 1.0034390687942505, | |
| "learning_rate": 6.572818258664035e-05, | |
| "loss": 1.321, | |
| "mean_token_accuracy": 0.7222663134336471, | |
| "num_tokens": 476048351.0, | |
| "step": 19350 | |
| }, | |
| { | |
| "entropy": 1.7359539401531219, | |
| "epoch": 0.9190828122039038, | |
| "grad_norm": 1.0725759267807007, | |
| "learning_rate": 6.55318260315914e-05, | |
| "loss": 1.3228, | |
| "mean_token_accuracy": 0.7220259785652161, | |
| "num_tokens": 477260246.0, | |
| "step": 19400 | |
| }, | |
| { | |
| "entropy": 1.6657227408885955, | |
| "epoch": 0.9214515823384499, | |
| "grad_norm": 0.9826326370239258, | |
| "learning_rate": 6.533520399666033e-05, | |
| "loss": 1.2904, | |
| "mean_token_accuracy": 0.7296865725517273, | |
| "num_tokens": 478504094.0, | |
| "step": 19450 | |
| }, | |
| { | |
| "entropy": 1.7167856967449189, | |
| "epoch": 0.923820352472996, | |
| "grad_norm": 0.9942904710769653, | |
| "learning_rate": 6.513831984263641e-05, | |
| "loss": 1.2708, | |
| "mean_token_accuracy": 0.7317487215995788, | |
| "num_tokens": 479728318.0, | |
| "step": 19500 | |
| }, | |
| { | |
| "entropy": 1.7254127764701843, | |
| "epoch": 0.9261891226075422, | |
| "grad_norm": 1.4505666494369507, | |
| "learning_rate": 6.494117693478926e-05, | |
| "loss": 1.2893, | |
| "mean_token_accuracy": 0.7286518901586533, | |
| "num_tokens": 480937077.0, | |
| "step": 19550 | |
| }, | |
| { | |
| "entropy": 1.7521008849143982, | |
| "epoch": 0.9285578927420883, | |
| "grad_norm": 1.066002607345581, | |
| "learning_rate": 6.474377864281127e-05, | |
| "loss": 1.3244, | |
| "mean_token_accuracy": 0.7240516602993011, | |
| "num_tokens": 482172564.0, | |
| "step": 19600 | |
| }, | |
| { | |
| "entropy": 1.7225322866439818, | |
| "epoch": 0.9309266628766345, | |
| "grad_norm": 1.1396028995513916, | |
| "learning_rate": 6.454612834076e-05, | |
| "loss": 1.3052, | |
| "mean_token_accuracy": 0.7258065021038056, | |
| "num_tokens": 483406518.0, | |
| "step": 19650 | |
| }, | |
| { | |
| "entropy": 1.7187459325790406, | |
| "epoch": 0.9332954330111806, | |
| "grad_norm": 0.8960033655166626, | |
| "learning_rate": 6.434822940700057e-05, | |
| "loss": 1.297, | |
| "mean_token_accuracy": 0.7268172729015351, | |
| "num_tokens": 484643697.0, | |
| "step": 19700 | |
| }, | |
| { | |
| "entropy": 1.7082497942447663, | |
| "epoch": 0.9356642031457267, | |
| "grad_norm": 1.1821448802947998, | |
| "learning_rate": 6.415008522414782e-05, | |
| "loss": 1.292, | |
| "mean_token_accuracy": 0.7285556894540787, | |
| "num_tokens": 485855707.0, | |
| "step": 19750 | |
| }, | |
| { | |
| "entropy": 1.7341230428218841, | |
| "epoch": 0.9380329732802729, | |
| "grad_norm": 1.0824941396713257, | |
| "learning_rate": 6.395169917900858e-05, | |
| "loss": 1.3135, | |
| "mean_token_accuracy": 0.723016293644905, | |
| "num_tokens": 487075872.0, | |
| "step": 19800 | |
| }, | |
| { | |
| "entropy": 1.712151471376419, | |
| "epoch": 0.9404017434148191, | |
| "grad_norm": 1.4998127222061157, | |
| "learning_rate": 6.375307466252372e-05, | |
| "loss": 1.3492, | |
| "mean_token_accuracy": 0.7187636381387711, | |
| "num_tokens": 488272477.0, | |
| "step": 19850 | |
| }, | |
| { | |
| "entropy": 1.683067034482956, | |
| "epoch": 0.9427705135493651, | |
| "grad_norm": 0.9722391963005066, | |
| "learning_rate": 6.355421506971025e-05, | |
| "loss": 1.2899, | |
| "mean_token_accuracy": 0.728040627837181, | |
| "num_tokens": 489486559.0, | |
| "step": 19900 | |
| }, | |
| { | |
| "entropy": 1.6999699199199676, | |
| "epoch": 0.9451392836839113, | |
| "grad_norm": 1.150619626045227, | |
| "learning_rate": 6.335512379960322e-05, | |
| "loss": 1.2776, | |
| "mean_token_accuracy": 0.7324709689617157, | |
| "num_tokens": 490706066.0, | |
| "step": 19950 | |
| }, | |
| { | |
| "entropy": 1.7298948228359223, | |
| "epoch": 0.9475080538184575, | |
| "grad_norm": 1.2315185070037842, | |
| "learning_rate": 6.315580425519766e-05, | |
| "loss": 1.3312, | |
| "mean_token_accuracy": 0.7208713871240616, | |
| "num_tokens": 491918291.0, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.9475080538184575, | |
| "eval_entropy": 1.15051956327787, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7579382187335588, | |
| "eval_num_tokens": 491918291.0, | |
| "eval_runtime": 726.7621, | |
| "eval_samples_per_second": 34.145, | |
| "eval_steps_per_second": 4.268, | |
| "step": 20000 | |
| }, | |
| { | |
| "entropy": 1.7038512194156648, | |
| "epoch": 0.9498768239530035, | |
| "grad_norm": 1.163555383682251, | |
| "learning_rate": 6.295625984339043e-05, | |
| "loss": 1.3204, | |
| "mean_token_accuracy": 0.7242659759521485, | |
| "num_tokens": 493116821.0, | |
| "step": 20050 | |
| }, | |
| { | |
| "entropy": 1.741351603269577, | |
| "epoch": 0.9522455940875497, | |
| "grad_norm": 1.1677212715148926, | |
| "learning_rate": 6.275649397492195e-05, | |
| "loss": 1.3061, | |
| "mean_token_accuracy": 0.7256824851036072, | |
| "num_tokens": 494289241.0, | |
| "step": 20100 | |
| }, | |
| { | |
| "entropy": 1.718240325450897, | |
| "epoch": 0.9546143642220959, | |
| "grad_norm": 0.9395654797554016, | |
| "learning_rate": 6.255651006431793e-05, | |
| "loss": 1.2979, | |
| "mean_token_accuracy": 0.7259613001346588, | |
| "num_tokens": 495524606.0, | |
| "step": 20150 | |
| }, | |
| { | |
| "entropy": 1.6947869229316712, | |
| "epoch": 0.9569831343566421, | |
| "grad_norm": 0.9097754955291748, | |
| "learning_rate": 6.235631152983098e-05, | |
| "loss": 1.3067, | |
| "mean_token_accuracy": 0.7251908606290818, | |
| "num_tokens": 496753832.0, | |
| "step": 20200 | |
| }, | |
| { | |
| "entropy": 1.6904695987701417, | |
| "epoch": 0.9593519044911881, | |
| "grad_norm": 1.0224709510803223, | |
| "learning_rate": 6.215590179338221e-05, | |
| "loss": 1.2916, | |
| "mean_token_accuracy": 0.7274919444322586, | |
| "num_tokens": 497977841.0, | |
| "step": 20250 | |
| }, | |
| { | |
| "entropy": 1.7162616491317748, | |
| "epoch": 0.9617206746257343, | |
| "grad_norm": 1.1449992656707764, | |
| "learning_rate": 6.195528428050273e-05, | |
| "loss": 1.3412, | |
| "mean_token_accuracy": 0.7184383940696716, | |
| "num_tokens": 499188236.0, | |
| "step": 20300 | |
| }, | |
| { | |
| "entropy": 1.6742710149288178, | |
| "epoch": 0.9640894447602805, | |
| "grad_norm": 1.100428581237793, | |
| "learning_rate": 6.17544624202751e-05, | |
| "loss": 1.2547, | |
| "mean_token_accuracy": 0.7356196337938309, | |
| "num_tokens": 500446815.0, | |
| "step": 20350 | |
| }, | |
| { | |
| "entropy": 1.6939631617069244, | |
| "epoch": 0.9664582148948266, | |
| "grad_norm": 0.9926633238792419, | |
| "learning_rate": 6.15534396452747e-05, | |
| "loss": 1.3121, | |
| "mean_token_accuracy": 0.7248954975605011, | |
| "num_tokens": 501679112.0, | |
| "step": 20400 | |
| }, | |
| { | |
| "entropy": 1.739109193086624, | |
| "epoch": 0.9688269850293727, | |
| "grad_norm": 1.270719051361084, | |
| "learning_rate": 6.135221939151108e-05, | |
| "loss": 1.3404, | |
| "mean_token_accuracy": 0.7209612077474594, | |
| "num_tokens": 502912575.0, | |
| "step": 20450 | |
| }, | |
| { | |
| "entropy": 1.7274162566661835, | |
| "epoch": 0.9711957551639189, | |
| "grad_norm": 1.2614290714263916, | |
| "learning_rate": 6.115080509836923e-05, | |
| "loss": 1.334, | |
| "mean_token_accuracy": 0.7216370838880539, | |
| "num_tokens": 504141410.0, | |
| "step": 20500 | |
| }, | |
| { | |
| "entropy": 1.708759593963623, | |
| "epoch": 0.973564525298465, | |
| "grad_norm": 1.2522040605545044, | |
| "learning_rate": 6.09492002085508e-05, | |
| "loss": 1.3175, | |
| "mean_token_accuracy": 0.7249479728937149, | |
| "num_tokens": 505345687.0, | |
| "step": 20550 | |
| }, | |
| { | |
| "entropy": 1.6911339461803436, | |
| "epoch": 0.9759332954330112, | |
| "grad_norm": 1.0709445476531982, | |
| "learning_rate": 6.074740816801516e-05, | |
| "loss": 1.2945, | |
| "mean_token_accuracy": 0.7283177155256272, | |
| "num_tokens": 506583420.0, | |
| "step": 20600 | |
| }, | |
| { | |
| "entropy": 1.7511263823509216, | |
| "epoch": 0.9783020655675573, | |
| "grad_norm": 1.1028821468353271, | |
| "learning_rate": 6.054543242592071e-05, | |
| "loss": 1.3661, | |
| "mean_token_accuracy": 0.7142648506164551, | |
| "num_tokens": 507769373.0, | |
| "step": 20650 | |
| }, | |
| { | |
| "entropy": 1.7048313403129578, | |
| "epoch": 0.9806708357021034, | |
| "grad_norm": 1.2044216394424438, | |
| "learning_rate": 6.034327643456569e-05, | |
| "loss": 1.2878, | |
| "mean_token_accuracy": 0.7300767368078231, | |
| "num_tokens": 508986124.0, | |
| "step": 20700 | |
| }, | |
| { | |
| "entropy": 1.732351886034012, | |
| "epoch": 0.9830396058366496, | |
| "grad_norm": 1.118547797203064, | |
| "learning_rate": 6.014094364932931e-05, | |
| "loss": 1.3298, | |
| "mean_token_accuracy": 0.7219525814056397, | |
| "num_tokens": 510216131.0, | |
| "step": 20750 | |
| }, | |
| { | |
| "entropy": 1.7391897797584535, | |
| "epoch": 0.9854083759711958, | |
| "grad_norm": 1.134662389755249, | |
| "learning_rate": 5.993843752861266e-05, | |
| "loss": 1.349, | |
| "mean_token_accuracy": 0.7185146582126617, | |
| "num_tokens": 511452480.0, | |
| "step": 20800 | |
| }, | |
| { | |
| "entropy": 1.7354848337173463, | |
| "epoch": 0.9877771461057419, | |
| "grad_norm": 1.2413026094436646, | |
| "learning_rate": 5.9735761533779575e-05, | |
| "loss": 1.3117, | |
| "mean_token_accuracy": 0.7226764589548111, | |
| "num_tokens": 512677249.0, | |
| "step": 20850 | |
| }, | |
| { | |
| "entropy": 1.7198446631431579, | |
| "epoch": 0.990145916240288, | |
| "grad_norm": 1.094545602798462, | |
| "learning_rate": 5.953291912909751e-05, | |
| "loss": 1.271, | |
| "mean_token_accuracy": 0.7310113716125488, | |
| "num_tokens": 513916521.0, | |
| "step": 20900 | |
| }, | |
| { | |
| "entropy": 1.7085091185569763, | |
| "epoch": 0.9925146863748342, | |
| "grad_norm": 1.1231886148452759, | |
| "learning_rate": 5.932991378167827e-05, | |
| "loss": 1.2842, | |
| "mean_token_accuracy": 0.7295077663660049, | |
| "num_tokens": 515136725.0, | |
| "step": 20950 | |
| }, | |
| { | |
| "entropy": 1.6870656645298003, | |
| "epoch": 0.9948834565093804, | |
| "grad_norm": 1.1282687187194824, | |
| "learning_rate": 5.912674896141883e-05, | |
| "loss": 1.3022, | |
| "mean_token_accuracy": 0.7291330778598786, | |
| "num_tokens": 516364497.0, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.9948834565093804, | |
| "eval_entropy": 1.1225133023863065, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7580950103464779, | |
| "eval_num_tokens": 516364497.0, | |
| "eval_runtime": 728.4546, | |
| "eval_samples_per_second": 34.065, | |
| "eval_steps_per_second": 4.258, | |
| "step": 21000 | |
| }, | |
| { | |
| "entropy": 1.7055912351608276, | |
| "epoch": 0.9972522266439264, | |
| "grad_norm": 1.0038437843322754, | |
| "learning_rate": 5.892342814094193e-05, | |
| "loss": 1.3364, | |
| "mean_token_accuracy": 0.7213913726806641, | |
| "num_tokens": 517597413.0, | |
| "step": 21050 | |
| }, | |
| { | |
| "entropy": 1.6995044994354247, | |
| "epoch": 0.9996209967784726, | |
| "grad_norm": 1.0054970979690552, | |
| "learning_rate": 5.871995479553676e-05, | |
| "loss": 1.3426, | |
| "mean_token_accuracy": 0.7205459761619568, | |
| "num_tokens": 518808510.0, | |
| "step": 21100 | |
| }, | |
| { | |
| "entropy": 1.6123450350761415, | |
| "epoch": 1.0019897669130187, | |
| "grad_norm": 1.1733577251434326, | |
| "learning_rate": 5.851633240309963e-05, | |
| "loss": 1.2043, | |
| "mean_token_accuracy": 0.7436364030838013, | |
| "num_tokens": 520050715.0, | |
| "step": 21150 | |
| }, | |
| { | |
| "entropy": 1.6008358299732208, | |
| "epoch": 1.004358537047565, | |
| "grad_norm": 1.2594228982925415, | |
| "learning_rate": 5.8312564444074366e-05, | |
| "loss": 1.1962, | |
| "mean_token_accuracy": 0.7485791981220246, | |
| "num_tokens": 521272519.0, | |
| "step": 21200 | |
| }, | |
| { | |
| "entropy": 1.5864481520652771, | |
| "epoch": 1.006727307182111, | |
| "grad_norm": 1.1308941841125488, | |
| "learning_rate": 5.810865440139299e-05, | |
| "loss": 1.2014, | |
| "mean_token_accuracy": 0.7478245437145233, | |
| "num_tokens": 522505953.0, | |
| "step": 21250 | |
| }, | |
| { | |
| "entropy": 1.6050166165828705, | |
| "epoch": 1.0090960773166573, | |
| "grad_norm": 1.0653034448623657, | |
| "learning_rate": 5.790460576041608e-05, | |
| "loss": 1.2219, | |
| "mean_token_accuracy": 0.7426986521482468, | |
| "num_tokens": 523767803.0, | |
| "step": 21300 | |
| }, | |
| { | |
| "entropy": 1.598244547843933, | |
| "epoch": 1.0114648474512034, | |
| "grad_norm": 1.1732730865478516, | |
| "learning_rate": 5.77004220088732e-05, | |
| "loss": 1.2003, | |
| "mean_token_accuracy": 0.7464343649148941, | |
| "num_tokens": 525030135.0, | |
| "step": 21350 | |
| }, | |
| { | |
| "entropy": 1.5887214350700378, | |
| "epoch": 1.0138336175857494, | |
| "grad_norm": 1.0639592409133911, | |
| "learning_rate": 5.749610663680334e-05, | |
| "loss": 1.1959, | |
| "mean_token_accuracy": 0.7482451206445694, | |
| "num_tokens": 526245960.0, | |
| "step": 21400 | |
| }, | |
| { | |
| "entropy": 1.5872322118282318, | |
| "epoch": 1.0162023877202957, | |
| "grad_norm": 1.4078749418258667, | |
| "learning_rate": 5.729166313649523e-05, | |
| "loss": 1.1928, | |
| "mean_token_accuracy": 0.747797891497612, | |
| "num_tokens": 527461999.0, | |
| "step": 21450 | |
| }, | |
| { | |
| "entropy": 1.5684971618652344, | |
| "epoch": 1.0185711578548418, | |
| "grad_norm": 1.0040647983551025, | |
| "learning_rate": 5.7087095002427614e-05, | |
| "loss": 1.1636, | |
| "mean_token_accuracy": 0.7527302461862564, | |
| "num_tokens": 528693347.0, | |
| "step": 21500 | |
| }, | |
| { | |
| "entropy": 1.5993961930274962, | |
| "epoch": 1.0209399279893878, | |
| "grad_norm": 1.519827127456665, | |
| "learning_rate": 5.688240573120962e-05, | |
| "loss": 1.1996, | |
| "mean_token_accuracy": 0.7477444261312485, | |
| "num_tokens": 529916988.0, | |
| "step": 21550 | |
| }, | |
| { | |
| "entropy": 1.6100480878353118, | |
| "epoch": 1.023308698123934, | |
| "grad_norm": 1.298337459564209, | |
| "learning_rate": 5.6677598821520886e-05, | |
| "loss": 1.1941, | |
| "mean_token_accuracy": 0.746188434958458, | |
| "num_tokens": 531136613.0, | |
| "step": 21600 | |
| }, | |
| { | |
| "entropy": 1.608763552904129, | |
| "epoch": 1.0256774682584802, | |
| "grad_norm": 1.2754813432693481, | |
| "learning_rate": 5.647267777405177e-05, | |
| "loss": 1.1801, | |
| "mean_token_accuracy": 0.7486988466978073, | |
| "num_tokens": 532395495.0, | |
| "step": 21650 | |
| }, | |
| { | |
| "entropy": 1.5974150121212005, | |
| "epoch": 1.0280462383930264, | |
| "grad_norm": 1.306957721710205, | |
| "learning_rate": 5.626764609144364e-05, | |
| "loss": 1.229, | |
| "mean_token_accuracy": 0.7420145213603974, | |
| "num_tokens": 533626086.0, | |
| "step": 21700 | |
| }, | |
| { | |
| "entropy": 1.5830400812625884, | |
| "epoch": 1.0304150085275725, | |
| "grad_norm": 1.2734113931655884, | |
| "learning_rate": 5.606250727822883e-05, | |
| "loss": 1.2002, | |
| "mean_token_accuracy": 0.7472029691934585, | |
| "num_tokens": 534872278.0, | |
| "step": 21750 | |
| }, | |
| { | |
| "entropy": 1.5808272886276244, | |
| "epoch": 1.0327837786621186, | |
| "grad_norm": 0.9578977227210999, | |
| "learning_rate": 5.585726484077085e-05, | |
| "loss": 1.2118, | |
| "mean_token_accuracy": 0.745669018626213, | |
| "num_tokens": 536104060.0, | |
| "step": 21800 | |
| }, | |
| { | |
| "entropy": 1.546217747926712, | |
| "epoch": 1.0351525487966649, | |
| "grad_norm": 1.1195182800292969, | |
| "learning_rate": 5.565192228720439e-05, | |
| "loss": 1.1738, | |
| "mean_token_accuracy": 0.7508551919460297, | |
| "num_tokens": 537338574.0, | |
| "step": 21850 | |
| }, | |
| { | |
| "entropy": 1.5846721458435058, | |
| "epoch": 1.037521318931211, | |
| "grad_norm": 1.2577298879623413, | |
| "learning_rate": 5.544648312737547e-05, | |
| "loss": 1.1778, | |
| "mean_token_accuracy": 0.7510464614629746, | |
| "num_tokens": 538557980.0, | |
| "step": 21900 | |
| }, | |
| { | |
| "entropy": 1.5822556126117706, | |
| "epoch": 1.039890089065757, | |
| "grad_norm": 1.1481040716171265, | |
| "learning_rate": 5.524095087278126e-05, | |
| "loss": 1.1848, | |
| "mean_token_accuracy": 0.7497791868448257, | |
| "num_tokens": 539784677.0, | |
| "step": 21950 | |
| }, | |
| { | |
| "entropy": 1.627133835554123, | |
| "epoch": 1.0422588592003033, | |
| "grad_norm": 1.1411134004592896, | |
| "learning_rate": 5.503532903651023e-05, | |
| "loss": 1.2608, | |
| "mean_token_accuracy": 0.7371292334794998, | |
| "num_tokens": 541002283.0, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.0422588592003033, | |
| "eval_entropy": 1.0797893660903208, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7580676496720636, | |
| "eval_num_tokens": 541002283.0, | |
| "eval_runtime": 728.5593, | |
| "eval_samples_per_second": 34.06, | |
| "eval_steps_per_second": 4.258, | |
| "step": 22000 | |
| }, | |
| { | |
| "entropy": 1.5906290924549102, | |
| "epoch": 1.0446276293348493, | |
| "grad_norm": 1.164602518081665, | |
| "learning_rate": 5.482962113318203e-05, | |
| "loss": 1.2085, | |
| "mean_token_accuracy": 0.744525915980339, | |
| "num_tokens": 542190090.0, | |
| "step": 22050 | |
| }, | |
| { | |
| "entropy": 1.5971219801902772, | |
| "epoch": 1.0469963994693954, | |
| "grad_norm": 1.2099922895431519, | |
| "learning_rate": 5.462383067888741e-05, | |
| "loss": 1.2251, | |
| "mean_token_accuracy": 0.7439098012447357, | |
| "num_tokens": 543425520.0, | |
| "step": 22100 | |
| }, | |
| { | |
| "entropy": 1.5796366775035857, | |
| "epoch": 1.0493651696039417, | |
| "grad_norm": 1.0977132320404053, | |
| "learning_rate": 5.441796119112814e-05, | |
| "loss": 1.1964, | |
| "mean_token_accuracy": 0.7492218172550201, | |
| "num_tokens": 544655034.0, | |
| "step": 22150 | |
| }, | |
| { | |
| "entropy": 1.6276609122753143, | |
| "epoch": 1.0517339397384877, | |
| "grad_norm": 1.2101308107376099, | |
| "learning_rate": 5.421201618875689e-05, | |
| "loss": 1.2242, | |
| "mean_token_accuracy": 0.7425367647409439, | |
| "num_tokens": 545867278.0, | |
| "step": 22200 | |
| }, | |
| { | |
| "entropy": 1.6007154369354248, | |
| "epoch": 1.054102709873034, | |
| "grad_norm": 1.2157655954360962, | |
| "learning_rate": 5.4005999191917034e-05, | |
| "loss": 1.2258, | |
| "mean_token_accuracy": 0.7422733837366104, | |
| "num_tokens": 547117555.0, | |
| "step": 22250 | |
| }, | |
| { | |
| "entropy": 1.6148575782775878, | |
| "epoch": 1.05647148000758, | |
| "grad_norm": 1.2759310007095337, | |
| "learning_rate": 5.379991372198259e-05, | |
| "loss": 1.187, | |
| "mean_token_accuracy": 0.7483934825658798, | |
| "num_tokens": 548337677.0, | |
| "step": 22300 | |
| }, | |
| { | |
| "entropy": 1.6164679837226867, | |
| "epoch": 1.0588402501421261, | |
| "grad_norm": 1.041208267211914, | |
| "learning_rate": 5.359376330149789e-05, | |
| "loss": 1.2082, | |
| "mean_token_accuracy": 0.7465775471925735, | |
| "num_tokens": 549541289.0, | |
| "step": 22350 | |
| }, | |
| { | |
| "entropy": 1.5865103662014008, | |
| "epoch": 1.0612090202766724, | |
| "grad_norm": 1.0899627208709717, | |
| "learning_rate": 5.338755145411749e-05, | |
| "loss": 1.1928, | |
| "mean_token_accuracy": 0.747542524933815, | |
| "num_tokens": 550805086.0, | |
| "step": 22400 | |
| }, | |
| { | |
| "entropy": 1.6212577140331268, | |
| "epoch": 1.0635777904112185, | |
| "grad_norm": 1.08705472946167, | |
| "learning_rate": 5.318128170454589e-05, | |
| "loss": 1.1852, | |
| "mean_token_accuracy": 0.7487930029630661, | |
| "num_tokens": 552036289.0, | |
| "step": 22450 | |
| }, | |
| { | |
| "entropy": 1.620989305973053, | |
| "epoch": 1.0659465605457648, | |
| "grad_norm": 1.2424880266189575, | |
| "learning_rate": 5.297495757847727e-05, | |
| "loss": 1.1865, | |
| "mean_token_accuracy": 0.750239091515541, | |
| "num_tokens": 553267770.0, | |
| "step": 22500 | |
| }, | |
| { | |
| "entropy": 1.5796532726287842, | |
| "epoch": 1.0683153306803108, | |
| "grad_norm": 1.4131051301956177, | |
| "learning_rate": 5.2768582602535246e-05, | |
| "loss": 1.177, | |
| "mean_token_accuracy": 0.7511077529191971, | |
| "num_tokens": 554500484.0, | |
| "step": 22550 | |
| }, | |
| { | |
| "entropy": 1.6192417418956757, | |
| "epoch": 1.0706841008148569, | |
| "grad_norm": 1.2125214338302612, | |
| "learning_rate": 5.25621603042126e-05, | |
| "loss": 1.2202, | |
| "mean_token_accuracy": 0.7429804271459579, | |
| "num_tokens": 555724501.0, | |
| "step": 22600 | |
| }, | |
| { | |
| "entropy": 1.6070238423347474, | |
| "epoch": 1.0730528709494032, | |
| "grad_norm": 1.1681252717971802, | |
| "learning_rate": 5.235569421181103e-05, | |
| "loss": 1.1896, | |
| "mean_token_accuracy": 0.748240845799446, | |
| "num_tokens": 556951175.0, | |
| "step": 22650 | |
| }, | |
| { | |
| "entropy": 1.6153511393070221, | |
| "epoch": 1.0754216410839492, | |
| "grad_norm": 1.2313953638076782, | |
| "learning_rate": 5.21491878543807e-05, | |
| "loss": 1.1826, | |
| "mean_token_accuracy": 0.7484846365451813, | |
| "num_tokens": 558174509.0, | |
| "step": 22700 | |
| }, | |
| { | |
| "entropy": 1.6176446998119354, | |
| "epoch": 1.0777904112184953, | |
| "grad_norm": 1.202515959739685, | |
| "learning_rate": 5.194264476166006e-05, | |
| "loss": 1.2147, | |
| "mean_token_accuracy": 0.7431507217884064, | |
| "num_tokens": 559404128.0, | |
| "step": 22750 | |
| }, | |
| { | |
| "entropy": 1.6383704769611358, | |
| "epoch": 1.0801591813530416, | |
| "grad_norm": 1.1402473449707031, | |
| "learning_rate": 5.1736068464015463e-05, | |
| "loss": 1.2216, | |
| "mean_token_accuracy": 0.7412754154205322, | |
| "num_tokens": 560601861.0, | |
| "step": 22800 | |
| }, | |
| { | |
| "entropy": 1.6056468284130097, | |
| "epoch": 1.0825279514875876, | |
| "grad_norm": 0.9191189408302307, | |
| "learning_rate": 5.152946249238082e-05, | |
| "loss": 1.1687, | |
| "mean_token_accuracy": 0.751121336221695, | |
| "num_tokens": 561852684.0, | |
| "step": 22850 | |
| }, | |
| { | |
| "entropy": 1.5993254363536835, | |
| "epoch": 1.0848967216221337, | |
| "grad_norm": 1.460180640220642, | |
| "learning_rate": 5.132283037819723e-05, | |
| "loss": 1.2194, | |
| "mean_token_accuracy": 0.7445776867866516, | |
| "num_tokens": 563087919.0, | |
| "step": 22900 | |
| }, | |
| { | |
| "entropy": 1.6140161871910095, | |
| "epoch": 1.08726549175668, | |
| "grad_norm": 1.2920235395431519, | |
| "learning_rate": 5.111617565335264e-05, | |
| "loss": 1.2139, | |
| "mean_token_accuracy": 0.7439986896514893, | |
| "num_tokens": 564333108.0, | |
| "step": 22950 | |
| }, | |
| { | |
| "entropy": 1.599245457649231, | |
| "epoch": 1.089634261891226, | |
| "grad_norm": 1.0727440118789673, | |
| "learning_rate": 5.090950185012152e-05, | |
| "loss": 1.1895, | |
| "mean_token_accuracy": 0.7461957842111587, | |
| "num_tokens": 565584977.0, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.089634261891226, | |
| "eval_entropy": 1.0720901108034038, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7579072969698429, | |
| "eval_num_tokens": 565584977.0, | |
| "eval_runtime": 729.1236, | |
| "eval_samples_per_second": 34.034, | |
| "eval_steps_per_second": 4.254, | |
| "step": 23000 | |
| }, | |
| { | |
| "entropy": 1.5884887778759003, | |
| "epoch": 1.0920030320257723, | |
| "grad_norm": 1.2358678579330444, | |
| "learning_rate": 5.070281250110437e-05, | |
| "loss": 1.2144, | |
| "mean_token_accuracy": 0.7440959370136261, | |
| "num_tokens": 566830458.0, | |
| "step": 23050 | |
| }, | |
| { | |
| "entropy": 1.6139474022388458, | |
| "epoch": 1.0943718021603184, | |
| "grad_norm": 1.1043397188186646, | |
| "learning_rate": 5.049611113916745e-05, | |
| "loss": 1.2277, | |
| "mean_token_accuracy": 0.7419761300086976, | |
| "num_tokens": 568033795.0, | |
| "step": 23100 | |
| }, | |
| { | |
| "entropy": 1.5839227229356765, | |
| "epoch": 1.0967405722948644, | |
| "grad_norm": 1.267622470855713, | |
| "learning_rate": 5.028940129738234e-05, | |
| "loss": 1.1841, | |
| "mean_token_accuracy": 0.7496823114156723, | |
| "num_tokens": 569296146.0, | |
| "step": 23150 | |
| }, | |
| { | |
| "entropy": 1.5836478543281556, | |
| "epoch": 1.0991093424294107, | |
| "grad_norm": 1.3109345436096191, | |
| "learning_rate": 5.0082686508965594e-05, | |
| "loss": 1.2001, | |
| "mean_token_accuracy": 0.7444102185964584, | |
| "num_tokens": 570538060.0, | |
| "step": 23200 | |
| }, | |
| { | |
| "entropy": 1.6264153301715851, | |
| "epoch": 1.1014781125639568, | |
| "grad_norm": 1.166844129562378, | |
| "learning_rate": 4.987597030721826e-05, | |
| "loss": 1.2166, | |
| "mean_token_accuracy": 0.7437608361244201, | |
| "num_tokens": 571764299.0, | |
| "step": 23250 | |
| }, | |
| { | |
| "entropy": 1.593840502500534, | |
| "epoch": 1.1038468826985028, | |
| "grad_norm": 1.313543677330017, | |
| "learning_rate": 4.966925622546559e-05, | |
| "loss": 1.1976, | |
| "mean_token_accuracy": 0.7477673798799515, | |
| "num_tokens": 572986252.0, | |
| "step": 23300 | |
| }, | |
| { | |
| "entropy": 1.6218383753299712, | |
| "epoch": 1.1062156528330491, | |
| "grad_norm": 1.3079559803009033, | |
| "learning_rate": 4.9462547796996554e-05, | |
| "loss": 1.2085, | |
| "mean_token_accuracy": 0.7460601913928986, | |
| "num_tokens": 574203486.0, | |
| "step": 23350 | |
| }, | |
| { | |
| "entropy": 1.6025708365440368, | |
| "epoch": 1.1085844229675952, | |
| "grad_norm": 1.045305848121643, | |
| "learning_rate": 4.925584855500357e-05, | |
| "loss": 1.1834, | |
| "mean_token_accuracy": 0.7496994876861572, | |
| "num_tokens": 575431997.0, | |
| "step": 23400 | |
| }, | |
| { | |
| "entropy": 1.6023164546489717, | |
| "epoch": 1.1109531931021415, | |
| "grad_norm": 1.6260581016540527, | |
| "learning_rate": 4.904916203252196e-05, | |
| "loss": 1.1972, | |
| "mean_token_accuracy": 0.7476231580972672, | |
| "num_tokens": 576655765.0, | |
| "step": 23450 | |
| }, | |
| { | |
| "entropy": 1.6074532234668732, | |
| "epoch": 1.1133219632366875, | |
| "grad_norm": 1.1137516498565674, | |
| "learning_rate": 4.884249176236966e-05, | |
| "loss": 1.2031, | |
| "mean_token_accuracy": 0.7456535613536834, | |
| "num_tokens": 577896889.0, | |
| "step": 23500 | |
| }, | |
| { | |
| "entropy": 1.6064541089534758, | |
| "epoch": 1.1156907333712336, | |
| "grad_norm": 1.0753376483917236, | |
| "learning_rate": 4.8635841277086823e-05, | |
| "loss": 1.2093, | |
| "mean_token_accuracy": 0.7460182595252991, | |
| "num_tokens": 579123368.0, | |
| "step": 23550 | |
| }, | |
| { | |
| "entropy": 1.6451520609855652, | |
| "epoch": 1.1180595035057799, | |
| "grad_norm": 1.2830525636672974, | |
| "learning_rate": 4.842921410887541e-05, | |
| "loss": 1.2173, | |
| "mean_token_accuracy": 0.7460962778329849, | |
| "num_tokens": 580343576.0, | |
| "step": 23600 | |
| }, | |
| { | |
| "entropy": 1.594506859779358, | |
| "epoch": 1.120428273640326, | |
| "grad_norm": 1.2104618549346924, | |
| "learning_rate": 4.822261378953884e-05, | |
| "loss": 1.1846, | |
| "mean_token_accuracy": 0.7500998550653457, | |
| "num_tokens": 581571230.0, | |
| "step": 23650 | |
| }, | |
| { | |
| "entropy": 1.5977623069286346, | |
| "epoch": 1.122797043774872, | |
| "grad_norm": 1.0635625123977661, | |
| "learning_rate": 4.8016043850421614e-05, | |
| "loss": 1.2121, | |
| "mean_token_accuracy": 0.7432440650463105, | |
| "num_tokens": 582786589.0, | |
| "step": 23700 | |
| }, | |
| { | |
| "entropy": 1.6117420196533203, | |
| "epoch": 1.1251658139094183, | |
| "grad_norm": 1.2150670289993286, | |
| "learning_rate": 4.7809507822348967e-05, | |
| "loss": 1.1995, | |
| "mean_token_accuracy": 0.746940575838089, | |
| "num_tokens": 583979707.0, | |
| "step": 23750 | |
| }, | |
| { | |
| "entropy": 1.6140946924686432, | |
| "epoch": 1.1275345840439643, | |
| "grad_norm": 1.1496037244796753, | |
| "learning_rate": 4.7603009235566465e-05, | |
| "loss": 1.1965, | |
| "mean_token_accuracy": 0.7485955774784088, | |
| "num_tokens": 585198089.0, | |
| "step": 23800 | |
| }, | |
| { | |
| "entropy": 1.612507269382477, | |
| "epoch": 1.1299033541785106, | |
| "grad_norm": 1.1946005821228027, | |
| "learning_rate": 4.7396551619679735e-05, | |
| "loss": 1.1963, | |
| "mean_token_accuracy": 0.7465278053283692, | |
| "num_tokens": 586406915.0, | |
| "step": 23850 | |
| }, | |
| { | |
| "entropy": 1.614688711166382, | |
| "epoch": 1.1322721243130567, | |
| "grad_norm": 1.3998786211013794, | |
| "learning_rate": 4.719013850359412e-05, | |
| "loss": 1.202, | |
| "mean_token_accuracy": 0.7469007116556168, | |
| "num_tokens": 587625422.0, | |
| "step": 23900 | |
| }, | |
| { | |
| "entropy": 1.64091064453125, | |
| "epoch": 1.1346408944476027, | |
| "grad_norm": 1.1203569173812866, | |
| "learning_rate": 4.69837734154543e-05, | |
| "loss": 1.1882, | |
| "mean_token_accuracy": 0.7487949818372727, | |
| "num_tokens": 588838858.0, | |
| "step": 23950 | |
| }, | |
| { | |
| "entropy": 1.6168962919712067, | |
| "epoch": 1.137009664582149, | |
| "grad_norm": 1.4909604787826538, | |
| "learning_rate": 4.677745988258406e-05, | |
| "loss": 1.1948, | |
| "mean_token_accuracy": 0.7495903551578522, | |
| "num_tokens": 590081510.0, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.137009664582149, | |
| "eval_entropy": 1.0897794357236166, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7586307351547699, | |
| "eval_num_tokens": 590081510.0, | |
| "eval_runtime": 728.9602, | |
| "eval_samples_per_second": 34.042, | |
| "eval_steps_per_second": 4.255, | |
| "step": 24000 | |
| }, | |
| { | |
| "entropy": 1.6240591382980347, | |
| "epoch": 1.139378434716695, | |
| "grad_norm": 0.9998461604118347, | |
| "learning_rate": 4.657120143142597e-05, | |
| "loss": 1.1922, | |
| "mean_token_accuracy": 0.7492350846529007, | |
| "num_tokens": 591297702.0, | |
| "step": 24050 | |
| }, | |
| { | |
| "entropy": 1.655022679567337, | |
| "epoch": 1.1417472048512411, | |
| "grad_norm": 1.124312162399292, | |
| "learning_rate": 4.636500158748109e-05, | |
| "loss": 1.2067, | |
| "mean_token_accuracy": 0.7455829763412476, | |
| "num_tokens": 592508714.0, | |
| "step": 24100 | |
| }, | |
| { | |
| "entropy": 1.5910465133190155, | |
| "epoch": 1.1441159749857874, | |
| "grad_norm": 1.11435866355896, | |
| "learning_rate": 4.6158863875248734e-05, | |
| "loss": 1.1684, | |
| "mean_token_accuracy": 0.7527882248163223, | |
| "num_tokens": 593747081.0, | |
| "step": 24150 | |
| }, | |
| { | |
| "entropy": 1.6349923205375672, | |
| "epoch": 1.1464847451203335, | |
| "grad_norm": 1.070635437965393, | |
| "learning_rate": 4.595279181816624e-05, | |
| "loss": 1.1916, | |
| "mean_token_accuracy": 0.749586900472641, | |
| "num_tokens": 594943747.0, | |
| "step": 24200 | |
| }, | |
| { | |
| "entropy": 1.601109493970871, | |
| "epoch": 1.1488535152548796, | |
| "grad_norm": 1.1581242084503174, | |
| "learning_rate": 4.574678893854871e-05, | |
| "loss": 1.1818, | |
| "mean_token_accuracy": 0.7509790074825287, | |
| "num_tokens": 596175131.0, | |
| "step": 24250 | |
| }, | |
| { | |
| "entropy": 1.6024657559394837, | |
| "epoch": 1.1512222853894258, | |
| "grad_norm": 1.2476531267166138, | |
| "learning_rate": 4.554085875752879e-05, | |
| "loss": 1.1997, | |
| "mean_token_accuracy": 0.747204402089119, | |
| "num_tokens": 597415232.0, | |
| "step": 24300 | |
| }, | |
| { | |
| "entropy": 1.636058064699173, | |
| "epoch": 1.153591055523972, | |
| "grad_norm": 1.6576809883117676, | |
| "learning_rate": 4.533500479499661e-05, | |
| "loss": 1.248, | |
| "mean_token_accuracy": 0.7378575146198273, | |
| "num_tokens": 598627396.0, | |
| "step": 24350 | |
| }, | |
| { | |
| "entropy": 1.6401480340957642, | |
| "epoch": 1.1559598256585182, | |
| "grad_norm": 1.2274305820465088, | |
| "learning_rate": 4.512923056953941e-05, | |
| "loss": 1.2219, | |
| "mean_token_accuracy": 0.7444866347312927, | |
| "num_tokens": 599864565.0, | |
| "step": 24400 | |
| }, | |
| { | |
| "entropy": 1.5943049252033235, | |
| "epoch": 1.1583285957930642, | |
| "grad_norm": 1.2362310886383057, | |
| "learning_rate": 4.49235395983816e-05, | |
| "loss": 1.1675, | |
| "mean_token_accuracy": 0.7534567403793335, | |
| "num_tokens": 601090785.0, | |
| "step": 24450 | |
| }, | |
| { | |
| "entropy": 1.5798736822605133, | |
| "epoch": 1.1606973659276103, | |
| "grad_norm": 0.9551867842674255, | |
| "learning_rate": 4.4717935397324504e-05, | |
| "loss": 1.1633, | |
| "mean_token_accuracy": 0.7534276330471039, | |
| "num_tokens": 602347409.0, | |
| "step": 24500 | |
| }, | |
| { | |
| "entropy": 1.612923823595047, | |
| "epoch": 1.1630661360621566, | |
| "grad_norm": 1.167639970779419, | |
| "learning_rate": 4.4512421480686334e-05, | |
| "loss": 1.1752, | |
| "mean_token_accuracy": 0.7525352644920349, | |
| "num_tokens": 603557548.0, | |
| "step": 24550 | |
| }, | |
| { | |
| "entropy": 1.5877990233898163, | |
| "epoch": 1.1654349061967026, | |
| "grad_norm": 1.1369372606277466, | |
| "learning_rate": 4.430700136124209e-05, | |
| "loss": 1.1781, | |
| "mean_token_accuracy": 0.7510064965486527, | |
| "num_tokens": 604816361.0, | |
| "step": 24600 | |
| }, | |
| { | |
| "entropy": 1.5707013046741485, | |
| "epoch": 1.167803676331249, | |
| "grad_norm": 1.3217617273330688, | |
| "learning_rate": 4.410167855016356e-05, | |
| "loss": 1.1578, | |
| "mean_token_accuracy": 0.7544564688205719, | |
| "num_tokens": 606031595.0, | |
| "step": 24650 | |
| }, | |
| { | |
| "entropy": 1.6142506301403046, | |
| "epoch": 1.170172446465795, | |
| "grad_norm": 1.0833561420440674, | |
| "learning_rate": 4.3896456556959245e-05, | |
| "loss": 1.1882, | |
| "mean_token_accuracy": 0.7481600660085678, | |
| "num_tokens": 607260243.0, | |
| "step": 24700 | |
| }, | |
| { | |
| "entropy": 1.5962833178043365, | |
| "epoch": 1.172541216600341, | |
| "grad_norm": 1.2788193225860596, | |
| "learning_rate": 4.369133888941442e-05, | |
| "loss": 1.1685, | |
| "mean_token_accuracy": 0.7528700757026673, | |
| "num_tokens": 608463931.0, | |
| "step": 24750 | |
| }, | |
| { | |
| "entropy": 1.6402158641815185, | |
| "epoch": 1.1749099867348873, | |
| "grad_norm": 1.275524616241455, | |
| "learning_rate": 4.348632905353116e-05, | |
| "loss": 1.1968, | |
| "mean_token_accuracy": 0.7491856187582016, | |
| "num_tokens": 609655302.0, | |
| "step": 24800 | |
| }, | |
| { | |
| "entropy": 1.614987963438034, | |
| "epoch": 1.1772787568694334, | |
| "grad_norm": 0.945978581905365, | |
| "learning_rate": 4.32814305534684e-05, | |
| "loss": 1.209, | |
| "mean_token_accuracy": 0.7443074882030487, | |
| "num_tokens": 610898851.0, | |
| "step": 24850 | |
| }, | |
| { | |
| "entropy": 1.636513990163803, | |
| "epoch": 1.1796475270039795, | |
| "grad_norm": 1.0590687990188599, | |
| "learning_rate": 4.307664689148205e-05, | |
| "loss": 1.2299, | |
| "mean_token_accuracy": 0.7430419319868088, | |
| "num_tokens": 612116242.0, | |
| "step": 24900 | |
| }, | |
| { | |
| "entropy": 1.6237515592575074, | |
| "epoch": 1.1820162971385257, | |
| "grad_norm": 1.3774100542068481, | |
| "learning_rate": 4.287198156786516e-05, | |
| "loss": 1.1786, | |
| "mean_token_accuracy": 0.7511296081542969, | |
| "num_tokens": 613335390.0, | |
| "step": 24950 | |
| }, | |
| { | |
| "entropy": 1.596169695854187, | |
| "epoch": 1.1843850672730718, | |
| "grad_norm": 1.1338964700698853, | |
| "learning_rate": 4.2667438080888036e-05, | |
| "loss": 1.1616, | |
| "mean_token_accuracy": 0.7531165385246277, | |
| "num_tokens": 614572046.0, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.1843850672730718, | |
| "eval_entropy": 1.0971692777848259, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7585837088482061, | |
| "eval_num_tokens": 614572046.0, | |
| "eval_runtime": 728.9341, | |
| "eval_samples_per_second": 34.043, | |
| "eval_steps_per_second": 4.256, | |
| "step": 25000 | |
| }, | |
| { | |
| "entropy": 1.6357793831825256, | |
| "epoch": 1.1867538374076179, | |
| "grad_norm": 1.3235797882080078, | |
| "learning_rate": 4.24630199267385e-05, | |
| "loss": 1.2175, | |
| "mean_token_accuracy": 0.7434951883554458, | |
| "num_tokens": 615813138.0, | |
| "step": 25050 | |
| }, | |
| { | |
| "entropy": 1.610986716747284, | |
| "epoch": 1.1891226075421641, | |
| "grad_norm": 0.9810039401054382, | |
| "learning_rate": 4.225873059946206e-05, | |
| "loss": 1.183, | |
| "mean_token_accuracy": 0.7497907614707947, | |
| "num_tokens": 617052724.0, | |
| "step": 25100 | |
| }, | |
| { | |
| "entropy": 1.5992292380332946, | |
| "epoch": 1.1914913776767102, | |
| "grad_norm": 1.3308528661727905, | |
| "learning_rate": 4.2054573590902295e-05, | |
| "loss": 1.1455, | |
| "mean_token_accuracy": 0.7566713351011276, | |
| "num_tokens": 618273598.0, | |
| "step": 25150 | |
| }, | |
| { | |
| "entropy": 1.6277151501178742, | |
| "epoch": 1.1938601478112565, | |
| "grad_norm": 1.3727302551269531, | |
| "learning_rate": 4.1850552390641076e-05, | |
| "loss": 1.2243, | |
| "mean_token_accuracy": 0.7431273967027664, | |
| "num_tokens": 619483045.0, | |
| "step": 25200 | |
| }, | |
| { | |
| "entropy": 1.622536163330078, | |
| "epoch": 1.1962289179458026, | |
| "grad_norm": 1.319136142730713, | |
| "learning_rate": 4.164667048593892e-05, | |
| "loss": 1.1947, | |
| "mean_token_accuracy": 0.7481009513139725, | |
| "num_tokens": 620705194.0, | |
| "step": 25250 | |
| }, | |
| { | |
| "entropy": 1.6237769031524658, | |
| "epoch": 1.1985976880803486, | |
| "grad_norm": 1.2181921005249023, | |
| "learning_rate": 4.144293136167549e-05, | |
| "loss": 1.1737, | |
| "mean_token_accuracy": 0.7511861574649811, | |
| "num_tokens": 621924964.0, | |
| "step": 25300 | |
| }, | |
| { | |
| "entropy": 1.6047194039821624, | |
| "epoch": 1.200966458214895, | |
| "grad_norm": 1.2977948188781738, | |
| "learning_rate": 4.123933850028991e-05, | |
| "loss": 1.2143, | |
| "mean_token_accuracy": 0.7442529916763305, | |
| "num_tokens": 623143777.0, | |
| "step": 25350 | |
| }, | |
| { | |
| "entropy": 1.6068167972564698, | |
| "epoch": 1.203335228349441, | |
| "grad_norm": 1.313650369644165, | |
| "learning_rate": 4.103589538172127e-05, | |
| "loss": 1.2124, | |
| "mean_token_accuracy": 0.7447144162654876, | |
| "num_tokens": 624380253.0, | |
| "step": 25400 | |
| }, | |
| { | |
| "entropy": 1.5983986258506775, | |
| "epoch": 1.2057039984839872, | |
| "grad_norm": 1.6129273176193237, | |
| "learning_rate": 4.0832605483349193e-05, | |
| "loss": 1.1634, | |
| "mean_token_accuracy": 0.7545448428392411, | |
| "num_tokens": 625600084.0, | |
| "step": 25450 | |
| }, | |
| { | |
| "entropy": 1.6317675995826721, | |
| "epoch": 1.2080727686185333, | |
| "grad_norm": 1.2153606414794922, | |
| "learning_rate": 4.062947227993433e-05, | |
| "loss": 1.1998, | |
| "mean_token_accuracy": 0.7479275733232498, | |
| "num_tokens": 626816439.0, | |
| "step": 25500 | |
| }, | |
| { | |
| "entropy": 1.591133669614792, | |
| "epoch": 1.2104415387530794, | |
| "grad_norm": 1.3711997270584106, | |
| "learning_rate": 4.042649924355905e-05, | |
| "loss": 1.1747, | |
| "mean_token_accuracy": 0.7510749793052673, | |
| "num_tokens": 628060275.0, | |
| "step": 25550 | |
| }, | |
| { | |
| "entropy": 1.616237759590149, | |
| "epoch": 1.2128103088876254, | |
| "grad_norm": 1.317814588546753, | |
| "learning_rate": 4.022368984356801e-05, | |
| "loss": 1.1964, | |
| "mean_token_accuracy": 0.7473501098155976, | |
| "num_tokens": 629291699.0, | |
| "step": 25600 | |
| }, | |
| { | |
| "entropy": 1.618736606836319, | |
| "epoch": 1.2151790790221717, | |
| "grad_norm": 1.1058121919631958, | |
| "learning_rate": 4.002104754650887e-05, | |
| "loss": 1.2022, | |
| "mean_token_accuracy": 0.74667718231678, | |
| "num_tokens": 630538034.0, | |
| "step": 25650 | |
| }, | |
| { | |
| "entropy": 1.6311498081684113, | |
| "epoch": 1.2175478491567178, | |
| "grad_norm": 1.0992521047592163, | |
| "learning_rate": 3.981857581607313e-05, | |
| "loss": 1.1851, | |
| "mean_token_accuracy": 0.7504255121946335, | |
| "num_tokens": 631771489.0, | |
| "step": 25700 | |
| }, | |
| { | |
| "entropy": 1.6083201706409453, | |
| "epoch": 1.219916619291264, | |
| "grad_norm": 1.2340748310089111, | |
| "learning_rate": 3.9616278113036786e-05, | |
| "loss": 1.1595, | |
| "mean_token_accuracy": 0.7537871873378754, | |
| "num_tokens": 632996983.0, | |
| "step": 25750 | |
| }, | |
| { | |
| "entropy": 1.6100276720523834, | |
| "epoch": 1.22228538942581, | |
| "grad_norm": 1.2286880016326904, | |
| "learning_rate": 3.9414157895201273e-05, | |
| "loss": 1.2196, | |
| "mean_token_accuracy": 0.7460716181993484, | |
| "num_tokens": 634207237.0, | |
| "step": 25800 | |
| }, | |
| { | |
| "entropy": 1.6090706491470337, | |
| "epoch": 1.2246541595603562, | |
| "grad_norm": 1.2269102334976196, | |
| "learning_rate": 3.9212218617334356e-05, | |
| "loss": 1.19, | |
| "mean_token_accuracy": 0.7494009816646576, | |
| "num_tokens": 635435267.0, | |
| "step": 25850 | |
| }, | |
| { | |
| "entropy": 1.5722477328777313, | |
| "epoch": 1.2270229296949025, | |
| "grad_norm": 1.0874273777008057, | |
| "learning_rate": 3.901046373111103e-05, | |
| "loss": 1.1665, | |
| "mean_token_accuracy": 0.7549541050195694, | |
| "num_tokens": 636668958.0, | |
| "step": 25900 | |
| }, | |
| { | |
| "entropy": 1.6207891261577607, | |
| "epoch": 1.2293916998294485, | |
| "grad_norm": 1.4808669090270996, | |
| "learning_rate": 3.880889668505455e-05, | |
| "loss": 1.2441, | |
| "mean_token_accuracy": 0.7397470092773437, | |
| "num_tokens": 637878020.0, | |
| "step": 25950 | |
| }, | |
| { | |
| "entropy": 1.5923774099349977, | |
| "epoch": 1.2317604699639948, | |
| "grad_norm": 1.1652626991271973, | |
| "learning_rate": 3.860752092447749e-05, | |
| "loss": 1.1818, | |
| "mean_token_accuracy": 0.7510163110494613, | |
| "num_tokens": 639111025.0, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.2317604699639948, | |
| "eval_entropy": 1.0619611897482402, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7597063543775787, | |
| "eval_num_tokens": 639111025.0, | |
| "eval_runtime": 728.2888, | |
| "eval_samples_per_second": 34.073, | |
| "eval_steps_per_second": 4.259, | |
| "step": 26000 | |
| }, | |
| { | |
| "entropy": 1.6090249001979828, | |
| "epoch": 1.2341292400985409, | |
| "grad_norm": 1.140541434288025, | |
| "learning_rate": 3.840633989142289e-05, | |
| "loss": 1.2208, | |
| "mean_token_accuracy": 0.7445061576366424, | |
| "num_tokens": 640330754.0, | |
| "step": 26050 | |
| }, | |
| { | |
| "entropy": 1.5675190496444702, | |
| "epoch": 1.236498010233087, | |
| "grad_norm": 1.1996732950210571, | |
| "learning_rate": 3.820535702460533e-05, | |
| "loss": 1.1648, | |
| "mean_token_accuracy": 0.7530361658334732, | |
| "num_tokens": 641577483.0, | |
| "step": 26100 | |
| }, | |
| { | |
| "entropy": 1.5749832487106323, | |
| "epoch": 1.2388667803676332, | |
| "grad_norm": 1.3973077535629272, | |
| "learning_rate": 3.800457575935222e-05, | |
| "loss": 1.172, | |
| "mean_token_accuracy": 0.7515737456083298, | |
| "num_tokens": 642810578.0, | |
| "step": 26150 | |
| }, | |
| { | |
| "entropy": 1.5630564618110656, | |
| "epoch": 1.2412355505021793, | |
| "grad_norm": 1.0359597206115723, | |
| "learning_rate": 3.780399952754507e-05, | |
| "loss": 1.1647, | |
| "mean_token_accuracy": 0.7527536135911942, | |
| "num_tokens": 644066110.0, | |
| "step": 26200 | |
| }, | |
| { | |
| "entropy": 1.592404429912567, | |
| "epoch": 1.2436043206367253, | |
| "grad_norm": 1.076479434967041, | |
| "learning_rate": 3.7603631757560855e-05, | |
| "loss": 1.1732, | |
| "mean_token_accuracy": 0.7527641028165817, | |
| "num_tokens": 645301566.0, | |
| "step": 26250 | |
| }, | |
| { | |
| "entropy": 1.6117449808120727, | |
| "epoch": 1.2459730907712716, | |
| "grad_norm": 1.218085527420044, | |
| "learning_rate": 3.7403475874213354e-05, | |
| "loss": 1.2315, | |
| "mean_token_accuracy": 0.7417248862981797, | |
| "num_tokens": 646546442.0, | |
| "step": 26300 | |
| }, | |
| { | |
| "entropy": 1.5983488774299621, | |
| "epoch": 1.2483418609058177, | |
| "grad_norm": 1.2915470600128174, | |
| "learning_rate": 3.7203535298694656e-05, | |
| "loss": 1.2024, | |
| "mean_token_accuracy": 0.7482427370548248, | |
| "num_tokens": 647787345.0, | |
| "step": 26350 | |
| }, | |
| { | |
| "entropy": 1.5954162907600402, | |
| "epoch": 1.2507106310403637, | |
| "grad_norm": 1.0252054929733276, | |
| "learning_rate": 3.700381344851665e-05, | |
| "loss": 1.1757, | |
| "mean_token_accuracy": 0.752863358259201, | |
| "num_tokens": 649019906.0, | |
| "step": 26400 | |
| }, | |
| { | |
| "entropy": 1.5740117967128753, | |
| "epoch": 1.25307940117491, | |
| "grad_norm": 1.2225929498672485, | |
| "learning_rate": 3.6804313737452686e-05, | |
| "loss": 1.1731, | |
| "mean_token_accuracy": 0.7517155534029007, | |
| "num_tokens": 650242573.0, | |
| "step": 26450 | |
| }, | |
| { | |
| "entropy": 1.5935308575630187, | |
| "epoch": 1.255448171309456, | |
| "grad_norm": 1.3027613162994385, | |
| "learning_rate": 3.66050395754791e-05, | |
| "loss": 1.1661, | |
| "mean_token_accuracy": 0.7530785751342773, | |
| "num_tokens": 651477593.0, | |
| "step": 26500 | |
| }, | |
| { | |
| "entropy": 1.5640760624408723, | |
| "epoch": 1.2578169414440024, | |
| "grad_norm": 0.9961079955101013, | |
| "learning_rate": 3.6405994368717054e-05, | |
| "loss": 1.1706, | |
| "mean_token_accuracy": 0.7543714487552643, | |
| "num_tokens": 652736186.0, | |
| "step": 26550 | |
| }, | |
| { | |
| "entropy": 1.6221139824390411, | |
| "epoch": 1.2601857115785484, | |
| "grad_norm": 1.6644654273986816, | |
| "learning_rate": 3.620718151937425e-05, | |
| "loss": 1.1881, | |
| "mean_token_accuracy": 0.7484775596857071, | |
| "num_tokens": 653945306.0, | |
| "step": 26600 | |
| }, | |
| { | |
| "entropy": 1.627434605360031, | |
| "epoch": 1.2625544817130945, | |
| "grad_norm": 1.1984070539474487, | |
| "learning_rate": 3.6008604425686766e-05, | |
| "loss": 1.2087, | |
| "mean_token_accuracy": 0.744699953198433, | |
| "num_tokens": 655163994.0, | |
| "step": 26650 | |
| }, | |
| { | |
| "entropy": 1.595815200805664, | |
| "epoch": 1.2649232518476408, | |
| "grad_norm": 0.942965030670166, | |
| "learning_rate": 3.581026648186101e-05, | |
| "loss": 1.2047, | |
| "mean_token_accuracy": 0.7466594022512436, | |
| "num_tokens": 656389078.0, | |
| "step": 26700 | |
| }, | |
| { | |
| "entropy": 1.5596436941623688, | |
| "epoch": 1.2672920219821868, | |
| "grad_norm": 1.0333982706069946, | |
| "learning_rate": 3.561217107801568e-05, | |
| "loss": 1.1366, | |
| "mean_token_accuracy": 0.7599540430307389, | |
| "num_tokens": 657628840.0, | |
| "step": 26750 | |
| }, | |
| { | |
| "entropy": 1.582226196527481, | |
| "epoch": 1.269660792116733, | |
| "grad_norm": 1.3895862102508545, | |
| "learning_rate": 3.5414321600123854e-05, | |
| "loss": 1.1594, | |
| "mean_token_accuracy": 0.7542477381229401, | |
| "num_tokens": 658863710.0, | |
| "step": 26800 | |
| }, | |
| { | |
| "entropy": 1.6000851714611053, | |
| "epoch": 1.2720295622512792, | |
| "grad_norm": 1.2000585794448853, | |
| "learning_rate": 3.521672142995506e-05, | |
| "loss": 1.1862, | |
| "mean_token_accuracy": 0.7507990497350693, | |
| "num_tokens": 660068012.0, | |
| "step": 26850 | |
| }, | |
| { | |
| "entropy": 1.6038016283512115, | |
| "epoch": 1.2743983323858252, | |
| "grad_norm": 1.0799274444580078, | |
| "learning_rate": 3.501937394501747e-05, | |
| "loss": 1.1911, | |
| "mean_token_accuracy": 0.7496211153268814, | |
| "num_tokens": 661305265.0, | |
| "step": 26900 | |
| }, | |
| { | |
| "entropy": 1.6001941812038423, | |
| "epoch": 1.2767671025203713, | |
| "grad_norm": 1.0266954898834229, | |
| "learning_rate": 3.4822282518500286e-05, | |
| "loss": 1.1319, | |
| "mean_token_accuracy": 0.7590432322025299, | |
| "num_tokens": 662525430.0, | |
| "step": 26950 | |
| }, | |
| { | |
| "entropy": 1.6020025527477264, | |
| "epoch": 1.2791358726549176, | |
| "grad_norm": 1.4219437837600708, | |
| "learning_rate": 3.4625450519215915e-05, | |
| "loss": 1.1896, | |
| "mean_token_accuracy": 0.7499201774597168, | |
| "num_tokens": 663708332.0, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.2791358726549176, | |
| "eval_entropy": 1.0667552875749993, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7588769892467213, | |
| "eval_num_tokens": 663708332.0, | |
| "eval_runtime": 730.6419, | |
| "eval_samples_per_second": 33.963, | |
| "eval_steps_per_second": 4.246, | |
| "step": 27000 | |
| }, | |
| { | |
| "entropy": 1.5845714378356934, | |
| "epoch": 1.2815046427894636, | |
| "grad_norm": 1.2918035984039307, | |
| "learning_rate": 3.4428881311542485e-05, | |
| "loss": 1.2018, | |
| "mean_token_accuracy": 0.7470276898145676, | |
| "num_tokens": 664918181.0, | |
| "step": 27050 | |
| }, | |
| { | |
| "entropy": 1.567339129447937, | |
| "epoch": 1.28387341292401, | |
| "grad_norm": 1.3722400665283203, | |
| "learning_rate": 3.423257825536637e-05, | |
| "loss": 1.1469, | |
| "mean_token_accuracy": 0.7578269052505493, | |
| "num_tokens": 666127940.0, | |
| "step": 27100 | |
| }, | |
| { | |
| "entropy": 1.5779772651195527, | |
| "epoch": 1.286242183058556, | |
| "grad_norm": 1.2678896188735962, | |
| "learning_rate": 3.403654470602463e-05, | |
| "loss": 1.2057, | |
| "mean_token_accuracy": 0.7468883281946183, | |
| "num_tokens": 667363858.0, | |
| "step": 27150 | |
| }, | |
| { | |
| "entropy": 1.6206267583370209, | |
| "epoch": 1.288610953193102, | |
| "grad_norm": 1.415664553642273, | |
| "learning_rate": 3.3840784014247825e-05, | |
| "loss": 1.1709, | |
| "mean_token_accuracy": 0.7533509171009064, | |
| "num_tokens": 668586984.0, | |
| "step": 27200 | |
| }, | |
| { | |
| "entropy": 1.58282252907753, | |
| "epoch": 1.2909797233276483, | |
| "grad_norm": 1.0562044382095337, | |
| "learning_rate": 3.3645299526102625e-05, | |
| "loss": 1.1525, | |
| "mean_token_accuracy": 0.7587102675437927, | |
| "num_tokens": 669848008.0, | |
| "step": 27250 | |
| }, | |
| { | |
| "entropy": 1.6004004609584808, | |
| "epoch": 1.2933484934621944, | |
| "grad_norm": 1.3321800231933594, | |
| "learning_rate": 3.3450094582934624e-05, | |
| "loss": 1.168, | |
| "mean_token_accuracy": 0.7531924885511398, | |
| "num_tokens": 671055921.0, | |
| "step": 27300 | |
| }, | |
| { | |
| "entropy": 1.6070753967761993, | |
| "epoch": 1.2957172635967407, | |
| "grad_norm": 1.1480814218521118, | |
| "learning_rate": 3.3255172521311296e-05, | |
| "loss": 1.1957, | |
| "mean_token_accuracy": 0.7474820953607559, | |
| "num_tokens": 672291814.0, | |
| "step": 27350 | |
| }, | |
| { | |
| "entropy": 1.618386241197586, | |
| "epoch": 1.2980860337312867, | |
| "grad_norm": 1.3218634128570557, | |
| "learning_rate": 3.306053667296491e-05, | |
| "loss": 1.1813, | |
| "mean_token_accuracy": 0.749809256196022, | |
| "num_tokens": 673529686.0, | |
| "step": 27400 | |
| }, | |
| { | |
| "entropy": 1.5894237875938415, | |
| "epoch": 1.3004548038658328, | |
| "grad_norm": 1.2133702039718628, | |
| "learning_rate": 3.286619036473557e-05, | |
| "loss": 1.1527, | |
| "mean_token_accuracy": 0.7563208711147308, | |
| "num_tokens": 674737012.0, | |
| "step": 27450 | |
| }, | |
| { | |
| "entropy": 1.5680401778221131, | |
| "epoch": 1.302823574000379, | |
| "grad_norm": 1.3504135608673096, | |
| "learning_rate": 3.267213691851443e-05, | |
| "loss": 1.1453, | |
| "mean_token_accuracy": 0.7576598930358887, | |
| "num_tokens": 676016669.0, | |
| "step": 27500 | |
| }, | |
| { | |
| "entropy": 1.5564317107200623, | |
| "epoch": 1.3051923441349251, | |
| "grad_norm": 1.2370836734771729, | |
| "learning_rate": 3.2478379651186814e-05, | |
| "loss": 1.151, | |
| "mean_token_accuracy": 0.7560758543014526, | |
| "num_tokens": 677240518.0, | |
| "step": 27550 | |
| }, | |
| { | |
| "entropy": 1.5811952316761018, | |
| "epoch": 1.3075611142694714, | |
| "grad_norm": 1.161582589149475, | |
| "learning_rate": 3.228492187457557e-05, | |
| "loss": 1.1623, | |
| "mean_token_accuracy": 0.7540548771619797, | |
| "num_tokens": 678477906.0, | |
| "step": 27600 | |
| }, | |
| { | |
| "entropy": 1.616460270881653, | |
| "epoch": 1.3099298844040175, | |
| "grad_norm": 1.2357761859893799, | |
| "learning_rate": 3.209176689538448e-05, | |
| "loss": 1.203, | |
| "mean_token_accuracy": 0.7480911284685134, | |
| "num_tokens": 679717449.0, | |
| "step": 27650 | |
| }, | |
| { | |
| "entropy": 1.6137902176380157, | |
| "epoch": 1.3122986545385635, | |
| "grad_norm": 1.1097781658172607, | |
| "learning_rate": 3.189891801514171e-05, | |
| "loss": 1.1877, | |
| "mean_token_accuracy": 0.7503223437070846, | |
| "num_tokens": 680910674.0, | |
| "step": 27700 | |
| }, | |
| { | |
| "entropy": 1.6009632289409637, | |
| "epoch": 1.3146674246731096, | |
| "grad_norm": 1.260872721672058, | |
| "learning_rate": 3.1706378530143385e-05, | |
| "loss": 1.1725, | |
| "mean_token_accuracy": 0.7530629223585129, | |
| "num_tokens": 682144950.0, | |
| "step": 27750 | |
| }, | |
| { | |
| "entropy": 1.6063115882873535, | |
| "epoch": 1.3170361948076559, | |
| "grad_norm": 1.1645578145980835, | |
| "learning_rate": 3.1514151731397246e-05, | |
| "loss": 1.1647, | |
| "mean_token_accuracy": 0.753446283340454, | |
| "num_tokens": 683390865.0, | |
| "step": 27800 | |
| }, | |
| { | |
| "entropy": 1.6270235812664031, | |
| "epoch": 1.319404964942202, | |
| "grad_norm": 1.2020407915115356, | |
| "learning_rate": 3.1322240904566426e-05, | |
| "loss": 1.1735, | |
| "mean_token_accuracy": 0.7529788100719452, | |
| "num_tokens": 684605889.0, | |
| "step": 27850 | |
| }, | |
| { | |
| "entropy": 1.607979006767273, | |
| "epoch": 1.3217737350767482, | |
| "grad_norm": 1.137190580368042, | |
| "learning_rate": 3.1130649329913225e-05, | |
| "loss": 1.2056, | |
| "mean_token_accuracy": 0.7471660190820694, | |
| "num_tokens": 685842856.0, | |
| "step": 27900 | |
| }, | |
| { | |
| "entropy": 1.6045704185962677, | |
| "epoch": 1.3241425052112943, | |
| "grad_norm": 1.21959388256073, | |
| "learning_rate": 3.09393802822431e-05, | |
| "loss": 1.1506, | |
| "mean_token_accuracy": 0.7561245012283325, | |
| "num_tokens": 687059905.0, | |
| "step": 27950 | |
| }, | |
| { | |
| "entropy": 1.6008513212203979, | |
| "epoch": 1.3265112753458403, | |
| "grad_norm": 0.969918429851532, | |
| "learning_rate": 3.074843703084869e-05, | |
| "loss": 1.1717, | |
| "mean_token_accuracy": 0.7522702825069427, | |
| "num_tokens": 688293184.0, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.3265112753458403, | |
| "eval_entropy": 1.1074502345902315, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7599941444139493, | |
| "eval_num_tokens": 688293184.0, | |
| "eval_runtime": 728.3714, | |
| "eval_samples_per_second": 34.069, | |
| "eval_steps_per_second": 4.259, | |
| "step": 28000 | |
| }, | |
| { | |
| "entropy": 1.613970617055893, | |
| "epoch": 1.3288800454803866, | |
| "grad_norm": 1.2240134477615356, | |
| "learning_rate": 3.0557822839453874e-05, | |
| "loss": 1.1618, | |
| "mean_token_accuracy": 0.7536975979804993, | |
| "num_tokens": 689517314.0, | |
| "step": 28050 | |
| }, | |
| { | |
| "entropy": 1.5815585339069367, | |
| "epoch": 1.3312488156149327, | |
| "grad_norm": 1.1150712966918945, | |
| "learning_rate": 3.036754096615807e-05, | |
| "loss": 1.1704, | |
| "mean_token_accuracy": 0.753908543586731, | |
| "num_tokens": 690755165.0, | |
| "step": 28100 | |
| }, | |
| { | |
| "entropy": 1.5578910648822784, | |
| "epoch": 1.333617585749479, | |
| "grad_norm": 1.2640388011932373, | |
| "learning_rate": 3.017759466338046e-05, | |
| "loss": 1.1623, | |
| "mean_token_accuracy": 0.7535911196470261, | |
| "num_tokens": 692007773.0, | |
| "step": 28150 | |
| }, | |
| { | |
| "entropy": 1.5919007360935211, | |
| "epoch": 1.335986355884025, | |
| "grad_norm": 1.1327555179595947, | |
| "learning_rate": 2.9987987177804494e-05, | |
| "loss": 1.1729, | |
| "mean_token_accuracy": 0.7511158144474029, | |
| "num_tokens": 693232662.0, | |
| "step": 28200 | |
| }, | |
| { | |
| "entropy": 1.6085839200019836, | |
| "epoch": 1.338355126018571, | |
| "grad_norm": 0.9447433352470398, | |
| "learning_rate": 2.979872175032231e-05, | |
| "loss": 1.1558, | |
| "mean_token_accuracy": 0.7552832061052323, | |
| "num_tokens": 694432399.0, | |
| "step": 28250 | |
| }, | |
| { | |
| "entropy": 1.6203950083255767, | |
| "epoch": 1.3407238961531174, | |
| "grad_norm": 1.1614621877670288, | |
| "learning_rate": 2.960980161597936e-05, | |
| "loss": 1.1892, | |
| "mean_token_accuracy": 0.7479177683591842, | |
| "num_tokens": 695664920.0, | |
| "step": 28300 | |
| }, | |
| { | |
| "entropy": 1.588459266424179, | |
| "epoch": 1.3430926662876634, | |
| "grad_norm": 1.6014941930770874, | |
| "learning_rate": 2.9421230003919155e-05, | |
| "loss": 1.1604, | |
| "mean_token_accuracy": 0.75300128698349, | |
| "num_tokens": 696895792.0, | |
| "step": 28350 | |
| }, | |
| { | |
| "entropy": 1.6118758118152618, | |
| "epoch": 1.3454614364222097, | |
| "grad_norm": 1.1947180032730103, | |
| "learning_rate": 2.923301013732799e-05, | |
| "loss": 1.1825, | |
| "mean_token_accuracy": 0.7502673131227493, | |
| "num_tokens": 698079475.0, | |
| "step": 28400 | |
| }, | |
| { | |
| "entropy": 1.6115264117717742, | |
| "epoch": 1.3478302065567558, | |
| "grad_norm": 1.2438665628433228, | |
| "learning_rate": 2.9045145233379976e-05, | |
| "loss": 1.2001, | |
| "mean_token_accuracy": 0.7489022916555405, | |
| "num_tokens": 699305883.0, | |
| "step": 28450 | |
| }, | |
| { | |
| "entropy": 1.5930208683013916, | |
| "epoch": 1.3501989766913018, | |
| "grad_norm": 1.2472587823867798, | |
| "learning_rate": 2.885763850318193e-05, | |
| "loss": 1.1455, | |
| "mean_token_accuracy": 0.7588497418165207, | |
| "num_tokens": 700517157.0, | |
| "step": 28500 | |
| }, | |
| { | |
| "entropy": 1.611283905506134, | |
| "epoch": 1.352567746825848, | |
| "grad_norm": 1.1896998882293701, | |
| "learning_rate": 2.8670493151718526e-05, | |
| "loss": 1.2069, | |
| "mean_token_accuracy": 0.7471307969093323, | |
| "num_tokens": 701725293.0, | |
| "step": 28550 | |
| }, | |
| { | |
| "entropy": 1.5695743489265441, | |
| "epoch": 1.3549365169603942, | |
| "grad_norm": 1.1057043075561523, | |
| "learning_rate": 2.8483712377797544e-05, | |
| "loss": 1.1538, | |
| "mean_token_accuracy": 0.7563241708278656, | |
| "num_tokens": 702969110.0, | |
| "step": 28600 | |
| }, | |
| { | |
| "entropy": 1.5800132751464844, | |
| "epoch": 1.3573052870949402, | |
| "grad_norm": 1.1600664854049683, | |
| "learning_rate": 2.829729937399515e-05, | |
| "loss": 1.1533, | |
| "mean_token_accuracy": 0.7582338035106659, | |
| "num_tokens": 704225571.0, | |
| "step": 28650 | |
| }, | |
| { | |
| "entropy": 1.6222402799129485, | |
| "epoch": 1.3596740572294865, | |
| "grad_norm": 0.993548572063446, | |
| "learning_rate": 2.8111257326601402e-05, | |
| "loss": 1.2294, | |
| "mean_token_accuracy": 0.742488032579422, | |
| "num_tokens": 705467457.0, | |
| "step": 28700 | |
| }, | |
| { | |
| "entropy": 1.568291175365448, | |
| "epoch": 1.3620428273640326, | |
| "grad_norm": 1.0379763841629028, | |
| "learning_rate": 2.7925589415565666e-05, | |
| "loss": 1.1593, | |
| "mean_token_accuracy": 0.7555217838287354, | |
| "num_tokens": 706689479.0, | |
| "step": 28750 | |
| }, | |
| { | |
| "entropy": 1.6043275892734528, | |
| "epoch": 1.3644115974985787, | |
| "grad_norm": 1.43356454372406, | |
| "learning_rate": 2.774029881444238e-05, | |
| "loss": 1.2127, | |
| "mean_token_accuracy": 0.7451710641384125, | |
| "num_tokens": 707935708.0, | |
| "step": 28800 | |
| }, | |
| { | |
| "entropy": 1.5973702204227447, | |
| "epoch": 1.366780367633125, | |
| "grad_norm": 1.2377339601516724, | |
| "learning_rate": 2.7555388690336725e-05, | |
| "loss": 1.163, | |
| "mean_token_accuracy": 0.7523965907096862, | |
| "num_tokens": 709186556.0, | |
| "step": 28850 | |
| }, | |
| { | |
| "entropy": 1.5913502633571626, | |
| "epoch": 1.369149137767671, | |
| "grad_norm": 1.2148689031600952, | |
| "learning_rate": 2.737086220385055e-05, | |
| "loss": 1.128, | |
| "mean_token_accuracy": 0.7610643255710602, | |
| "num_tokens": 710387868.0, | |
| "step": 28900 | |
| }, | |
| { | |
| "entropy": 1.629042412042618, | |
| "epoch": 1.3715179079022173, | |
| "grad_norm": 1.3843477964401245, | |
| "learning_rate": 2.7186722509028294e-05, | |
| "loss": 1.1943, | |
| "mean_token_accuracy": 0.7486888426542282, | |
| "num_tokens": 711599301.0, | |
| "step": 28950 | |
| }, | |
| { | |
| "entropy": 1.5498824548721313, | |
| "epoch": 1.3738866780367633, | |
| "grad_norm": 1.2801408767700195, | |
| "learning_rate": 2.7002972753303167e-05, | |
| "loss": 1.1466, | |
| "mean_token_accuracy": 0.7573213475942612, | |
| "num_tokens": 712853561.0, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.3738866780367633, | |
| "eval_entropy": 1.08869860577322, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7604253247985526, | |
| "eval_num_tokens": 712853561.0, | |
| "eval_runtime": 728.768, | |
| "eval_samples_per_second": 34.051, | |
| "eval_steps_per_second": 4.256, | |
| "step": 29000 | |
| }, | |
| { | |
| "entropy": 1.5806612002849578, | |
| "epoch": 1.3762554481713094, | |
| "grad_norm": 1.1750032901763916, | |
| "learning_rate": 2.6819616077443243e-05, | |
| "loss": 1.1608, | |
| "mean_token_accuracy": 0.754431728720665, | |
| "num_tokens": 714067279.0, | |
| "step": 29050 | |
| }, | |
| { | |
| "entropy": 1.5694472527503966, | |
| "epoch": 1.3786242183058555, | |
| "grad_norm": 1.0282950401306152, | |
| "learning_rate": 2.6636655615497808e-05, | |
| "loss": 1.1345, | |
| "mean_token_accuracy": 0.7596888369321824, | |
| "num_tokens": 715313924.0, | |
| "step": 29100 | |
| }, | |
| { | |
| "entropy": 1.5819503486156463, | |
| "epoch": 1.3809929884404017, | |
| "grad_norm": 1.0335078239440918, | |
| "learning_rate": 2.6454094494743865e-05, | |
| "loss": 1.172, | |
| "mean_token_accuracy": 0.7516703462600708, | |
| "num_tokens": 716539594.0, | |
| "step": 29150 | |
| }, | |
| { | |
| "entropy": 1.5596589314937592, | |
| "epoch": 1.3833617585749478, | |
| "grad_norm": 1.1024916172027588, | |
| "learning_rate": 2.627193583563259e-05, | |
| "loss": 1.1515, | |
| "mean_token_accuracy": 0.7565756964683533, | |
| "num_tokens": 717779064.0, | |
| "step": 29200 | |
| }, | |
| { | |
| "entropy": 1.5737035143375397, | |
| "epoch": 1.385730528709494, | |
| "grad_norm": 1.3154332637786865, | |
| "learning_rate": 2.609018275173601e-05, | |
| "loss": 1.1313, | |
| "mean_token_accuracy": 0.7610451829433441, | |
| "num_tokens": 719013380.0, | |
| "step": 29250 | |
| }, | |
| { | |
| "entropy": 1.5879342305660247, | |
| "epoch": 1.3880992988440402, | |
| "grad_norm": 1.1045840978622437, | |
| "learning_rate": 2.590883834969383e-05, | |
| "loss": 1.1607, | |
| "mean_token_accuracy": 0.753515048623085, | |
| "num_tokens": 720213990.0, | |
| "step": 29300 | |
| }, | |
| { | |
| "entropy": 1.5969963049888611, | |
| "epoch": 1.3904680689785862, | |
| "grad_norm": 1.360352635383606, | |
| "learning_rate": 2.5727905729160274e-05, | |
| "loss": 1.2105, | |
| "mean_token_accuracy": 0.7468285745382309, | |
| "num_tokens": 721454429.0, | |
| "step": 29350 | |
| }, | |
| { | |
| "entropy": 1.5682587361335754, | |
| "epoch": 1.3928368391131325, | |
| "grad_norm": 1.2134160995483398, | |
| "learning_rate": 2.5547387982751186e-05, | |
| "loss": 1.1674, | |
| "mean_token_accuracy": 0.7542579096555709, | |
| "num_tokens": 722678187.0, | |
| "step": 29400 | |
| }, | |
| { | |
| "entropy": 1.5867646288871766, | |
| "epoch": 1.3952056092476786, | |
| "grad_norm": 1.316106915473938, | |
| "learning_rate": 2.536728819599108e-05, | |
| "loss": 1.1752, | |
| "mean_token_accuracy": 0.7528412294387817, | |
| "num_tokens": 723910197.0, | |
| "step": 29450 | |
| }, | |
| { | |
| "entropy": 1.5651627695560455, | |
| "epoch": 1.3975743793822248, | |
| "grad_norm": 1.184169054031372, | |
| "learning_rate": 2.5187609447260417e-05, | |
| "loss": 1.1439, | |
| "mean_token_accuracy": 0.7595011454820633, | |
| "num_tokens": 725117786.0, | |
| "step": 29500 | |
| }, | |
| { | |
| "entropy": 1.5885691118240357, | |
| "epoch": 1.399943149516771, | |
| "grad_norm": 1.026950716972351, | |
| "learning_rate": 2.5008354807743063e-05, | |
| "loss": 1.1624, | |
| "mean_token_accuracy": 0.7540817469358444, | |
| "num_tokens": 726361382.0, | |
| "step": 29550 | |
| }, | |
| { | |
| "entropy": 1.6078854203224182, | |
| "epoch": 1.402311919651317, | |
| "grad_norm": 1.105989694595337, | |
| "learning_rate": 2.482952734137369e-05, | |
| "loss": 1.1846, | |
| "mean_token_accuracy": 0.7512462210655212, | |
| "num_tokens": 727584011.0, | |
| "step": 29600 | |
| }, | |
| { | |
| "entropy": 1.5742497992515565, | |
| "epoch": 1.4046806897858632, | |
| "grad_norm": 1.0175246000289917, | |
| "learning_rate": 2.4651130104785464e-05, | |
| "loss": 1.1383, | |
| "mean_token_accuracy": 0.7599206572771072, | |
| "num_tokens": 728859452.0, | |
| "step": 29650 | |
| }, | |
| { | |
| "entropy": 1.5539786064624785, | |
| "epoch": 1.4070494599204093, | |
| "grad_norm": 1.19257652759552, | |
| "learning_rate": 2.447316614725779e-05, | |
| "loss": 1.1285, | |
| "mean_token_accuracy": 0.7605455183982849, | |
| "num_tokens": 730093871.0, | |
| "step": 29700 | |
| }, | |
| { | |
| "entropy": 1.575538364648819, | |
| "epoch": 1.4094182300549556, | |
| "grad_norm": 1.3367068767547607, | |
| "learning_rate": 2.429563851066423e-05, | |
| "loss": 1.1549, | |
| "mean_token_accuracy": 0.7566698521375657, | |
| "num_tokens": 731296865.0, | |
| "step": 29750 | |
| }, | |
| { | |
| "entropy": 1.5920424699783324, | |
| "epoch": 1.4117870001895017, | |
| "grad_norm": 1.1951195001602173, | |
| "learning_rate": 2.411855022942043e-05, | |
| "loss": 1.1863, | |
| "mean_token_accuracy": 0.7511163413524627, | |
| "num_tokens": 732528311.0, | |
| "step": 29800 | |
| }, | |
| { | |
| "entropy": 1.5808912098407746, | |
| "epoch": 1.4141557703240477, | |
| "grad_norm": 1.2582076787948608, | |
| "learning_rate": 2.394190433043228e-05, | |
| "loss": 1.1524, | |
| "mean_token_accuracy": 0.756331347823143, | |
| "num_tokens": 733754679.0, | |
| "step": 29850 | |
| }, | |
| { | |
| "entropy": 1.6012385189533234, | |
| "epoch": 1.4165245404585938, | |
| "grad_norm": 1.2719967365264893, | |
| "learning_rate": 2.376570383304423e-05, | |
| "loss": 1.1689, | |
| "mean_token_accuracy": 0.7530979549884796, | |
| "num_tokens": 734988780.0, | |
| "step": 29900 | |
| }, | |
| { | |
| "entropy": 1.5944563674926757, | |
| "epoch": 1.41889331059314, | |
| "grad_norm": 1.168672800064087, | |
| "learning_rate": 2.3589951748987615e-05, | |
| "loss": 1.1874, | |
| "mean_token_accuracy": 0.7496302407979966, | |
| "num_tokens": 736210496.0, | |
| "step": 29950 | |
| }, | |
| { | |
| "entropy": 1.5730518507957458, | |
| "epoch": 1.4212620807276861, | |
| "grad_norm": 1.2104912996292114, | |
| "learning_rate": 2.3414651082329214e-05, | |
| "loss": 1.1672, | |
| "mean_token_accuracy": 0.7543781703710556, | |
| "num_tokens": 737427744.0, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.4212620807276861, | |
| "eval_entropy": 1.0714102481581333, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7607390975986735, | |
| "eval_num_tokens": 737427744.0, | |
| "eval_runtime": 727.6986, | |
| "eval_samples_per_second": 34.101, | |
| "eval_steps_per_second": 4.263, | |
| "step": 30000 | |
| }, | |
| { | |
| "entropy": 1.5904272723197936, | |
| "epoch": 1.4236308508622324, | |
| "grad_norm": 1.3191770315170288, | |
| "learning_rate": 2.323980482941987e-05, | |
| "loss": 1.1451, | |
| "mean_token_accuracy": 0.7584353858232498, | |
| "num_tokens": 738655939.0, | |
| "step": 30050 | |
| }, | |
| { | |
| "entropy": 1.5946367990970611, | |
| "epoch": 1.4259996209967785, | |
| "grad_norm": 1.6591154336929321, | |
| "learning_rate": 2.3065415978843334e-05, | |
| "loss": 1.1805, | |
| "mean_token_accuracy": 0.7520586925745011, | |
| "num_tokens": 739862826.0, | |
| "step": 30100 | |
| }, | |
| { | |
| "entropy": 1.5869379675388335, | |
| "epoch": 1.4283683911313245, | |
| "grad_norm": 1.4352805614471436, | |
| "learning_rate": 2.2891487511365144e-05, | |
| "loss": 1.1486, | |
| "mean_token_accuracy": 0.7577965116500854, | |
| "num_tokens": 741119357.0, | |
| "step": 30150 | |
| }, | |
| { | |
| "entropy": 1.5365791404247284, | |
| "epoch": 1.4307371612658708, | |
| "grad_norm": 1.2801551818847656, | |
| "learning_rate": 2.2718022399881637e-05, | |
| "loss": 1.142, | |
| "mean_token_accuracy": 0.7584273481369018, | |
| "num_tokens": 742333607.0, | |
| "step": 30200 | |
| }, | |
| { | |
| "entropy": 1.5572332954406738, | |
| "epoch": 1.4331059314004169, | |
| "grad_norm": 1.212966799736023, | |
| "learning_rate": 2.2545023609369202e-05, | |
| "loss": 1.1619, | |
| "mean_token_accuracy": 0.7548242086172103, | |
| "num_tokens": 743565034.0, | |
| "step": 30250 | |
| }, | |
| { | |
| "entropy": 1.566081155538559, | |
| "epoch": 1.4354747015349631, | |
| "grad_norm": 1.0920426845550537, | |
| "learning_rate": 2.237249409683356e-05, | |
| "loss": 1.1783, | |
| "mean_token_accuracy": 0.7530256235599517, | |
| "num_tokens": 744804057.0, | |
| "step": 30300 | |
| }, | |
| { | |
| "entropy": 1.6045558285713195, | |
| "epoch": 1.4378434716695092, | |
| "grad_norm": 0.9273141026496887, | |
| "learning_rate": 2.220043681125924e-05, | |
| "loss": 1.1419, | |
| "mean_token_accuracy": 0.7590651035308837, | |
| "num_tokens": 746006768.0, | |
| "step": 30350 | |
| }, | |
| { | |
| "entropy": 1.5942323195934296, | |
| "epoch": 1.4402122418040553, | |
| "grad_norm": 1.1541792154312134, | |
| "learning_rate": 2.202885469355916e-05, | |
| "loss": 1.1921, | |
| "mean_token_accuracy": 0.7489143800735474, | |
| "num_tokens": 747223675.0, | |
| "step": 30400 | |
| }, | |
| { | |
| "entropy": 1.5869334352016449, | |
| "epoch": 1.4425810119386016, | |
| "grad_norm": 1.462320327758789, | |
| "learning_rate": 2.1857750676524357e-05, | |
| "loss": 1.1442, | |
| "mean_token_accuracy": 0.7573701620101929, | |
| "num_tokens": 748430497.0, | |
| "step": 30450 | |
| }, | |
| { | |
| "entropy": 1.586618103981018, | |
| "epoch": 1.4449497820731476, | |
| "grad_norm": 1.0793588161468506, | |
| "learning_rate": 2.168712768477392e-05, | |
| "loss": 1.1743, | |
| "mean_token_accuracy": 0.7522615754604339, | |
| "num_tokens": 749647006.0, | |
| "step": 30500 | |
| }, | |
| { | |
| "entropy": 1.6103046894073487, | |
| "epoch": 1.447318552207694, | |
| "grad_norm": 1.2154242992401123, | |
| "learning_rate": 2.1516988634704882e-05, | |
| "loss": 1.19, | |
| "mean_token_accuracy": 0.7501159131526947, | |
| "num_tokens": 750853602.0, | |
| "step": 30550 | |
| }, | |
| { | |
| "entropy": 1.5543176436424255, | |
| "epoch": 1.44968732234224, | |
| "grad_norm": 1.1655502319335938, | |
| "learning_rate": 2.1347336434442467e-05, | |
| "loss": 1.1284, | |
| "mean_token_accuracy": 0.7604024815559387, | |
| "num_tokens": 752063383.0, | |
| "step": 30600 | |
| }, | |
| { | |
| "entropy": 1.5402367627620697, | |
| "epoch": 1.452056092476786, | |
| "grad_norm": 0.9396981000900269, | |
| "learning_rate": 2.1178173983790333e-05, | |
| "loss": 1.1413, | |
| "mean_token_accuracy": 0.7587384188175201, | |
| "num_tokens": 753297932.0, | |
| "step": 30650 | |
| }, | |
| { | |
| "entropy": 1.565843381881714, | |
| "epoch": 1.454424862611332, | |
| "grad_norm": 1.2412699460983276, | |
| "learning_rate": 2.100950417418105e-05, | |
| "loss": 1.1336, | |
| "mean_token_accuracy": 0.76046923995018, | |
| "num_tokens": 754534333.0, | |
| "step": 30700 | |
| }, | |
| { | |
| "entropy": 1.5800429701805114, | |
| "epoch": 1.4567936327458784, | |
| "grad_norm": 1.3534191846847534, | |
| "learning_rate": 2.084132988862663e-05, | |
| "loss": 1.168, | |
| "mean_token_accuracy": 0.7545898991823197, | |
| "num_tokens": 755771112.0, | |
| "step": 30750 | |
| }, | |
| { | |
| "entropy": 1.5431535518169404, | |
| "epoch": 1.4591624028804244, | |
| "grad_norm": 1.1893748044967651, | |
| "learning_rate": 2.067365400166928e-05, | |
| "loss": 1.1317, | |
| "mean_token_accuracy": 0.7592762231826782, | |
| "num_tokens": 757016170.0, | |
| "step": 30800 | |
| }, | |
| { | |
| "entropy": 1.5234503149986267, | |
| "epoch": 1.4615311730149707, | |
| "grad_norm": 1.2661027908325195, | |
| "learning_rate": 2.0506479379332277e-05, | |
| "loss": 1.1197, | |
| "mean_token_accuracy": 0.7625928592681884, | |
| "num_tokens": 758267588.0, | |
| "step": 30850 | |
| }, | |
| { | |
| "entropy": 1.595642819404602, | |
| "epoch": 1.4638999431495168, | |
| "grad_norm": 1.3147796392440796, | |
| "learning_rate": 2.0339808879070942e-05, | |
| "loss": 1.1943, | |
| "mean_token_accuracy": 0.7485580265522003, | |
| "num_tokens": 759488024.0, | |
| "step": 30900 | |
| }, | |
| { | |
| "entropy": 1.581249178647995, | |
| "epoch": 1.4662687132840628, | |
| "grad_norm": 1.1915379762649536, | |
| "learning_rate": 2.0173645349723823e-05, | |
| "loss": 1.1843, | |
| "mean_token_accuracy": 0.751889705657959, | |
| "num_tokens": 760705547.0, | |
| "step": 30950 | |
| }, | |
| { | |
| "entropy": 1.547069821357727, | |
| "epoch": 1.4686374834186091, | |
| "grad_norm": 1.3527320623397827, | |
| "learning_rate": 2.0007991631463985e-05, | |
| "loss": 1.124, | |
| "mean_token_accuracy": 0.7617496418952941, | |
| "num_tokens": 761946385.0, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.4686374834186091, | |
| "eval_entropy": 1.0745692070571167, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7606811193421454, | |
| "eval_num_tokens": 761946385.0, | |
| "eval_runtime": 750.8644, | |
| "eval_samples_per_second": 33.049, | |
| "eval_steps_per_second": 4.131, | |
| "step": 31000 | |
| }, | |
| { | |
| "entropy": 1.5860296082496643, | |
| "epoch": 1.4710062535531552, | |
| "grad_norm": 1.1028927564620972, | |
| "learning_rate": 1.984285055575052e-05, | |
| "loss": 1.1477, | |
| "mean_token_accuracy": 0.757946463227272, | |
| "num_tokens": 763145229.0, | |
| "step": 31050 | |
| }, | |
| { | |
| "entropy": 1.5409555327892304, | |
| "epoch": 1.4733750236877015, | |
| "grad_norm": 1.3105554580688477, | |
| "learning_rate": 1.967822494528007e-05, | |
| "loss": 1.1388, | |
| "mean_token_accuracy": 0.7595143103599549, | |
| "num_tokens": 764368780.0, | |
| "step": 31100 | |
| }, | |
| { | |
| "entropy": 1.5342436349391937, | |
| "epoch": 1.4757437938222475, | |
| "grad_norm": 1.175134539604187, | |
| "learning_rate": 1.9514117613938625e-05, | |
| "loss": 1.1376, | |
| "mean_token_accuracy": 0.7598193883895874, | |
| "num_tokens": 765622021.0, | |
| "step": 31150 | |
| }, | |
| { | |
| "entropy": 1.568356648683548, | |
| "epoch": 1.4781125639567936, | |
| "grad_norm": 1.1193444728851318, | |
| "learning_rate": 1.935053136675339e-05, | |
| "loss": 1.1488, | |
| "mean_token_accuracy": 0.7594327408075333, | |
| "num_tokens": 766856360.0, | |
| "step": 31200 | |
| }, | |
| { | |
| "entropy": 1.5699161183834076, | |
| "epoch": 1.4804813340913396, | |
| "grad_norm": 1.0560715198516846, | |
| "learning_rate": 1.9187468999844936e-05, | |
| "loss": 1.1459, | |
| "mean_token_accuracy": 0.7583613079786301, | |
| "num_tokens": 768058852.0, | |
| "step": 31250 | |
| }, | |
| { | |
| "entropy": 1.588351196050644, | |
| "epoch": 1.482850104225886, | |
| "grad_norm": 1.2862846851348877, | |
| "learning_rate": 1.9024933300379277e-05, | |
| "loss": 1.1692, | |
| "mean_token_accuracy": 0.7539873999357224, | |
| "num_tokens": 769278437.0, | |
| "step": 31300 | |
| }, | |
| { | |
| "entropy": 1.5751077544689178, | |
| "epoch": 1.485218874360432, | |
| "grad_norm": 1.1171611547470093, | |
| "learning_rate": 1.8862927046520312e-05, | |
| "loss": 1.1468, | |
| "mean_token_accuracy": 0.7574340969324111, | |
| "num_tokens": 770491005.0, | |
| "step": 31350 | |
| }, | |
| { | |
| "entropy": 1.5523167753219604, | |
| "epoch": 1.4875876444949783, | |
| "grad_norm": 1.1966944932937622, | |
| "learning_rate": 1.8701453007382314e-05, | |
| "loss": 1.1322, | |
| "mean_token_accuracy": 0.7628469413518906, | |
| "num_tokens": 771700031.0, | |
| "step": 31400 | |
| }, | |
| { | |
| "entropy": 1.5793978321552276, | |
| "epoch": 1.4899564146295243, | |
| "grad_norm": 1.404768466949463, | |
| "learning_rate": 1.8540513942982602e-05, | |
| "loss": 1.1309, | |
| "mean_token_accuracy": 0.7610709732770919, | |
| "num_tokens": 772941795.0, | |
| "step": 31450 | |
| }, | |
| { | |
| "entropy": 1.5804509365558623, | |
| "epoch": 1.4923251847640704, | |
| "grad_norm": 1.3773914575576782, | |
| "learning_rate": 1.838011260419435e-05, | |
| "loss": 1.1556, | |
| "mean_token_accuracy": 0.7568354111909866, | |
| "num_tokens": 774162687.0, | |
| "step": 31500 | |
| }, | |
| { | |
| "entropy": 1.5457658851146698, | |
| "epoch": 1.4946939548986167, | |
| "grad_norm": 0.9370711445808411, | |
| "learning_rate": 1.822025173269964e-05, | |
| "loss": 1.1291, | |
| "mean_token_accuracy": 0.7615066528320312, | |
| "num_tokens": 775426714.0, | |
| "step": 31550 | |
| }, | |
| { | |
| "entropy": 1.550627862215042, | |
| "epoch": 1.4970627250331627, | |
| "grad_norm": 1.1992812156677246, | |
| "learning_rate": 1.8060934060942487e-05, | |
| "loss": 1.1443, | |
| "mean_token_accuracy": 0.7579187524318695, | |
| "num_tokens": 776645207.0, | |
| "step": 31600 | |
| }, | |
| { | |
| "entropy": 1.5496788358688354, | |
| "epoch": 1.499431495167709, | |
| "grad_norm": 1.290854811668396, | |
| "learning_rate": 1.7902162312082194e-05, | |
| "loss": 1.1542, | |
| "mean_token_accuracy": 0.7575876170396805, | |
| "num_tokens": 777890539.0, | |
| "step": 31650 | |
| }, | |
| { | |
| "entropy": 1.583651841878891, | |
| "epoch": 1.501800265302255, | |
| "grad_norm": 1.4201711416244507, | |
| "learning_rate": 1.7743939199946818e-05, | |
| "loss": 1.1669, | |
| "mean_token_accuracy": 0.7559886735677719, | |
| "num_tokens": 779106659.0, | |
| "step": 31700 | |
| }, | |
| { | |
| "entropy": 1.5621719944477082, | |
| "epoch": 1.5041690354368011, | |
| "grad_norm": 1.0013508796691895, | |
| "learning_rate": 1.7586267428986763e-05, | |
| "loss": 1.1622, | |
| "mean_token_accuracy": 0.7543949365615845, | |
| "num_tokens": 780313881.0, | |
| "step": 31750 | |
| }, | |
| { | |
| "entropy": 1.5784629476070404, | |
| "epoch": 1.5065378055713472, | |
| "grad_norm": 1.293186068534851, | |
| "learning_rate": 1.742914969422856e-05, | |
| "loss": 1.1484, | |
| "mean_token_accuracy": 0.7578674453496933, | |
| "num_tokens": 781544604.0, | |
| "step": 31800 | |
| }, | |
| { | |
| "entropy": 1.5572881984710694, | |
| "epoch": 1.5089065757058935, | |
| "grad_norm": 1.1909185647964478, | |
| "learning_rate": 1.7272588681228767e-05, | |
| "loss": 1.1025, | |
| "mean_token_accuracy": 0.7669892936944962, | |
| "num_tokens": 782765240.0, | |
| "step": 31850 | |
| }, | |
| { | |
| "entropy": 1.5987060451507569, | |
| "epoch": 1.5112753458404398, | |
| "grad_norm": 1.3331712484359741, | |
| "learning_rate": 1.7116587066028172e-05, | |
| "loss": 1.1787, | |
| "mean_token_accuracy": 0.7533298796415329, | |
| "num_tokens": 783994667.0, | |
| "step": 31900 | |
| }, | |
| { | |
| "entropy": 1.5784035372734069, | |
| "epoch": 1.5136441159749858, | |
| "grad_norm": 1.3433549404144287, | |
| "learning_rate": 1.6961147515105897e-05, | |
| "loss": 1.1539, | |
| "mean_token_accuracy": 0.7583291745185852, | |
| "num_tokens": 785241722.0, | |
| "step": 31950 | |
| }, | |
| { | |
| "entropy": 1.5924919998645783, | |
| "epoch": 1.5160128861095319, | |
| "grad_norm": 0.9708880186080933, | |
| "learning_rate": 1.6806272685333967e-05, | |
| "loss": 1.168, | |
| "mean_token_accuracy": 0.7547562402486802, | |
| "num_tokens": 786450293.0, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.5160128861095319, | |
| "eval_entropy": 1.0883530624676796, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.760518561690796, | |
| "eval_num_tokens": 786450293.0, | |
| "eval_runtime": 728.4178, | |
| "eval_samples_per_second": 34.067, | |
| "eval_steps_per_second": 4.259, | |
| "step": 32000 | |
| }, | |
| { | |
| "entropy": 1.5847830092906952, | |
| "epoch": 1.518381656244078, | |
| "grad_norm": 1.0674691200256348, | |
| "learning_rate": 1.6651965223931798e-05, | |
| "loss": 1.122, | |
| "mean_token_accuracy": 0.7640283882617951, | |
| "num_tokens": 787651249.0, | |
| "step": 32050 | |
| }, | |
| { | |
| "entropy": 1.6060099351406096, | |
| "epoch": 1.5207504263786242, | |
| "grad_norm": 1.3451073169708252, | |
| "learning_rate": 1.6498227768420986e-05, | |
| "loss": 1.1986, | |
| "mean_token_accuracy": 0.7503712397813797, | |
| "num_tokens": 788894856.0, | |
| "step": 32100 | |
| }, | |
| { | |
| "entropy": 1.5495011293888092, | |
| "epoch": 1.5231191965131705, | |
| "grad_norm": 1.184458613395691, | |
| "learning_rate": 1.634506294658023e-05, | |
| "loss": 1.1241, | |
| "mean_token_accuracy": 0.7635609942674637, | |
| "num_tokens": 790124279.0, | |
| "step": 32150 | |
| }, | |
| { | |
| "entropy": 1.5789199233055116, | |
| "epoch": 1.5254879666477166, | |
| "grad_norm": 1.4204998016357422, | |
| "learning_rate": 1.619247337640041e-05, | |
| "loss": 1.1481, | |
| "mean_token_accuracy": 0.7602039396762847, | |
| "num_tokens": 791346787.0, | |
| "step": 32200 | |
| }, | |
| { | |
| "entropy": 1.5563798201084138, | |
| "epoch": 1.5278567367822626, | |
| "grad_norm": 1.1505266427993774, | |
| "learning_rate": 1.6040461666039808e-05, | |
| "loss": 1.1499, | |
| "mean_token_accuracy": 0.7575086969137191, | |
| "num_tokens": 792593563.0, | |
| "step": 32250 | |
| }, | |
| { | |
| "entropy": 1.6074644064903258, | |
| "epoch": 1.5302255069168087, | |
| "grad_norm": 1.185583472251892, | |
| "learning_rate": 1.5889030413779622e-05, | |
| "loss": 1.156, | |
| "mean_token_accuracy": 0.7562423485517502, | |
| "num_tokens": 793829790.0, | |
| "step": 32300 | |
| }, | |
| { | |
| "entropy": 1.5752575540542602, | |
| "epoch": 1.532594277051355, | |
| "grad_norm": 1.4243769645690918, | |
| "learning_rate": 1.5738182207979435e-05, | |
| "loss": 1.1459, | |
| "mean_token_accuracy": 0.7583240360021591, | |
| "num_tokens": 795055789.0, | |
| "step": 32350 | |
| }, | |
| { | |
| "entropy": 1.5923644971847535, | |
| "epoch": 1.534963047185901, | |
| "grad_norm": 1.6619261503219604, | |
| "learning_rate": 1.558791962703304e-05, | |
| "loss": 1.154, | |
| "mean_token_accuracy": 0.7567561262845993, | |
| "num_tokens": 796275636.0, | |
| "step": 32400 | |
| }, | |
| { | |
| "entropy": 1.583539651632309, | |
| "epoch": 1.5373318173204473, | |
| "grad_norm": 1.5260084867477417, | |
| "learning_rate": 1.5438245239324372e-05, | |
| "loss": 1.1293, | |
| "mean_token_accuracy": 0.7631356823444366, | |
| "num_tokens": 797503738.0, | |
| "step": 32450 | |
| }, | |
| { | |
| "entropy": 1.5718154168128968, | |
| "epoch": 1.5397005874549934, | |
| "grad_norm": 1.1916577816009521, | |
| "learning_rate": 1.5289161603183565e-05, | |
| "loss": 1.1556, | |
| "mean_token_accuracy": 0.756939308643341, | |
| "num_tokens": 798743606.0, | |
| "step": 32500 | |
| }, | |
| { | |
| "entropy": 1.5819120156764983, | |
| "epoch": 1.5420693575895394, | |
| "grad_norm": 1.1773018836975098, | |
| "learning_rate": 1.5140671266843276e-05, | |
| "loss": 1.1722, | |
| "mean_token_accuracy": 0.7551066309213639, | |
| "num_tokens": 799964473.0, | |
| "step": 32550 | |
| }, | |
| { | |
| "entropy": 1.575450291633606, | |
| "epoch": 1.5444381277240855, | |
| "grad_norm": 1.0022114515304565, | |
| "learning_rate": 1.4992776768395073e-05, | |
| "loss": 1.1449, | |
| "mean_token_accuracy": 0.7597598391771316, | |
| "num_tokens": 801188088.0, | |
| "step": 32600 | |
| }, | |
| { | |
| "entropy": 1.5459000968933105, | |
| "epoch": 1.5468068978586318, | |
| "grad_norm": 1.2957897186279297, | |
| "learning_rate": 1.4845480635746129e-05, | |
| "loss": 1.1227, | |
| "mean_token_accuracy": 0.7632001984119415, | |
| "num_tokens": 802438523.0, | |
| "step": 32650 | |
| }, | |
| { | |
| "entropy": 1.5870135259628295, | |
| "epoch": 1.549175667993178, | |
| "grad_norm": 1.3867087364196777, | |
| "learning_rate": 1.469878538657593e-05, | |
| "loss": 1.1395, | |
| "mean_token_accuracy": 0.7591842120885849, | |
| "num_tokens": 803649365.0, | |
| "step": 32700 | |
| }, | |
| { | |
| "entropy": 1.5655232286453247, | |
| "epoch": 1.5515444381277241, | |
| "grad_norm": 0.9858147501945496, | |
| "learning_rate": 1.4552693528293287e-05, | |
| "loss": 1.1343, | |
| "mean_token_accuracy": 0.7619771939516068, | |
| "num_tokens": 804874548.0, | |
| "step": 32750 | |
| }, | |
| { | |
| "entropy": 1.5606813442707062, | |
| "epoch": 1.5539132082622702, | |
| "grad_norm": 1.1506080627441406, | |
| "learning_rate": 1.4407207557993468e-05, | |
| "loss": 1.1358, | |
| "mean_token_accuracy": 0.7607605350017548, | |
| "num_tokens": 806110451.0, | |
| "step": 32800 | |
| }, | |
| { | |
| "entropy": 1.5905800759792328, | |
| "epoch": 1.5562819783968163, | |
| "grad_norm": 1.2425259351730347, | |
| "learning_rate": 1.4262329962415521e-05, | |
| "loss": 1.1416, | |
| "mean_token_accuracy": 0.7600742274522782, | |
| "num_tokens": 807342732.0, | |
| "step": 32850 | |
| }, | |
| { | |
| "entropy": 1.5680935847759248, | |
| "epoch": 1.5586507485313625, | |
| "grad_norm": 1.153823733329773, | |
| "learning_rate": 1.4118063217899746e-05, | |
| "loss": 1.1335, | |
| "mean_token_accuracy": 0.7605480921268463, | |
| "num_tokens": 808586619.0, | |
| "step": 32900 | |
| }, | |
| { | |
| "entropy": 1.5620489943027496, | |
| "epoch": 1.5610195186659088, | |
| "grad_norm": 1.050882339477539, | |
| "learning_rate": 1.397440979034544e-05, | |
| "loss": 1.1522, | |
| "mean_token_accuracy": 0.756206591129303, | |
| "num_tokens": 809832674.0, | |
| "step": 32950 | |
| }, | |
| { | |
| "entropy": 1.605532693862915, | |
| "epoch": 1.5633882888004549, | |
| "grad_norm": 1.3032130002975464, | |
| "learning_rate": 1.383137213516862e-05, | |
| "loss": 1.1758, | |
| "mean_token_accuracy": 0.7512508201599121, | |
| "num_tokens": 811045391.0, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.5633882888004549, | |
| "eval_entropy": 1.0869283330279116, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7607752118838518, | |
| "eval_num_tokens": 811045391.0, | |
| "eval_runtime": 725.7757, | |
| "eval_samples_per_second": 34.191, | |
| "eval_steps_per_second": 4.274, | |
| "step": 33000 | |
| }, | |
| { | |
| "entropy": 1.592424702644348, | |
| "epoch": 1.565757058935001, | |
| "grad_norm": 1.4170438051223755, | |
| "learning_rate": 1.36889526972602e-05, | |
| "loss": 1.1393, | |
| "mean_token_accuracy": 0.7614479339122773, | |
| "num_tokens": 812284992.0, | |
| "step": 33050 | |
| }, | |
| { | |
| "entropy": 1.5892753875255585, | |
| "epoch": 1.568125829069547, | |
| "grad_norm": 1.1543865203857422, | |
| "learning_rate": 1.3547153910944083e-05, | |
| "loss": 1.1511, | |
| "mean_token_accuracy": 0.7588121032714844, | |
| "num_tokens": 813547664.0, | |
| "step": 33100 | |
| }, | |
| { | |
| "entropy": 1.584560890197754, | |
| "epoch": 1.5704945992040933, | |
| "grad_norm": 0.9274981021881104, | |
| "learning_rate": 1.3405978199935615e-05, | |
| "loss": 1.1374, | |
| "mean_token_accuracy": 0.7598586791753769, | |
| "num_tokens": 814802491.0, | |
| "step": 33150 | |
| }, | |
| { | |
| "entropy": 1.6207016718387603, | |
| "epoch": 1.5728633693386393, | |
| "grad_norm": 1.2314906120300293, | |
| "learning_rate": 1.3265427977300137e-05, | |
| "loss": 1.1615, | |
| "mean_token_accuracy": 0.7563327193260193, | |
| "num_tokens": 816017932.0, | |
| "step": 33200 | |
| }, | |
| { | |
| "entropy": 1.586843957901001, | |
| "epoch": 1.5752321394731856, | |
| "grad_norm": 1.1286264657974243, | |
| "learning_rate": 1.3125505645411745e-05, | |
| "loss": 1.1426, | |
| "mean_token_accuracy": 0.7603234398365021, | |
| "num_tokens": 817264849.0, | |
| "step": 33250 | |
| }, | |
| { | |
| "entropy": 1.591685062646866, | |
| "epoch": 1.5776009096077317, | |
| "grad_norm": 1.0517480373382568, | |
| "learning_rate": 1.2986213595912234e-05, | |
| "loss": 1.1207, | |
| "mean_token_accuracy": 0.7635256379842759, | |
| "num_tokens": 818484337.0, | |
| "step": 33300 | |
| }, | |
| { | |
| "entropy": 1.6172384572029115, | |
| "epoch": 1.5799696797422778, | |
| "grad_norm": 1.360130786895752, | |
| "learning_rate": 1.2847554209670182e-05, | |
| "loss": 1.161, | |
| "mean_token_accuracy": 0.7545110338926315, | |
| "num_tokens": 819712425.0, | |
| "step": 33350 | |
| }, | |
| { | |
| "entropy": 1.5726615214347839, | |
| "epoch": 1.5823384498768238, | |
| "grad_norm": 0.9944781064987183, | |
| "learning_rate": 1.2709529856740331e-05, | |
| "loss": 1.1207, | |
| "mean_token_accuracy": 0.7627239066362381, | |
| "num_tokens": 820956336.0, | |
| "step": 33400 | |
| }, | |
| { | |
| "entropy": 1.58195317029953, | |
| "epoch": 1.58470722001137, | |
| "grad_norm": 1.3657851219177246, | |
| "learning_rate": 1.2572142896322991e-05, | |
| "loss": 1.1318, | |
| "mean_token_accuracy": 0.7620331639051438, | |
| "num_tokens": 822181235.0, | |
| "step": 33450 | |
| }, | |
| { | |
| "entropy": 1.5519816017150878, | |
| "epoch": 1.5870759901459164, | |
| "grad_norm": 1.1192278861999512, | |
| "learning_rate": 1.2435395676723765e-05, | |
| "loss": 1.1255, | |
| "mean_token_accuracy": 0.763149077296257, | |
| "num_tokens": 823394039.0, | |
| "step": 33500 | |
| }, | |
| { | |
| "entropy": 1.5487982165813445, | |
| "epoch": 1.5894447602804624, | |
| "grad_norm": 0.9175589084625244, | |
| "learning_rate": 1.229929053531339e-05, | |
| "loss": 1.1266, | |
| "mean_token_accuracy": 0.762880043387413, | |
| "num_tokens": 824629826.0, | |
| "step": 33550 | |
| }, | |
| { | |
| "entropy": 1.5455662417411804, | |
| "epoch": 1.5918135304150085, | |
| "grad_norm": 1.543500542640686, | |
| "learning_rate": 1.2163829798487796e-05, | |
| "loss": 1.1179, | |
| "mean_token_accuracy": 0.7645809006690979, | |
| "num_tokens": 825885699.0, | |
| "step": 33600 | |
| }, | |
| { | |
| "entropy": 1.545372655391693, | |
| "epoch": 1.5941823005495546, | |
| "grad_norm": 1.042571783065796, | |
| "learning_rate": 1.2029015781628333e-05, | |
| "loss": 1.1253, | |
| "mean_token_accuracy": 0.7624981206655502, | |
| "num_tokens": 827144142.0, | |
| "step": 33650 | |
| }, | |
| { | |
| "entropy": 1.5542615973949432, | |
| "epoch": 1.5965510706841008, | |
| "grad_norm": 1.2017757892608643, | |
| "learning_rate": 1.1894850789062234e-05, | |
| "loss": 1.1095, | |
| "mean_token_accuracy": 0.7662106871604919, | |
| "num_tokens": 828358780.0, | |
| "step": 33700 | |
| }, | |
| { | |
| "entropy": 1.5387887310981752, | |
| "epoch": 1.598919840818647, | |
| "grad_norm": 1.2897499799728394, | |
| "learning_rate": 1.1761337114023157e-05, | |
| "loss": 1.1393, | |
| "mean_token_accuracy": 0.7597699278593063, | |
| "num_tokens": 829617688.0, | |
| "step": 33750 | |
| }, | |
| { | |
| "entropy": 1.57563338637352, | |
| "epoch": 1.6012886109531932, | |
| "grad_norm": 1.7554948329925537, | |
| "learning_rate": 1.1628477038612035e-05, | |
| "loss": 1.1186, | |
| "mean_token_accuracy": 0.7649687886238098, | |
| "num_tokens": 830817095.0, | |
| "step": 33800 | |
| }, | |
| { | |
| "entropy": 1.5856982839107514, | |
| "epoch": 1.6036573810877393, | |
| "grad_norm": 0.9697763919830322, | |
| "learning_rate": 1.1496272833758042e-05, | |
| "loss": 1.1803, | |
| "mean_token_accuracy": 0.7541155385971069, | |
| "num_tokens": 832068396.0, | |
| "step": 33850 | |
| }, | |
| { | |
| "entropy": 1.582381078004837, | |
| "epoch": 1.6060261512222853, | |
| "grad_norm": 1.2476204633712769, | |
| "learning_rate": 1.1364726759179856e-05, | |
| "loss": 1.1366, | |
| "mean_token_accuracy": 0.7601368808746338, | |
| "num_tokens": 833258832.0, | |
| "step": 33900 | |
| }, | |
| { | |
| "entropy": 1.5806366765499116, | |
| "epoch": 1.6083949213568314, | |
| "grad_norm": 1.464986801147461, | |
| "learning_rate": 1.12338410633469e-05, | |
| "loss": 1.1401, | |
| "mean_token_accuracy": 0.7602562707662582, | |
| "num_tokens": 834452824.0, | |
| "step": 33950 | |
| }, | |
| { | |
| "entropy": 1.5599962186813354, | |
| "epoch": 1.6107636914913777, | |
| "grad_norm": 1.1796619892120361, | |
| "learning_rate": 1.1103617983441017e-05, | |
| "loss": 1.1369, | |
| "mean_token_accuracy": 0.7609011316299439, | |
| "num_tokens": 835712240.0, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.6107636914913777, | |
| "eval_entropy": 1.0767018733141425, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.760854017967028, | |
| "eval_num_tokens": 835712240.0, | |
| "eval_runtime": 727.3112, | |
| "eval_samples_per_second": 34.119, | |
| "eval_steps_per_second": 4.265, | |
| "step": 34000 | |
| }, | |
| { | |
| "entropy": 1.579708174467087, | |
| "epoch": 1.613132461625924, | |
| "grad_norm": 1.2132450342178345, | |
| "learning_rate": 1.0974059745318177e-05, | |
| "loss": 1.1412, | |
| "mean_token_accuracy": 0.7581533867120743, | |
| "num_tokens": 836951903.0, | |
| "step": 34050 | |
| }, | |
| { | |
| "entropy": 1.5457484829425812, | |
| "epoch": 1.61550123176047, | |
| "grad_norm": 0.9479733109474182, | |
| "learning_rate": 1.0845168563470492e-05, | |
| "loss": 1.1319, | |
| "mean_token_accuracy": 0.7595140463113785, | |
| "num_tokens": 838214157.0, | |
| "step": 34100 | |
| }, | |
| { | |
| "entropy": 1.5466408836841583, | |
| "epoch": 1.617870001895016, | |
| "grad_norm": 1.7005789279937744, | |
| "learning_rate": 1.071694664098828e-05, | |
| "loss": 1.1175, | |
| "mean_token_accuracy": 0.7649167954921723, | |
| "num_tokens": 839471563.0, | |
| "step": 34150 | |
| }, | |
| { | |
| "entropy": 1.5700657200813293, | |
| "epoch": 1.6202387720295621, | |
| "grad_norm": 1.059528112411499, | |
| "learning_rate": 1.0589396169522465e-05, | |
| "loss": 1.1615, | |
| "mean_token_accuracy": 0.7565118598937989, | |
| "num_tokens": 840715891.0, | |
| "step": 34200 | |
| }, | |
| { | |
| "entropy": 1.5375142538547515, | |
| "epoch": 1.6226075421641084, | |
| "grad_norm": 1.559818148612976, | |
| "learning_rate": 1.0462519329247094e-05, | |
| "loss": 1.1356, | |
| "mean_token_accuracy": 0.7605293154716491, | |
| "num_tokens": 841942053.0, | |
| "step": 34250 | |
| }, | |
| { | |
| "entropy": 1.587239592075348, | |
| "epoch": 1.6249763122986547, | |
| "grad_norm": 1.1134872436523438, | |
| "learning_rate": 1.03363182888221e-05, | |
| "loss": 1.1564, | |
| "mean_token_accuracy": 0.7570129364728928, | |
| "num_tokens": 843161472.0, | |
| "step": 34300 | |
| }, | |
| { | |
| "entropy": 1.5743994867801667, | |
| "epoch": 1.6273450824332008, | |
| "grad_norm": 1.3683475255966187, | |
| "learning_rate": 1.021079520535619e-05, | |
| "loss": 1.159, | |
| "mean_token_accuracy": 0.7565000504255295, | |
| "num_tokens": 844415080.0, | |
| "step": 34350 | |
| }, | |
| { | |
| "entropy": 1.5728708267211915, | |
| "epoch": 1.6297138525677468, | |
| "grad_norm": 1.1361534595489502, | |
| "learning_rate": 1.0085952224369998e-05, | |
| "loss": 1.1464, | |
| "mean_token_accuracy": 0.7604904717206955, | |
| "num_tokens": 845652767.0, | |
| "step": 34400 | |
| }, | |
| { | |
| "entropy": 1.555993628501892, | |
| "epoch": 1.6320826227022929, | |
| "grad_norm": 1.2197624444961548, | |
| "learning_rate": 9.961791479759453e-06, | |
| "loss": 1.1094, | |
| "mean_token_accuracy": 0.7654684072732926, | |
| "num_tokens": 846861078.0, | |
| "step": 34450 | |
| }, | |
| { | |
| "entropy": 1.5712345719337464, | |
| "epoch": 1.6344513928368392, | |
| "grad_norm": 1.2012556791305542, | |
| "learning_rate": 9.83831509375922e-06, | |
| "loss": 1.1318, | |
| "mean_token_accuracy": 0.7618235784769058, | |
| "num_tokens": 848079801.0, | |
| "step": 34500 | |
| }, | |
| { | |
| "entropy": 1.5478745126724243, | |
| "epoch": 1.6368201629713852, | |
| "grad_norm": 1.1320964097976685, | |
| "learning_rate": 9.715525176906482e-06, | |
| "loss": 1.1156, | |
| "mean_token_accuracy": 0.763830555677414, | |
| "num_tokens": 849324814.0, | |
| "step": 34550 | |
| }, | |
| { | |
| "entropy": 1.5780341172218322, | |
| "epoch": 1.6391889331059315, | |
| "grad_norm": 1.025578498840332, | |
| "learning_rate": 9.59342382800486e-06, | |
| "loss": 1.1426, | |
| "mean_token_accuracy": 0.7610353720188141, | |
| "num_tokens": 850545817.0, | |
| "step": 34600 | |
| }, | |
| { | |
| "entropy": 1.5829120945930482, | |
| "epoch": 1.6415577032404776, | |
| "grad_norm": 1.0543193817138672, | |
| "learning_rate": 9.472013134088525e-06, | |
| "loss": 1.1659, | |
| "mean_token_accuracy": 0.7564774835109711, | |
| "num_tokens": 851771892.0, | |
| "step": 34650 | |
| }, | |
| { | |
| "entropy": 1.5125657570362092, | |
| "epoch": 1.6439264733750236, | |
| "grad_norm": 1.047337532043457, | |
| "learning_rate": 9.351295170386536e-06, | |
| "loss": 1.1436, | |
| "mean_token_accuracy": 0.7618407100439072, | |
| "num_tokens": 853004916.0, | |
| "step": 34700 | |
| }, | |
| { | |
| "entropy": 1.5835665547847748, | |
| "epoch": 1.6462952435095697, | |
| "grad_norm": 1.4141535758972168, | |
| "learning_rate": 9.231272000287355e-06, | |
| "loss": 1.1394, | |
| "mean_token_accuracy": 0.7607215863466262, | |
| "num_tokens": 854213875.0, | |
| "step": 34750 | |
| }, | |
| { | |
| "entropy": 1.5290465533733368, | |
| "epoch": 1.648664013644116, | |
| "grad_norm": 1.578470230102539, | |
| "learning_rate": 9.111945675303619e-06, | |
| "loss": 1.0863, | |
| "mean_token_accuracy": 0.7710424029827118, | |
| "num_tokens": 855445223.0, | |
| "step": 34800 | |
| }, | |
| { | |
| "entropy": 1.549327657222748, | |
| "epoch": 1.6510327837786622, | |
| "grad_norm": 1.0670899152755737, | |
| "learning_rate": 8.993318235037001e-06, | |
| "loss": 1.1251, | |
| "mean_token_accuracy": 0.7622793889045716, | |
| "num_tokens": 856681494.0, | |
| "step": 34850 | |
| }, | |
| { | |
| "entropy": 1.5306969308853149, | |
| "epoch": 1.6534015539132083, | |
| "grad_norm": 1.3352553844451904, | |
| "learning_rate": 8.875391707143432e-06, | |
| "loss": 1.1102, | |
| "mean_token_accuracy": 0.7646553814411163, | |
| "num_tokens": 857925423.0, | |
| "step": 34900 | |
| }, | |
| { | |
| "entropy": 1.534108463525772, | |
| "epoch": 1.6557703240477544, | |
| "grad_norm": 1.3138459920883179, | |
| "learning_rate": 8.75816810729837e-06, | |
| "loss": 1.1059, | |
| "mean_token_accuracy": 0.7676987838745117, | |
| "num_tokens": 859172535.0, | |
| "step": 34950 | |
| }, | |
| { | |
| "entropy": 1.561165556907654, | |
| "epoch": 1.6581390941823004, | |
| "grad_norm": 1.1682779788970947, | |
| "learning_rate": 8.641649439162396e-06, | |
| "loss": 1.1193, | |
| "mean_token_accuracy": 0.7643628352880478, | |
| "num_tokens": 860388305.0, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.6581390941823004, | |
| "eval_entropy": 1.0728673784675942, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.761188728099558, | |
| "eval_num_tokens": 860388305.0, | |
| "eval_runtime": 727.1748, | |
| "eval_samples_per_second": 34.125, | |
| "eval_steps_per_second": 4.266, | |
| "step": 35000 | |
| }, | |
| { | |
| "entropy": 1.5597219800949096, | |
| "epoch": 1.6605078643168467, | |
| "grad_norm": 1.1003355979919434, | |
| "learning_rate": 8.525837694346932e-06, | |
| "loss": 1.1456, | |
| "mean_token_accuracy": 0.7606293076276779, | |
| "num_tokens": 861636981.0, | |
| "step": 35050 | |
| }, | |
| { | |
| "entropy": 1.5564636278152466, | |
| "epoch": 1.662876634451393, | |
| "grad_norm": 1.22114098072052, | |
| "learning_rate": 8.410734852380231e-06, | |
| "loss": 1.1478, | |
| "mean_token_accuracy": 0.7590148377418519, | |
| "num_tokens": 862893390.0, | |
| "step": 35100 | |
| }, | |
| { | |
| "entropy": 1.5701312077045442, | |
| "epoch": 1.665245404585939, | |
| "grad_norm": 1.4223356246948242, | |
| "learning_rate": 8.296342880673513e-06, | |
| "loss": 1.1266, | |
| "mean_token_accuracy": 0.763382934331894, | |
| "num_tokens": 864117153.0, | |
| "step": 35150 | |
| }, | |
| { | |
| "entropy": 1.587481471300125, | |
| "epoch": 1.6676141747204851, | |
| "grad_norm": 1.5688437223434448, | |
| "learning_rate": 8.182663734487372e-06, | |
| "loss": 1.1656, | |
| "mean_token_accuracy": 0.7555125683546067, | |
| "num_tokens": 865348622.0, | |
| "step": 35200 | |
| }, | |
| { | |
| "entropy": 1.5680071783065797, | |
| "epoch": 1.6699829448550312, | |
| "grad_norm": 1.2558252811431885, | |
| "learning_rate": 8.069699356898309e-06, | |
| "loss": 1.151, | |
| "mean_token_accuracy": 0.7581069612503052, | |
| "num_tokens": 866584445.0, | |
| "step": 35250 | |
| }, | |
| { | |
| "entropy": 1.5600192046165466, | |
| "epoch": 1.6723517149895775, | |
| "grad_norm": 1.3348325490951538, | |
| "learning_rate": 7.95745167876556e-06, | |
| "loss": 1.1564, | |
| "mean_token_accuracy": 0.7571524727344513, | |
| "num_tokens": 867822403.0, | |
| "step": 35300 | |
| }, | |
| { | |
| "entropy": 1.562008023262024, | |
| "epoch": 1.6747204851241235, | |
| "grad_norm": 1.3110319375991821, | |
| "learning_rate": 7.84592261869806e-06, | |
| "loss": 1.1462, | |
| "mean_token_accuracy": 0.7590863239765168, | |
| "num_tokens": 869072939.0, | |
| "step": 35350 | |
| }, | |
| { | |
| "entropy": 1.5816670620441438, | |
| "epoch": 1.6770892552586698, | |
| "grad_norm": 1.2804386615753174, | |
| "learning_rate": 7.735114083021683e-06, | |
| "loss": 1.1353, | |
| "mean_token_accuracy": 0.7603730088472367, | |
| "num_tokens": 870288358.0, | |
| "step": 35400 | |
| }, | |
| { | |
| "entropy": 1.546754379272461, | |
| "epoch": 1.6794580253932159, | |
| "grad_norm": 1.1443445682525635, | |
| "learning_rate": 7.625027965746634e-06, | |
| "loss": 1.1473, | |
| "mean_token_accuracy": 0.7597916102409363, | |
| "num_tokens": 871537045.0, | |
| "step": 35450 | |
| }, | |
| { | |
| "entropy": 1.5389190435409545, | |
| "epoch": 1.681826795527762, | |
| "grad_norm": 1.2796293497085571, | |
| "learning_rate": 7.515666148535067e-06, | |
| "loss": 1.1159, | |
| "mean_token_accuracy": 0.7650646787881851, | |
| "num_tokens": 872759023.0, | |
| "step": 35500 | |
| }, | |
| { | |
| "entropy": 1.5631573498249054, | |
| "epoch": 1.684195565662308, | |
| "grad_norm": 1.503520131111145, | |
| "learning_rate": 7.407030500668971e-06, | |
| "loss": 1.1688, | |
| "mean_token_accuracy": 0.7553135341405869, | |
| "num_tokens": 873995801.0, | |
| "step": 35550 | |
| }, | |
| { | |
| "entropy": 1.5810019493103027, | |
| "epoch": 1.6865643357968543, | |
| "grad_norm": 1.126876950263977, | |
| "learning_rate": 7.299122879018155e-06, | |
| "loss": 1.1475, | |
| "mean_token_accuracy": 0.7582389563322067, | |
| "num_tokens": 875225780.0, | |
| "step": 35600 | |
| }, | |
| { | |
| "entropy": 1.5617643618583679, | |
| "epoch": 1.6889331059314006, | |
| "grad_norm": 1.041165828704834, | |
| "learning_rate": 7.191945128008548e-06, | |
| "loss": 1.1599, | |
| "mean_token_accuracy": 0.7565414899587631, | |
| "num_tokens": 876441973.0, | |
| "step": 35650 | |
| }, | |
| { | |
| "entropy": 1.5521779787540435, | |
| "epoch": 1.6913018760659466, | |
| "grad_norm": 1.024032711982727, | |
| "learning_rate": 7.085499079590674e-06, | |
| "loss": 1.1359, | |
| "mean_token_accuracy": 0.7614186578989028, | |
| "num_tokens": 877691572.0, | |
| "step": 35700 | |
| }, | |
| { | |
| "entropy": 1.5647426414489747, | |
| "epoch": 1.6936706462004927, | |
| "grad_norm": 1.4068409204483032, | |
| "learning_rate": 6.979786553208306e-06, | |
| "loss": 1.1434, | |
| "mean_token_accuracy": 0.7604501461982727, | |
| "num_tokens": 878910690.0, | |
| "step": 35750 | |
| }, | |
| { | |
| "entropy": 1.5498666989803314, | |
| "epoch": 1.6960394163350387, | |
| "grad_norm": 1.2371301651000977, | |
| "learning_rate": 6.8748093557674084e-06, | |
| "loss": 1.1359, | |
| "mean_token_accuracy": 0.7612047231197357, | |
| "num_tokens": 880160478.0, | |
| "step": 35800 | |
| }, | |
| { | |
| "entropy": 1.5569170558452605, | |
| "epoch": 1.698408186469585, | |
| "grad_norm": 1.2279973030090332, | |
| "learning_rate": 6.770569281605244e-06, | |
| "loss": 1.1249, | |
| "mean_token_accuracy": 0.7620308262109756, | |
| "num_tokens": 881367218.0, | |
| "step": 35850 | |
| }, | |
| { | |
| "entropy": 1.5841618192195892, | |
| "epoch": 1.700776956604131, | |
| "grad_norm": 1.173069953918457, | |
| "learning_rate": 6.667068112459662e-06, | |
| "loss": 1.1585, | |
| "mean_token_accuracy": 0.7556693691015244, | |
| "num_tokens": 882584025.0, | |
| "step": 35900 | |
| }, | |
| { | |
| "entropy": 1.5525937521457671, | |
| "epoch": 1.7031457267386774, | |
| "grad_norm": 1.2860488891601562, | |
| "learning_rate": 6.56430761743872e-06, | |
| "loss": 1.1681, | |
| "mean_token_accuracy": 0.7552351075410842, | |
| "num_tokens": 883859871.0, | |
| "step": 35950 | |
| }, | |
| { | |
| "entropy": 1.5703179001808167, | |
| "epoch": 1.7055144968732234, | |
| "grad_norm": 1.2826309204101562, | |
| "learning_rate": 6.462289552990353e-06, | |
| "loss": 1.1341, | |
| "mean_token_accuracy": 0.7611742705106735, | |
| "num_tokens": 885071859.0, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.7055144968732234, | |
| "eval_entropy": 1.0768880580103375, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7609103475268005, | |
| "eval_num_tokens": 885071859.0, | |
| "eval_runtime": 728.1701, | |
| "eval_samples_per_second": 34.079, | |
| "eval_steps_per_second": 4.26, | |
| "step": 36000 | |
| }, | |
| { | |
| "entropy": 1.5391144120693208, | |
| "epoch": 1.7078832670077695, | |
| "grad_norm": 1.2603217363357544, | |
| "learning_rate": 6.361015662872433e-06, | |
| "loss": 1.1158, | |
| "mean_token_accuracy": 0.7669140672683716, | |
| "num_tokens": 886316413.0, | |
| "step": 36050 | |
| }, | |
| { | |
| "entropy": 1.5330300676822661, | |
| "epoch": 1.7102520371423156, | |
| "grad_norm": 1.2248510122299194, | |
| "learning_rate": 6.260487678122911e-06, | |
| "loss": 1.0644, | |
| "mean_token_accuracy": 0.7753306698799133, | |
| "num_tokens": 887565349.0, | |
| "step": 36100 | |
| }, | |
| { | |
| "entropy": 1.5597666120529174, | |
| "epoch": 1.7126208072768618, | |
| "grad_norm": 1.0783295631408691, | |
| "learning_rate": 6.160707317030256e-06, | |
| "loss": 1.109, | |
| "mean_token_accuracy": 0.7654628306627274, | |
| "num_tokens": 888770171.0, | |
| "step": 36150 | |
| }, | |
| { | |
| "entropy": 1.5886062026023864, | |
| "epoch": 1.7149895774114081, | |
| "grad_norm": 1.518917202949524, | |
| "learning_rate": 6.0616762851040675e-06, | |
| "loss": 1.1602, | |
| "mean_token_accuracy": 0.7575176376104354, | |
| "num_tokens": 889998256.0, | |
| "step": 36200 | |
| }, | |
| { | |
| "entropy": 1.5691123294830323, | |
| "epoch": 1.7173583475459542, | |
| "grad_norm": 1.1943705081939697, | |
| "learning_rate": 5.963396275045951e-06, | |
| "loss": 1.1476, | |
| "mean_token_accuracy": 0.758755573630333, | |
| "num_tokens": 891199335.0, | |
| "step": 36250 | |
| }, | |
| { | |
| "entropy": 1.5685584223270417, | |
| "epoch": 1.7197271176805002, | |
| "grad_norm": 1.2932955026626587, | |
| "learning_rate": 5.865868966720556e-06, | |
| "loss": 1.1354, | |
| "mean_token_accuracy": 0.7614442694187165, | |
| "num_tokens": 892434722.0, | |
| "step": 36300 | |
| }, | |
| { | |
| "entropy": 1.5710162222385406, | |
| "epoch": 1.7220958878150463, | |
| "grad_norm": 1.200039267539978, | |
| "learning_rate": 5.769096027126869e-06, | |
| "loss": 1.1766, | |
| "mean_token_accuracy": 0.7540206718444824, | |
| "num_tokens": 893676597.0, | |
| "step": 36350 | |
| }, | |
| { | |
| "entropy": 1.562828722000122, | |
| "epoch": 1.7244646579495926, | |
| "grad_norm": 1.2064563035964966, | |
| "learning_rate": 5.673079110369722e-06, | |
| "loss": 1.121, | |
| "mean_token_accuracy": 0.7634602183103562, | |
| "num_tokens": 894910050.0, | |
| "step": 36400 | |
| }, | |
| { | |
| "entropy": 1.5540617489814759, | |
| "epoch": 1.7268334280841389, | |
| "grad_norm": 1.4902048110961914, | |
| "learning_rate": 5.577819857631539e-06, | |
| "loss": 1.1201, | |
| "mean_token_accuracy": 0.7639645302295685, | |
| "num_tokens": 896142711.0, | |
| "step": 36450 | |
| }, | |
| { | |
| "entropy": 1.554260642528534, | |
| "epoch": 1.729202198218685, | |
| "grad_norm": 1.2376636266708374, | |
| "learning_rate": 5.483319897144257e-06, | |
| "loss": 1.141, | |
| "mean_token_accuracy": 0.7609711056947708, | |
| "num_tokens": 897387745.0, | |
| "step": 36500 | |
| }, | |
| { | |
| "entropy": 1.5512582790851592, | |
| "epoch": 1.731570968353231, | |
| "grad_norm": 1.0070257186889648, | |
| "learning_rate": 5.389580844161491e-06, | |
| "loss": 1.151, | |
| "mean_token_accuracy": 0.7582071113586426, | |
| "num_tokens": 898612694.0, | |
| "step": 36550 | |
| }, | |
| { | |
| "entropy": 1.5260178673267364, | |
| "epoch": 1.733939738487777, | |
| "grad_norm": 1.035585880279541, | |
| "learning_rate": 5.296604300930968e-06, | |
| "loss": 1.1097, | |
| "mean_token_accuracy": 0.7681008791923523, | |
| "num_tokens": 899864115.0, | |
| "step": 36600 | |
| }, | |
| { | |
| "entropy": 1.557324800491333, | |
| "epoch": 1.7363085086223233, | |
| "grad_norm": 1.2301568984985352, | |
| "learning_rate": 5.204391856667101e-06, | |
| "loss": 1.1191, | |
| "mean_token_accuracy": 0.7642790126800537, | |
| "num_tokens": 901100268.0, | |
| "step": 36650 | |
| }, | |
| { | |
| "entropy": 1.5380194628238677, | |
| "epoch": 1.7386772787568694, | |
| "grad_norm": 1.246462345123291, | |
| "learning_rate": 5.112945087523824e-06, | |
| "loss": 1.1108, | |
| "mean_token_accuracy": 0.7644780373573303, | |
| "num_tokens": 902310249.0, | |
| "step": 36700 | |
| }, | |
| { | |
| "entropy": 1.5637565624713898, | |
| "epoch": 1.7410460488914157, | |
| "grad_norm": 1.1634399890899658, | |
| "learning_rate": 5.022265556567668e-06, | |
| "loss": 1.1319, | |
| "mean_token_accuracy": 0.7625255084037781, | |
| "num_tokens": 903523545.0, | |
| "step": 36750 | |
| }, | |
| { | |
| "entropy": 1.5375991368293762, | |
| "epoch": 1.7434148190259617, | |
| "grad_norm": 1.3280473947525024, | |
| "learning_rate": 4.9323548137510555e-06, | |
| "loss": 1.1053, | |
| "mean_token_accuracy": 0.7662223023176193, | |
| "num_tokens": 904774364.0, | |
| "step": 36800 | |
| }, | |
| { | |
| "entropy": 1.5782112526893615, | |
| "epoch": 1.7457835891605078, | |
| "grad_norm": 1.3013827800750732, | |
| "learning_rate": 4.843214395885776e-06, | |
| "loss": 1.1594, | |
| "mean_token_accuracy": 0.758129763007164, | |
| "num_tokens": 906007167.0, | |
| "step": 36850 | |
| }, | |
| { | |
| "entropy": 1.541445196866989, | |
| "epoch": 1.7481523592950539, | |
| "grad_norm": 0.9861883521080017, | |
| "learning_rate": 4.754845826616727e-06, | |
| "loss": 1.1442, | |
| "mean_token_accuracy": 0.7601429998874665, | |
| "num_tokens": 907201311.0, | |
| "step": 36900 | |
| }, | |
| { | |
| "entropy": 1.5591549813747405, | |
| "epoch": 1.7505211294296001, | |
| "grad_norm": 1.1912263631820679, | |
| "learning_rate": 4.667250616395885e-06, | |
| "loss": 1.1229, | |
| "mean_token_accuracy": 0.7642272913455963, | |
| "num_tokens": 908429689.0, | |
| "step": 36950 | |
| }, | |
| { | |
| "entropy": 1.53731192111969, | |
| "epoch": 1.7528898995641464, | |
| "grad_norm": 1.2835556268692017, | |
| "learning_rate": 4.580430262456503e-06, | |
| "loss": 1.0855, | |
| "mean_token_accuracy": 0.770037140250206, | |
| "num_tokens": 909656463.0, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 1.7528898995641464, | |
| "eval_entropy": 1.0678023306651703, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7609675386542278, | |
| "eval_num_tokens": 909656463.0, | |
| "eval_runtime": 744.4803, | |
| "eval_samples_per_second": 33.332, | |
| "eval_steps_per_second": 4.167, | |
| "step": 37000 | |
| }, | |
| { | |
| "entropy": 1.555112097263336, | |
| "epoch": 1.7552586696986925, | |
| "grad_norm": 1.195916771888733, | |
| "learning_rate": 4.4943862487874575e-06, | |
| "loss": 1.1449, | |
| "mean_token_accuracy": 0.7592443466186524, | |
| "num_tokens": 910867530.0, | |
| "step": 37050 | |
| }, | |
| { | |
| "entropy": 1.5418341505527495, | |
| "epoch": 1.7576274398332385, | |
| "grad_norm": 1.139011025428772, | |
| "learning_rate": 4.409120046107945e-06, | |
| "loss": 1.1017, | |
| "mean_token_accuracy": 0.7676555049419403, | |
| "num_tokens": 912088709.0, | |
| "step": 37100 | |
| }, | |
| { | |
| "entropy": 1.5542387223243714, | |
| "epoch": 1.7599962099677846, | |
| "grad_norm": 1.1180921792984009, | |
| "learning_rate": 4.324633111842308e-06, | |
| "loss": 1.1473, | |
| "mean_token_accuracy": 0.759157150387764, | |
| "num_tokens": 913332056.0, | |
| "step": 37150 | |
| }, | |
| { | |
| "entropy": 1.5596051335334777, | |
| "epoch": 1.762364980102331, | |
| "grad_norm": 1.006624460220337, | |
| "learning_rate": 4.240926890095148e-06, | |
| "loss": 1.1482, | |
| "mean_token_accuracy": 0.7598807489871979, | |
| "num_tokens": 914537591.0, | |
| "step": 37200 | |
| }, | |
| { | |
| "entropy": 1.5528207927942277, | |
| "epoch": 1.7647337502368772, | |
| "grad_norm": 0.971926748752594, | |
| "learning_rate": 4.158002811626621e-06, | |
| "loss": 1.1571, | |
| "mean_token_accuracy": 0.7576300024986267, | |
| "num_tokens": 915743333.0, | |
| "step": 37250 | |
| }, | |
| { | |
| "entropy": 1.5758486306667328, | |
| "epoch": 1.7671025203714232, | |
| "grad_norm": 1.1977986097335815, | |
| "learning_rate": 4.075862293827986e-06, | |
| "loss": 1.1495, | |
| "mean_token_accuracy": 0.7577051311731339, | |
| "num_tokens": 916959683.0, | |
| "step": 37300 | |
| }, | |
| { | |
| "entropy": 1.544589899778366, | |
| "epoch": 1.7694712905059693, | |
| "grad_norm": 1.3282675743103027, | |
| "learning_rate": 3.994506740697407e-06, | |
| "loss": 1.1269, | |
| "mean_token_accuracy": 0.7617994117736816, | |
| "num_tokens": 918211320.0, | |
| "step": 37350 | |
| }, | |
| { | |
| "entropy": 1.5542384481430054, | |
| "epoch": 1.7718400606405154, | |
| "grad_norm": 1.4619874954223633, | |
| "learning_rate": 3.9139375428159095e-06, | |
| "loss": 1.1173, | |
| "mean_token_accuracy": 0.7629494529962539, | |
| "num_tokens": 919446181.0, | |
| "step": 37400 | |
| }, | |
| { | |
| "entropy": 1.5291326987743377, | |
| "epoch": 1.7742088307750616, | |
| "grad_norm": 1.3056997060775757, | |
| "learning_rate": 3.834156077323636e-06, | |
| "loss": 1.0887, | |
| "mean_token_accuracy": 0.7687935763597489, | |
| "num_tokens": 920685182.0, | |
| "step": 37450 | |
| }, | |
| { | |
| "entropy": 1.5420164275169372, | |
| "epoch": 1.7765776009096077, | |
| "grad_norm": 1.2205777168273926, | |
| "learning_rate": 3.7551637078963085e-06, | |
| "loss": 1.1142, | |
| "mean_token_accuracy": 0.7653926169872284, | |
| "num_tokens": 921939020.0, | |
| "step": 37500 | |
| }, | |
| { | |
| "entropy": 1.567058709859848, | |
| "epoch": 1.778946371044154, | |
| "grad_norm": 0.9547618627548218, | |
| "learning_rate": 3.6769617847219164e-06, | |
| "loss": 1.1223, | |
| "mean_token_accuracy": 0.7639624851942063, | |
| "num_tokens": 923177265.0, | |
| "step": 37550 | |
| }, | |
| { | |
| "entropy": 1.5774991846084594, | |
| "epoch": 1.7813151411787, | |
| "grad_norm": 1.2139365673065186, | |
| "learning_rate": 3.5995516444776276e-06, | |
| "loss": 1.1457, | |
| "mean_token_accuracy": 0.7596712547540665, | |
| "num_tokens": 924378635.0, | |
| "step": 37600 | |
| }, | |
| { | |
| "entropy": 1.5741923189163207, | |
| "epoch": 1.783683911313246, | |
| "grad_norm": 1.3455299139022827, | |
| "learning_rate": 3.5229346103069547e-06, | |
| "loss": 1.1265, | |
| "mean_token_accuracy": 0.7622561120986938, | |
| "num_tokens": 925558387.0, | |
| "step": 37650 | |
| }, | |
| { | |
| "entropy": 1.5316821897029878, | |
| "epoch": 1.7860526814477922, | |
| "grad_norm": 1.833621859550476, | |
| "learning_rate": 3.4471119917971473e-06, | |
| "loss": 1.1108, | |
| "mean_token_accuracy": 0.7637511855363845, | |
| "num_tokens": 926797544.0, | |
| "step": 37700 | |
| }, | |
| { | |
| "entropy": 1.5604101026058197, | |
| "epoch": 1.7884214515823385, | |
| "grad_norm": 1.2970396280288696, | |
| "learning_rate": 3.3720850849567944e-06, | |
| "loss": 1.112, | |
| "mean_token_accuracy": 0.7661514669656754, | |
| "num_tokens": 928034501.0, | |
| "step": 37750 | |
| }, | |
| { | |
| "entropy": 1.543112144470215, | |
| "epoch": 1.7907902217168847, | |
| "grad_norm": 0.9984686970710754, | |
| "learning_rate": 3.297855172193659e-06, | |
| "loss": 1.1264, | |
| "mean_token_accuracy": 0.7631747448444366, | |
| "num_tokens": 929281453.0, | |
| "step": 37800 | |
| }, | |
| { | |
| "entropy": 1.5535426819324494, | |
| "epoch": 1.7931589918514308, | |
| "grad_norm": 1.123579740524292, | |
| "learning_rate": 3.22442352229278e-06, | |
| "loss": 1.1449, | |
| "mean_token_accuracy": 0.760178684592247, | |
| "num_tokens": 930526986.0, | |
| "step": 37850 | |
| }, | |
| { | |
| "entropy": 1.533100154399872, | |
| "epoch": 1.7955277619859769, | |
| "grad_norm": 1.1597360372543335, | |
| "learning_rate": 3.1517913903947707e-06, | |
| "loss": 1.1216, | |
| "mean_token_accuracy": 0.7638274627923965, | |
| "num_tokens": 931736763.0, | |
| "step": 37900 | |
| }, | |
| { | |
| "entropy": 1.5554070842266083, | |
| "epoch": 1.797896532120523, | |
| "grad_norm": 1.2038190364837646, | |
| "learning_rate": 3.0799600179743927e-06, | |
| "loss": 1.1308, | |
| "mean_token_accuracy": 0.7614258807897568, | |
| "num_tokens": 932923483.0, | |
| "step": 37950 | |
| }, | |
| { | |
| "entropy": 1.540804421901703, | |
| "epoch": 1.8002653022550692, | |
| "grad_norm": 1.0928473472595215, | |
| "learning_rate": 3.00893063281929e-06, | |
| "loss": 1.1338, | |
| "mean_token_accuracy": 0.7619293278455734, | |
| "num_tokens": 934170514.0, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.8002653022550692, | |
| "eval_entropy": 1.0719800475410766, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7611359905939115, | |
| "eval_num_tokens": 934170514.0, | |
| "eval_runtime": 723.3133, | |
| "eval_samples_per_second": 34.307, | |
| "eval_steps_per_second": 4.289, | |
| "step": 38000 | |
| }, | |
| { | |
| "entropy": 1.5533955585956574, | |
| "epoch": 1.8026340723896153, | |
| "grad_norm": 1.0474472045898438, | |
| "learning_rate": 2.9387044490090385e-06, | |
| "loss": 1.1715, | |
| "mean_token_accuracy": 0.7546948331594467, | |
| "num_tokens": 935423080.0, | |
| "step": 38050 | |
| }, | |
| { | |
| "entropy": 1.5497667694091797, | |
| "epoch": 1.8050028425241615, | |
| "grad_norm": 1.1220715045928955, | |
| "learning_rate": 2.869282666894402e-06, | |
| "loss": 1.1327, | |
| "mean_token_accuracy": 0.7612757116556168, | |
| "num_tokens": 936650729.0, | |
| "step": 38100 | |
| }, | |
| { | |
| "entropy": 1.5674825513362884, | |
| "epoch": 1.8073716126587076, | |
| "grad_norm": 1.4526468515396118, | |
| "learning_rate": 2.8006664730767683e-06, | |
| "loss": 1.1427, | |
| "mean_token_accuracy": 0.7606144285202027, | |
| "num_tokens": 937892138.0, | |
| "step": 38150 | |
| }, | |
| { | |
| "entropy": 1.5554044562578202, | |
| "epoch": 1.8097403827932537, | |
| "grad_norm": 1.2976021766662598, | |
| "learning_rate": 2.7328570403879205e-06, | |
| "loss": 1.1397, | |
| "mean_token_accuracy": 0.7616138017177582, | |
| "num_tokens": 939126149.0, | |
| "step": 38200 | |
| }, | |
| { | |
| "entropy": 1.568613636493683, | |
| "epoch": 1.8121091529277997, | |
| "grad_norm": 1.0613607168197632, | |
| "learning_rate": 2.665855527869948e-06, | |
| "loss": 1.1463, | |
| "mean_token_accuracy": 0.759678093791008, | |
| "num_tokens": 940346079.0, | |
| "step": 38250 | |
| }, | |
| { | |
| "entropy": 1.5337522840499878, | |
| "epoch": 1.814477923062346, | |
| "grad_norm": 1.267045259475708, | |
| "learning_rate": 2.59966308075546e-06, | |
| "loss": 1.1493, | |
| "mean_token_accuracy": 0.7598909741640091, | |
| "num_tokens": 941597120.0, | |
| "step": 38300 | |
| }, | |
| { | |
| "entropy": 1.5623528015613557, | |
| "epoch": 1.8168466931968923, | |
| "grad_norm": 1.3286635875701904, | |
| "learning_rate": 2.5342808304479993e-06, | |
| "loss": 1.1713, | |
| "mean_token_accuracy": 0.754482525587082, | |
| "num_tokens": 942822640.0, | |
| "step": 38350 | |
| }, | |
| { | |
| "entropy": 1.5387673115730285, | |
| "epoch": 1.8192154633314384, | |
| "grad_norm": 1.1870768070220947, | |
| "learning_rate": 2.46970989450272e-06, | |
| "loss": 1.1121, | |
| "mean_token_accuracy": 0.7645811969041825, | |
| "num_tokens": 944054652.0, | |
| "step": 38400 | |
| }, | |
| { | |
| "entropy": 1.5411259424686432, | |
| "epoch": 1.8215842334659844, | |
| "grad_norm": 1.487240195274353, | |
| "learning_rate": 2.405951376607257e-06, | |
| "loss": 1.0865, | |
| "mean_token_accuracy": 0.7703835678100586, | |
| "num_tokens": 945284822.0, | |
| "step": 38450 | |
| }, | |
| { | |
| "entropy": 1.586391316652298, | |
| "epoch": 1.8239530036005305, | |
| "grad_norm": 1.198615312576294, | |
| "learning_rate": 2.3430063665628943e-06, | |
| "loss": 1.163, | |
| "mean_token_accuracy": 0.7574268835783005, | |
| "num_tokens": 946506870.0, | |
| "step": 38500 | |
| }, | |
| { | |
| "entropy": 1.5495548892021178, | |
| "epoch": 1.8263217737350768, | |
| "grad_norm": 1.2452329397201538, | |
| "learning_rate": 2.280875940265903e-06, | |
| "loss": 1.1172, | |
| "mean_token_accuracy": 0.7652907830476761, | |
| "num_tokens": 947752324.0, | |
| "step": 38550 | |
| }, | |
| { | |
| "entropy": 1.5577371573448182, | |
| "epoch": 1.828690543869623, | |
| "grad_norm": 1.1785380840301514, | |
| "learning_rate": 2.2195611596891872e-06, | |
| "loss": 1.113, | |
| "mean_token_accuracy": 0.7650933820009231, | |
| "num_tokens": 948980177.0, | |
| "step": 38600 | |
| }, | |
| { | |
| "entropy": 1.5553138053417206, | |
| "epoch": 1.831059314004169, | |
| "grad_norm": 1.3556625843048096, | |
| "learning_rate": 2.159063072864087e-06, | |
| "loss": 1.1187, | |
| "mean_token_accuracy": 0.7657572621107102, | |
| "num_tokens": 950168267.0, | |
| "step": 38650 | |
| }, | |
| { | |
| "entropy": 1.5441812425851822, | |
| "epoch": 1.8334280841387152, | |
| "grad_norm": 1.2191582918167114, | |
| "learning_rate": 2.09938271386253e-06, | |
| "loss": 1.132, | |
| "mean_token_accuracy": 0.7622571617364884, | |
| "num_tokens": 951380706.0, | |
| "step": 38700 | |
| }, | |
| { | |
| "entropy": 1.5715741848945617, | |
| "epoch": 1.8357968542732612, | |
| "grad_norm": 1.22894287109375, | |
| "learning_rate": 2.040521102779286e-06, | |
| "loss": 1.1266, | |
| "mean_token_accuracy": 0.7630192279815674, | |
| "num_tokens": 952592270.0, | |
| "step": 38750 | |
| }, | |
| { | |
| "entropy": 1.5562168991565704, | |
| "epoch": 1.8381656244078075, | |
| "grad_norm": 1.5664132833480835, | |
| "learning_rate": 1.982479245714569e-06, | |
| "loss": 1.1185, | |
| "mean_token_accuracy": 0.765923129916191, | |
| "num_tokens": 953815987.0, | |
| "step": 38800 | |
| }, | |
| { | |
| "entropy": 1.574343602657318, | |
| "epoch": 1.8405343945423536, | |
| "grad_norm": 1.1616158485412598, | |
| "learning_rate": 1.925258134756858e-06, | |
| "loss": 1.1508, | |
| "mean_token_accuracy": 0.7590267878770828, | |
| "num_tokens": 955053412.0, | |
| "step": 38850 | |
| }, | |
| { | |
| "entropy": 1.5616822016239167, | |
| "epoch": 1.8429031646768999, | |
| "grad_norm": 1.0465819835662842, | |
| "learning_rate": 1.8688587479658793e-06, | |
| "loss": 1.1668, | |
| "mean_token_accuracy": 0.7562251263856887, | |
| "num_tokens": 956294592.0, | |
| "step": 38900 | |
| }, | |
| { | |
| "entropy": 1.533266224861145, | |
| "epoch": 1.845271934811446, | |
| "grad_norm": 1.3192518949508667, | |
| "learning_rate": 1.8132820493559521e-06, | |
| "loss": 1.1001, | |
| "mean_token_accuracy": 0.7673191577196121, | |
| "num_tokens": 957522550.0, | |
| "step": 38950 | |
| }, | |
| { | |
| "entropy": 1.552718700170517, | |
| "epoch": 1.847640704945992, | |
| "grad_norm": 1.5773288011550903, | |
| "learning_rate": 1.758528988879471e-06, | |
| "loss": 1.1048, | |
| "mean_token_accuracy": 0.76726045191288, | |
| "num_tokens": 958757267.0, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.847640704945992, | |
| "eval_entropy": 1.0707699135186055, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7611362978498987, | |
| "eval_num_tokens": 958757267.0, | |
| "eval_runtime": 728.2004, | |
| "eval_samples_per_second": 34.077, | |
| "eval_steps_per_second": 4.26, | |
| "step": 39000 | |
| }, | |
| { | |
| "entropy": 1.5483810114860534, | |
| "epoch": 1.850009475080538, | |
| "grad_norm": 1.4050700664520264, | |
| "learning_rate": 1.704600502410686e-06, | |
| "loss": 1.1036, | |
| "mean_token_accuracy": 0.7662443941831589, | |
| "num_tokens": 959971891.0, | |
| "step": 39050 | |
| }, | |
| { | |
| "entropy": 1.538731288909912, | |
| "epoch": 1.8523782452150843, | |
| "grad_norm": 1.1801916360855103, | |
| "learning_rate": 1.6514975117296994e-06, | |
| "loss": 1.101, | |
| "mean_token_accuracy": 0.7680959284305573, | |
| "num_tokens": 961210335.0, | |
| "step": 39100 | |
| }, | |
| { | |
| "entropy": 1.5468962013721466, | |
| "epoch": 1.8547470153496306, | |
| "grad_norm": 1.2052723169326782, | |
| "learning_rate": 1.599220924506728e-06, | |
| "loss": 1.1602, | |
| "mean_token_accuracy": 0.7576704436540603, | |
| "num_tokens": 962476786.0, | |
| "step": 39150 | |
| }, | |
| { | |
| "entropy": 1.5435814583301544, | |
| "epoch": 1.8571157854841767, | |
| "grad_norm": 1.056043267250061, | |
| "learning_rate": 1.547771634286549e-06, | |
| "loss": 1.1059, | |
| "mean_token_accuracy": 0.7669939565658569, | |
| "num_tokens": 963730472.0, | |
| "step": 39200 | |
| }, | |
| { | |
| "entropy": 1.557974625825882, | |
| "epoch": 1.8594845556187227, | |
| "grad_norm": 1.2635796070098877, | |
| "learning_rate": 1.4971505204732673e-06, | |
| "loss": 1.1212, | |
| "mean_token_accuracy": 0.7648367810249329, | |
| "num_tokens": 964964095.0, | |
| "step": 39250 | |
| }, | |
| { | |
| "entropy": 1.5005019557476045, | |
| "epoch": 1.8618533257532688, | |
| "grad_norm": 1.2261488437652588, | |
| "learning_rate": 1.4473584483152614e-06, | |
| "loss": 1.0945, | |
| "mean_token_accuracy": 0.7694985699653626, | |
| "num_tokens": 966211342.0, | |
| "step": 39300 | |
| }, | |
| { | |
| "entropy": 1.595631295442581, | |
| "epoch": 1.864222095887815, | |
| "grad_norm": 1.1187533140182495, | |
| "learning_rate": 1.3983962688904062e-06, | |
| "loss": 1.1547, | |
| "mean_token_accuracy": 0.7579811322689056, | |
| "num_tokens": 967415509.0, | |
| "step": 39350 | |
| }, | |
| { | |
| "entropy": 1.5132013654708862, | |
| "epoch": 1.8665908660223614, | |
| "grad_norm": 1.1046701669692993, | |
| "learning_rate": 1.3502648190915124e-06, | |
| "loss": 1.1251, | |
| "mean_token_accuracy": 0.7633352410793305, | |
| "num_tokens": 968645582.0, | |
| "step": 39400 | |
| }, | |
| { | |
| "entropy": 1.5683543026447295, | |
| "epoch": 1.8689596361569074, | |
| "grad_norm": 0.9930199384689331, | |
| "learning_rate": 1.3029649216120376e-06, | |
| "loss": 1.1359, | |
| "mean_token_accuracy": 0.7611577039957047, | |
| "num_tokens": 969861208.0, | |
| "step": 39450 | |
| }, | |
| { | |
| "entropy": 1.5816809368133544, | |
| "epoch": 1.8713284062914535, | |
| "grad_norm": 1.0561200380325317, | |
| "learning_rate": 1.2564973849320204e-06, | |
| "loss": 1.14, | |
| "mean_token_accuracy": 0.7613069009780884, | |
| "num_tokens": 971090749.0, | |
| "step": 39500 | |
| }, | |
| { | |
| "entropy": 1.5379055535793305, | |
| "epoch": 1.8736971764259995, | |
| "grad_norm": 1.1951854228973389, | |
| "learning_rate": 1.2108630033042412e-06, | |
| "loss": 1.1165, | |
| "mean_token_accuracy": 0.7650814574956893, | |
| "num_tokens": 972346968.0, | |
| "step": 39550 | |
| }, | |
| { | |
| "entropy": 1.5865603411197662, | |
| "epoch": 1.8760659465605458, | |
| "grad_norm": 1.2471119165420532, | |
| "learning_rate": 1.1660625567406768e-06, | |
| "loss": 1.1328, | |
| "mean_token_accuracy": 0.7614058357477188, | |
| "num_tokens": 973571764.0, | |
| "step": 39600 | |
| }, | |
| { | |
| "entropy": 1.5396224319934846, | |
| "epoch": 1.8784347166950919, | |
| "grad_norm": 1.348791241645813, | |
| "learning_rate": 1.1220968109991515e-06, | |
| "loss": 1.0901, | |
| "mean_token_accuracy": 0.769990593791008, | |
| "num_tokens": 974799757.0, | |
| "step": 39650 | |
| }, | |
| { | |
| "entropy": 1.553302252292633, | |
| "epoch": 1.8808034868296382, | |
| "grad_norm": 1.2576488256454468, | |
| "learning_rate": 1.0789665175702456e-06, | |
| "loss": 1.1081, | |
| "mean_token_accuracy": 0.7674774092435837, | |
| "num_tokens": 976007380.0, | |
| "step": 39700 | |
| }, | |
| { | |
| "entropy": 1.5356010353565217, | |
| "epoch": 1.8831722569641842, | |
| "grad_norm": 1.2327417135238647, | |
| "learning_rate": 1.036672413664458e-06, | |
| "loss": 1.1085, | |
| "mean_token_accuracy": 0.7655727046728135, | |
| "num_tokens": 977252116.0, | |
| "step": 39750 | |
| }, | |
| { | |
| "entropy": 1.5608667409420014, | |
| "epoch": 1.8855410270987303, | |
| "grad_norm": 1.2925286293029785, | |
| "learning_rate": 9.952152221996024e-07, | |
| "loss": 1.1201, | |
| "mean_token_accuracy": 0.7642844372987747, | |
| "num_tokens": 978457729.0, | |
| "step": 39800 | |
| }, | |
| { | |
| "entropy": 1.5483964371681214, | |
| "epoch": 1.8879097972332763, | |
| "grad_norm": 1.4176242351531982, | |
| "learning_rate": 9.54595651788448e-07, | |
| "loss": 1.1321, | |
| "mean_token_accuracy": 0.7614534211158752, | |
| "num_tokens": 979721797.0, | |
| "step": 39850 | |
| }, | |
| { | |
| "entropy": 1.556614215373993, | |
| "epoch": 1.8902785673678226, | |
| "grad_norm": 1.2233829498291016, | |
| "learning_rate": 9.148143967266209e-07, | |
| "loss": 1.1657, | |
| "mean_token_accuracy": 0.7561358803510666, | |
| "num_tokens": 980985021.0, | |
| "step": 39900 | |
| }, | |
| { | |
| "entropy": 1.5509166061878203, | |
| "epoch": 1.892647337502369, | |
| "grad_norm": 1.1454182863235474, | |
| "learning_rate": 8.758721369807099e-07, | |
| "loss": 1.103, | |
| "mean_token_accuracy": 0.7666506910324097, | |
| "num_tokens": 982221836.0, | |
| "step": 39950 | |
| }, | |
| { | |
| "entropy": 1.5321409046649932, | |
| "epoch": 1.895016107636915, | |
| "grad_norm": 1.1777846813201904, | |
| "learning_rate": 8.377695381766804e-07, | |
| "loss": 1.1016, | |
| "mean_token_accuracy": 0.7675345009565353, | |
| "num_tokens": 983496374.0, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 1.895016107636915, | |
| "eval_entropy": 1.0718190068778954, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7612931992664097, | |
| "eval_num_tokens": 983496374.0, | |
| "eval_runtime": 728.5673, | |
| "eval_samples_per_second": 34.06, | |
| "eval_steps_per_second": 4.258, | |
| "step": 40000 | |
| }, | |
| { | |
| "entropy": 1.563276962041855, | |
| "epoch": 1.897384877771461, | |
| "grad_norm": 1.0903818607330322, | |
| "learning_rate": 8.00507251588456e-07, | |
| "loss": 1.1352, | |
| "mean_token_accuracy": 0.761111610531807, | |
| "num_tokens": 984743304.0, | |
| "step": 40050 | |
| }, | |
| { | |
| "entropy": 1.5520950222015382, | |
| "epoch": 1.899753647906007, | |
| "grad_norm": 1.4384498596191406, | |
| "learning_rate": 7.64085914126822e-07, | |
| "loss": 1.105, | |
| "mean_token_accuracy": 0.7678878873586654, | |
| "num_tokens": 985955766.0, | |
| "step": 40100 | |
| }, | |
| { | |
| "entropy": 1.522156765460968, | |
| "epoch": 1.9021224180405534, | |
| "grad_norm": 1.1247589588165283, | |
| "learning_rate": 7.285061483285227e-07, | |
| "loss": 1.0875, | |
| "mean_token_accuracy": 0.7697209113836289, | |
| "num_tokens": 987194885.0, | |
| "step": 40150 | |
| }, | |
| { | |
| "entropy": 1.5538606441020966, | |
| "epoch": 1.9044911881750994, | |
| "grad_norm": 1.1288164854049683, | |
| "learning_rate": 6.937685623456147e-07, | |
| "loss": 1.1135, | |
| "mean_token_accuracy": 0.7657132083177567, | |
| "num_tokens": 988419847.0, | |
| "step": 40200 | |
| }, | |
| { | |
| "entropy": 1.5719120705127716, | |
| "epoch": 1.9068599583096457, | |
| "grad_norm": 1.0764816999435425, | |
| "learning_rate": 6.598737499350915e-07, | |
| "loss": 1.1339, | |
| "mean_token_accuracy": 0.7616824221611023, | |
| "num_tokens": 989644278.0, | |
| "step": 40250 | |
| }, | |
| { | |
| "entropy": 1.531646077632904, | |
| "epoch": 1.9092287284441918, | |
| "grad_norm": 1.442927360534668, | |
| "learning_rate": 6.268222904487087e-07, | |
| "loss": 1.1163, | |
| "mean_token_accuracy": 0.7660135048627853, | |
| "num_tokens": 990874401.0, | |
| "step": 40300 | |
| }, | |
| { | |
| "entropy": 1.5490214943885803, | |
| "epoch": 1.9115974985787378, | |
| "grad_norm": 1.215856671333313, | |
| "learning_rate": 5.946147488231135e-07, | |
| "loss": 1.1019, | |
| "mean_token_accuracy": 0.7664029818773269, | |
| "num_tokens": 992095108.0, | |
| "step": 40350 | |
| }, | |
| { | |
| "entropy": 1.5366157114505767, | |
| "epoch": 1.913966268713284, | |
| "grad_norm": 1.2367199659347534, | |
| "learning_rate": 5.632516755701588e-07, | |
| "loss": 1.1322, | |
| "mean_token_accuracy": 0.763009768128395, | |
| "num_tokens": 993348612.0, | |
| "step": 40400 | |
| }, | |
| { | |
| "entropy": 1.5416124892234802, | |
| "epoch": 1.9163350388478302, | |
| "grad_norm": 1.5094550848007202, | |
| "learning_rate": 5.327336067674992e-07, | |
| "loss": 1.121, | |
| "mean_token_accuracy": 0.7645821911096573, | |
| "num_tokens": 994580681.0, | |
| "step": 40450 | |
| }, | |
| { | |
| "entropy": 1.5396121156215667, | |
| "epoch": 1.9187038089823765, | |
| "grad_norm": 1.1620501279830933, | |
| "learning_rate": 5.030610640494427e-07, | |
| "loss": 1.0964, | |
| "mean_token_accuracy": 0.7677625626325607, | |
| "num_tokens": 995811651.0, | |
| "step": 40500 | |
| }, | |
| { | |
| "entropy": 1.551676151752472, | |
| "epoch": 1.9210725791169225, | |
| "grad_norm": 1.1541757583618164, | |
| "learning_rate": 4.7423455459803536e-07, | |
| "loss": 1.1273, | |
| "mean_token_accuracy": 0.7630288958549499, | |
| "num_tokens": 997009574.0, | |
| "step": 40550 | |
| }, | |
| { | |
| "entropy": 1.5580232727527619, | |
| "epoch": 1.9234413492514686, | |
| "grad_norm": 1.1015989780426025, | |
| "learning_rate": 4.46254571134358e-07, | |
| "loss": 1.1074, | |
| "mean_token_accuracy": 0.7681276690959931, | |
| "num_tokens": 998241396.0, | |
| "step": 40600 | |
| }, | |
| { | |
| "entropy": 1.5923674273490906, | |
| "epoch": 1.9258101193860147, | |
| "grad_norm": 1.3336706161499023, | |
| "learning_rate": 4.1912159191015433e-07, | |
| "loss": 1.1296, | |
| "mean_token_accuracy": 0.7642046666145325, | |
| "num_tokens": 999467429.0, | |
| "step": 40650 | |
| }, | |
| { | |
| "entropy": 1.5437606346607209, | |
| "epoch": 1.928178889520561, | |
| "grad_norm": 1.2162717580795288, | |
| "learning_rate": 3.928360806996212e-07, | |
| "loss": 1.1328, | |
| "mean_token_accuracy": 0.7629256331920624, | |
| "num_tokens": 1000699068.0, | |
| "step": 40700 | |
| }, | |
| { | |
| "entropy": 1.5417852425575256, | |
| "epoch": 1.9305476596551072, | |
| "grad_norm": 1.6152098178863525, | |
| "learning_rate": 3.673984867914815e-07, | |
| "loss": 1.1152, | |
| "mean_token_accuracy": 0.7649567657709122, | |
| "num_tokens": 1001935400.0, | |
| "step": 40750 | |
| }, | |
| { | |
| "entropy": 1.5360798180103301, | |
| "epoch": 1.9329164297896533, | |
| "grad_norm": 1.1792229413986206, | |
| "learning_rate": 3.4280924498132917e-07, | |
| "loss": 1.0897, | |
| "mean_token_accuracy": 0.7702373021841049, | |
| "num_tokens": 1003148072.0, | |
| "step": 40800 | |
| }, | |
| { | |
| "entropy": 1.5614196360111237, | |
| "epoch": 1.9352851999241993, | |
| "grad_norm": 1.244520664215088, | |
| "learning_rate": 3.1906877556417414e-07, | |
| "loss": 1.1636, | |
| "mean_token_accuracy": 0.7582697266340256, | |
| "num_tokens": 1004371846.0, | |
| "step": 40850 | |
| }, | |
| { | |
| "entropy": 1.5311019134521484, | |
| "epoch": 1.9376539700587454, | |
| "grad_norm": 1.064553141593933, | |
| "learning_rate": 2.961774843272702e-07, | |
| "loss": 1.0873, | |
| "mean_token_accuracy": 0.7693702638149261, | |
| "num_tokens": 1005607072.0, | |
| "step": 40900 | |
| }, | |
| { | |
| "entropy": 1.5764271855354308, | |
| "epoch": 1.9400227401932917, | |
| "grad_norm": 1.3938215970993042, | |
| "learning_rate": 2.7413576254317065e-07, | |
| "loss": 1.1587, | |
| "mean_token_accuracy": 0.7578467607498169, | |
| "num_tokens": 1006850136.0, | |
| "step": 40950 | |
| }, | |
| { | |
| "entropy": 1.5480285215377807, | |
| "epoch": 1.9423915103278377, | |
| "grad_norm": 1.2054554224014282, | |
| "learning_rate": 2.529439869630612e-07, | |
| "loss": 1.1434, | |
| "mean_token_accuracy": 0.7606914877891541, | |
| "num_tokens": 1008107025.0, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 1.9423915103278377, | |
| "eval_entropy": 1.068777510758218, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.7612478421396935, | |
| "eval_num_tokens": 1008107025.0, | |
| "eval_runtime": 729.2965, | |
| "eval_samples_per_second": 34.026, | |
| "eval_steps_per_second": 4.253, | |
| "step": 41000 | |
| }, | |
| { | |
| "entropy": 1.5319510400295258, | |
| "epoch": 1.944760280462384, | |
| "grad_norm": 1.2180676460266113, | |
| "learning_rate": 2.326025198102877e-07, | |
| "loss": 1.1041, | |
| "mean_token_accuracy": 0.7670031028985977, | |
| "num_tokens": 1009369117.0, | |
| "step": 41050 | |
| }, | |
| { | |
| "entropy": 1.5614441645145416, | |
| "epoch": 1.94712905059693, | |
| "grad_norm": 1.0921282768249512, | |
| "learning_rate": 2.1311170877418296e-07, | |
| "loss": 1.1394, | |
| "mean_token_accuracy": 0.7615219783782959, | |
| "num_tokens": 1010587367.0, | |
| "step": 41100 | |
| }, | |
| { | |
| "entropy": 1.5393009448051453, | |
| "epoch": 1.9494978207314761, | |
| "grad_norm": 0.9701796770095825, | |
| "learning_rate": 1.9447188700413287e-07, | |
| "loss": 1.097, | |
| "mean_token_accuracy": 0.7700717490911484, | |
| "num_tokens": 1011833377.0, | |
| "step": 41150 | |
| }, | |
| { | |
| "entropy": 1.541386902332306, | |
| "epoch": 1.9518665908660222, | |
| "grad_norm": 1.24893319606781, | |
| "learning_rate": 1.7668337310386418e-07, | |
| "loss": 1.1177, | |
| "mean_token_accuracy": 0.7653918391466141, | |
| "num_tokens": 1013068815.0, | |
| "step": 41200 | |
| }, | |
| { | |
| "entropy": 1.5547258961200714, | |
| "epoch": 1.9542353610005685, | |
| "grad_norm": 1.079546570777893, | |
| "learning_rate": 1.5974647112600994e-07, | |
| "loss": 1.123, | |
| "mean_token_accuracy": 0.7637561255693436, | |
| "num_tokens": 1014274097.0, | |
| "step": 41250 | |
| }, | |
| { | |
| "entropy": 1.5249077999591827, | |
| "epoch": 1.9566041311351148, | |
| "grad_norm": 1.5189976692199707, | |
| "learning_rate": 1.436614705669026e-07, | |
| "loss": 1.1098, | |
| "mean_token_accuracy": 0.7675369191169739, | |
| "num_tokens": 1015545120.0, | |
| "step": 41300 | |
| }, | |
| { | |
| "entropy": 1.569046869277954, | |
| "epoch": 1.9589729012696608, | |
| "grad_norm": 1.0235174894332886, | |
| "learning_rate": 1.2842864636164464e-07, | |
| "loss": 1.1425, | |
| "mean_token_accuracy": 0.7613073486089706, | |
| "num_tokens": 1016784041.0, | |
| "step": 41350 | |
| }, | |
| { | |
| "entropy": 1.5418232583999634, | |
| "epoch": 1.961341671404207, | |
| "grad_norm": 1.1395779848098755, | |
| "learning_rate": 1.1404825887937898e-07, | |
| "loss": 1.1096, | |
| "mean_token_accuracy": 0.7651181477308273, | |
| "num_tokens": 1018023715.0, | |
| "step": 41400 | |
| }, | |
| { | |
| "entropy": 1.5499148654937744, | |
| "epoch": 1.963710441538753, | |
| "grad_norm": 1.103959560394287, | |
| "learning_rate": 1.0052055391887027e-07, | |
| "loss": 1.1536, | |
| "mean_token_accuracy": 0.7575215709209442, | |
| "num_tokens": 1019247827.0, | |
| "step": 41450 | |
| }, | |
| { | |
| "entropy": 1.5448267459869385, | |
| "epoch": 1.9660792116732992, | |
| "grad_norm": 1.159401297569275, | |
| "learning_rate": 8.784576270428058e-08, | |
| "loss": 1.1252, | |
| "mean_token_accuracy": 0.7638620465993882, | |
| "num_tokens": 1020467713.0, | |
| "step": 41500 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 42216, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.1122127009405626e+20, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |