Text Generation
Transformers
Safetensors
qwen3
8B
restoration
joseon-dynasty
conversational
text-generation-inference
Instructions to use DAMI-Lab/ARI-8B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use DAMI-Lab/ARI-8B with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="DAMI-Lab/ARI-8B")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("DAMI-Lab/ARI-8B")
model = AutoModelForCausalLM.from_pretrained("DAMI-Lab/ARI-8B")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use DAMI-Lab/ARI-8B with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "DAMI-Lab/ARI-8B"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "DAMI-Lab/ARI-8B",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker
docker model run hf.co/DAMI-Lab/ARI-8B
- SGLang
How to use DAMI-Lab/ARI-8B with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "DAMI-Lab/ARI-8B" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "DAMI-Lab/ARI-8B",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker images
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "DAMI-Lab/ARI-8B" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "DAMI-Lab/ARI-8B",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
- Docker Model Runner
How to use DAMI-Lab/ARI-8B with Docker Model Runner:
docker model run hf.co/DAMI-Lab/ARI-8B
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 10000, | |
| "global_step": 81684, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.13171875, | |
| "epoch": 0.0012242299593555653, | |
| "grad_norm": 83.0, | |
| "learning_rate": 7.197062423500612e-08, | |
| "loss": 1.0766, | |
| "mean_token_accuracy": 0.8785432744026184, | |
| "num_tokens": 5226465.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.1265625, | |
| "epoch": 0.0024484599187111307, | |
| "grad_norm": 78.0, | |
| "learning_rate": 1.4541003671970627e-07, | |
| "loss": 1.0658, | |
| "mean_token_accuracy": 0.8774244856834411, | |
| "num_tokens": 10264339.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.12125, | |
| "epoch": 0.003672689878066696, | |
| "grad_norm": 65.5, | |
| "learning_rate": 2.1884944920440638e-07, | |
| "loss": 1.0161, | |
| "mean_token_accuracy": 0.8803484535217285, | |
| "num_tokens": 15456221.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.12859375, | |
| "epoch": 0.004896919837422261, | |
| "grad_norm": 73.0, | |
| "learning_rate": 2.922888616891065e-07, | |
| "loss": 0.9905, | |
| "mean_token_accuracy": 0.8795530760288238, | |
| "num_tokens": 20433188.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.14265625, | |
| "epoch": 0.006121149796777827, | |
| "grad_norm": 69.0, | |
| "learning_rate": 3.6572827417380663e-07, | |
| "loss": 0.8835, | |
| "mean_token_accuracy": 0.884142210483551, | |
| "num_tokens": 25654586.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.1475, | |
| "epoch": 0.007345379756133392, | |
| "grad_norm": 60.0, | |
| "learning_rate": 4.391676866585067e-07, | |
| "loss": 0.7555, | |
| "mean_token_accuracy": 0.8876442670822143, | |
| "num_tokens": 30682210.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.14234375, | |
| "epoch": 0.008569609715488957, | |
| "grad_norm": 30.0, | |
| "learning_rate": 5.126070991432069e-07, | |
| "loss": 0.691, | |
| "mean_token_accuracy": 0.8891012752056122, | |
| "num_tokens": 36107614.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.1603125, | |
| "epoch": 0.009793839674844523, | |
| "grad_norm": 15.6875, | |
| "learning_rate": 5.860465116279069e-07, | |
| "loss": 0.5872, | |
| "mean_token_accuracy": 0.9278727066516876, | |
| "num_tokens": 41528585.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.20984375, | |
| "epoch": 0.011018069634200088, | |
| "grad_norm": 10.375, | |
| "learning_rate": 6.594859241126071e-07, | |
| "loss": 0.5128, | |
| "mean_token_accuracy": 0.9328850126266479, | |
| "num_tokens": 47205376.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.23328125, | |
| "epoch": 0.012242299593555654, | |
| "grad_norm": 8.875, | |
| "learning_rate": 7.329253365973072e-07, | |
| "loss": 0.464, | |
| "mean_token_accuracy": 0.9372936522960663, | |
| "num_tokens": 52484312.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 1.2515625, | |
| "epoch": 0.013466529552911218, | |
| "grad_norm": 10.375, | |
| "learning_rate": 8.063647490820073e-07, | |
| "loss": 0.4469, | |
| "mean_token_accuracy": 0.9350792992115021, | |
| "num_tokens": 57610761.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 1.28046875, | |
| "epoch": 0.014690759512266784, | |
| "grad_norm": 7.8125, | |
| "learning_rate": 8.798041615667075e-07, | |
| "loss": 0.4233, | |
| "mean_token_accuracy": 0.9361693727970123, | |
| "num_tokens": 62744047.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 1.30703125, | |
| "epoch": 0.01591498947162235, | |
| "grad_norm": 11.375, | |
| "learning_rate": 9.532435740514075e-07, | |
| "loss": 0.4228, | |
| "mean_token_accuracy": 0.9353253149986267, | |
| "num_tokens": 68113654.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 1.3890625, | |
| "epoch": 0.017139219430977914, | |
| "grad_norm": 4.96875, | |
| "learning_rate": 1.0266829865361079e-06, | |
| "loss": 0.3897, | |
| "mean_token_accuracy": 0.934454687833786, | |
| "num_tokens": 73614017.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 1.4703125, | |
| "epoch": 0.01836344939033348, | |
| "grad_norm": 4.25, | |
| "learning_rate": 1.100122399020808e-06, | |
| "loss": 0.3618, | |
| "mean_token_accuracy": 0.9344593751430511, | |
| "num_tokens": 79174232.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 1.52625, | |
| "epoch": 0.019587679349689045, | |
| "grad_norm": 3.671875, | |
| "learning_rate": 1.173561811505508e-06, | |
| "loss": 0.3804, | |
| "mean_token_accuracy": 0.9336516118049621, | |
| "num_tokens": 84608483.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 1.5390625, | |
| "epoch": 0.02081190930904461, | |
| "grad_norm": 3.8125, | |
| "learning_rate": 1.2470012239902082e-06, | |
| "loss": 0.353, | |
| "mean_token_accuracy": 0.9379545438289643, | |
| "num_tokens": 89999996.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 1.54515625, | |
| "epoch": 0.022036139268400177, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 1.3204406364749082e-06, | |
| "loss": 0.3294, | |
| "mean_token_accuracy": 0.9422214996814727, | |
| "num_tokens": 95124008.0, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 1.5690625, | |
| "epoch": 0.02326036922775574, | |
| "grad_norm": 4.09375, | |
| "learning_rate": 1.3938800489596082e-06, | |
| "loss": 0.3514, | |
| "mean_token_accuracy": 0.9378081679344177, | |
| "num_tokens": 100136013.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 1.55765625, | |
| "epoch": 0.02448459918711131, | |
| "grad_norm": 3.171875, | |
| "learning_rate": 1.4673194614443085e-06, | |
| "loss": 0.3343, | |
| "mean_token_accuracy": 0.9409069657325745, | |
| "num_tokens": 105114554.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 1.53890625, | |
| "epoch": 0.025708829146466872, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 1.5407588739290085e-06, | |
| "loss": 0.3284, | |
| "mean_token_accuracy": 0.9414800906181335, | |
| "num_tokens": 110370176.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 1.5546875, | |
| "epoch": 0.026933059105822436, | |
| "grad_norm": 3.625, | |
| "learning_rate": 1.6141982864137085e-06, | |
| "loss": 0.3183, | |
| "mean_token_accuracy": 0.9426046288013459, | |
| "num_tokens": 115321229.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 1.55046875, | |
| "epoch": 0.028157289065178004, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.687637698898409e-06, | |
| "loss": 0.332, | |
| "mean_token_accuracy": 0.9409434747695923, | |
| "num_tokens": 120648053.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 1.54046875, | |
| "epoch": 0.029381519024533568, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 1.761077111383109e-06, | |
| "loss": 0.3266, | |
| "mean_token_accuracy": 0.941448210477829, | |
| "num_tokens": 126142957.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 1.54453125, | |
| "epoch": 0.030605748983889135, | |
| "grad_norm": 3.484375, | |
| "learning_rate": 1.8345165238678093e-06, | |
| "loss": 0.3357, | |
| "mean_token_accuracy": 0.9392308318614959, | |
| "num_tokens": 131721983.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 1.54265625, | |
| "epoch": 0.0318299789432447, | |
| "grad_norm": 3.890625, | |
| "learning_rate": 1.9079559363525093e-06, | |
| "loss": 0.323, | |
| "mean_token_accuracy": 0.9425621521472931, | |
| "num_tokens": 136834110.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 1.55375, | |
| "epoch": 0.03305420890260027, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 1.9813953488372093e-06, | |
| "loss": 0.3103, | |
| "mean_token_accuracy": 0.9435288536548615, | |
| "num_tokens": 142077567.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 1.5815625, | |
| "epoch": 0.03427843886195583, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 2.0548347613219094e-06, | |
| "loss": 0.325, | |
| "mean_token_accuracy": 0.9404278743267059, | |
| "num_tokens": 147938512.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 1.603125, | |
| "epoch": 0.035502668821311395, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 2.1282741738066094e-06, | |
| "loss": 0.292, | |
| "mean_token_accuracy": 0.94657958984375, | |
| "num_tokens": 152967734.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 1.5790625, | |
| "epoch": 0.03672689878066696, | |
| "grad_norm": 4.84375, | |
| "learning_rate": 2.20171358629131e-06, | |
| "loss": 0.3027, | |
| "mean_token_accuracy": 0.9442594313621521, | |
| "num_tokens": 158252140.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 1.5709375, | |
| "epoch": 0.03795112874002252, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 2.27515299877601e-06, | |
| "loss": 0.2935, | |
| "mean_token_accuracy": 0.9451294171810151, | |
| "num_tokens": 163906622.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 1.58890625, | |
| "epoch": 0.03917535869937809, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 2.34859241126071e-06, | |
| "loss": 0.2963, | |
| "mean_token_accuracy": 0.9451341640949249, | |
| "num_tokens": 169532126.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 1.55125, | |
| "epoch": 0.04039958865873366, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 2.42203182374541e-06, | |
| "loss": 0.2704, | |
| "mean_token_accuracy": 0.9490817248821258, | |
| "num_tokens": 174811062.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 1.54515625, | |
| "epoch": 0.04162381861808922, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 2.49547123623011e-06, | |
| "loss": 0.2704, | |
| "mean_token_accuracy": 0.9497274696826935, | |
| "num_tokens": 180019609.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 1.545625, | |
| "epoch": 0.042848048577444786, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 2.56891064871481e-06, | |
| "loss": 0.2729, | |
| "mean_token_accuracy": 0.9472972440719605, | |
| "num_tokens": 185410342.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 1.54171875, | |
| "epoch": 0.044072278536800354, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 2.6423500611995105e-06, | |
| "loss": 0.2723, | |
| "mean_token_accuracy": 0.9487947750091553, | |
| "num_tokens": 190878398.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 1.54234375, | |
| "epoch": 0.04529650849615592, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 2.715789473684211e-06, | |
| "loss": 0.2761, | |
| "mean_token_accuracy": 0.9490153706073761, | |
| "num_tokens": 196027836.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 1.53703125, | |
| "epoch": 0.04652073845551148, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 2.789228886168911e-06, | |
| "loss": 0.2882, | |
| "mean_token_accuracy": 0.9455779695510864, | |
| "num_tokens": 201616019.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 1.52796875, | |
| "epoch": 0.04774496841486705, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 2.862668298653611e-06, | |
| "loss": 0.2609, | |
| "mean_token_accuracy": 0.9508947324752808, | |
| "num_tokens": 206756502.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 1.51796875, | |
| "epoch": 0.04896919837422262, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 2.936107711138311e-06, | |
| "loss": 0.2627, | |
| "mean_token_accuracy": 0.9504942214488983, | |
| "num_tokens": 211966544.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "entropy": 1.5309375, | |
| "epoch": 0.05019342833357818, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 3.0095471236230106e-06, | |
| "loss": 0.2622, | |
| "mean_token_accuracy": 0.9505769073963165, | |
| "num_tokens": 217058889.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "entropy": 1.52125, | |
| "epoch": 0.051417658292933745, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 3.082986536107711e-06, | |
| "loss": 0.271, | |
| "mean_token_accuracy": 0.9492247033119202, | |
| "num_tokens": 222302364.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "entropy": 1.5103125, | |
| "epoch": 0.05264188825228931, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 3.1564259485924115e-06, | |
| "loss": 0.2836, | |
| "mean_token_accuracy": 0.9467552089691162, | |
| "num_tokens": 227892169.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "entropy": 1.5121875, | |
| "epoch": 0.05386611821164487, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 3.2298653610771116e-06, | |
| "loss": 0.2772, | |
| "mean_token_accuracy": 0.9473760116100312, | |
| "num_tokens": 233522252.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "entropy": 1.51453125, | |
| "epoch": 0.05509034817100044, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 3.303304773561812e-06, | |
| "loss": 0.2814, | |
| "mean_token_accuracy": 0.9471155571937561, | |
| "num_tokens": 239241678.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "entropy": 1.50359375, | |
| "epoch": 0.05631457813035601, | |
| "grad_norm": 6.65625, | |
| "learning_rate": 3.3767441860465116e-06, | |
| "loss": 0.252, | |
| "mean_token_accuracy": 0.9517535066604614, | |
| "num_tokens": 244573352.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "entropy": 1.489375, | |
| "epoch": 0.05753880808971157, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 3.450183598531212e-06, | |
| "loss": 0.2686, | |
| "mean_token_accuracy": 0.9490506839752197, | |
| "num_tokens": 249799704.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "entropy": 1.5084375, | |
| "epoch": 0.058763038049067136, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 3.5236230110159117e-06, | |
| "loss": 0.2593, | |
| "mean_token_accuracy": 0.951296364068985, | |
| "num_tokens": 255107263.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "entropy": 1.49984375, | |
| "epoch": 0.0599872680084227, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 3.597062423500612e-06, | |
| "loss": 0.2734, | |
| "mean_token_accuracy": 0.9485626530647278, | |
| "num_tokens": 260533835.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "entropy": 1.48375, | |
| "epoch": 0.06121149796777827, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 3.670501835985312e-06, | |
| "loss": 0.2529, | |
| "mean_token_accuracy": 0.9517404818534851, | |
| "num_tokens": 265773085.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "entropy": 1.4821875, | |
| "epoch": 0.06243572792713383, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 3.743941248470012e-06, | |
| "loss": 0.2616, | |
| "mean_token_accuracy": 0.9500162851810455, | |
| "num_tokens": 271036305.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "entropy": 1.47765625, | |
| "epoch": 0.0636599578864894, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 3.817380660954712e-06, | |
| "loss": 0.2462, | |
| "mean_token_accuracy": 0.9525675988197326, | |
| "num_tokens": 275952458.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "entropy": 1.48328125, | |
| "epoch": 0.06488418784584496, | |
| "grad_norm": 2.125, | |
| "learning_rate": 3.890820073439412e-06, | |
| "loss": 0.2592, | |
| "mean_token_accuracy": 0.9498057246208191, | |
| "num_tokens": 281644324.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "entropy": 1.47390625, | |
| "epoch": 0.06610841780520053, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 3.964259485924113e-06, | |
| "loss": 0.2416, | |
| "mean_token_accuracy": 0.9530806469917298, | |
| "num_tokens": 286839400.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "entropy": 1.471875, | |
| "epoch": 0.0673326477645561, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 4.037698898408813e-06, | |
| "loss": 0.2483, | |
| "mean_token_accuracy": 0.9517722308635712, | |
| "num_tokens": 292713093.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "entropy": 1.47, | |
| "epoch": 0.06855687772391166, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 4.111138310893514e-06, | |
| "loss": 0.2357, | |
| "mean_token_accuracy": 0.9542278277873993, | |
| "num_tokens": 297994633.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "entropy": 1.48640625, | |
| "epoch": 0.06978110768326723, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 4.184577723378213e-06, | |
| "loss": 0.2434, | |
| "mean_token_accuracy": 0.9529701387882232, | |
| "num_tokens": 303305735.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "entropy": 1.46640625, | |
| "epoch": 0.07100533764262279, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 4.258017135862914e-06, | |
| "loss": 0.2228, | |
| "mean_token_accuracy": 0.9564117324352265, | |
| "num_tokens": 308665099.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "entropy": 1.47671875, | |
| "epoch": 0.07222956760197835, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 4.331456548347613e-06, | |
| "loss": 0.2485, | |
| "mean_token_accuracy": 0.9520271122455597, | |
| "num_tokens": 313894105.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "entropy": 1.46984375, | |
| "epoch": 0.07345379756133393, | |
| "grad_norm": 2.125, | |
| "learning_rate": 4.404895960832314e-06, | |
| "loss": 0.2354, | |
| "mean_token_accuracy": 0.9531759965419769, | |
| "num_tokens": 319439357.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "entropy": 1.479375, | |
| "epoch": 0.07467802752068949, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 4.478335373317013e-06, | |
| "loss": 0.2506, | |
| "mean_token_accuracy": 0.9517410743236542, | |
| "num_tokens": 325090760.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "entropy": 1.475, | |
| "epoch": 0.07590225748004505, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 4.551774785801714e-06, | |
| "loss": 0.2273, | |
| "mean_token_accuracy": 0.955747674703598, | |
| "num_tokens": 330405470.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "entropy": 1.47546875, | |
| "epoch": 0.07712648743940062, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 4.6252141982864134e-06, | |
| "loss": 0.2391, | |
| "mean_token_accuracy": 0.9522111368179321, | |
| "num_tokens": 335678826.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "entropy": 1.4603125, | |
| "epoch": 0.07835071739875618, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 4.698653610771114e-06, | |
| "loss": 0.2344, | |
| "mean_token_accuracy": 0.9539849495887757, | |
| "num_tokens": 340918671.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "entropy": 1.4509375, | |
| "epoch": 0.07957494735811174, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 4.7720930232558135e-06, | |
| "loss": 0.2191, | |
| "mean_token_accuracy": 0.9559646666049957, | |
| "num_tokens": 346171106.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "entropy": 1.454375, | |
| "epoch": 0.08079917731746732, | |
| "grad_norm": 5.6875, | |
| "learning_rate": 4.845532435740514e-06, | |
| "loss": 0.2356, | |
| "mean_token_accuracy": 0.9528819477558136, | |
| "num_tokens": 351560226.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "entropy": 1.46609375, | |
| "epoch": 0.08202340727682288, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 4.918971848225214e-06, | |
| "loss": 0.2387, | |
| "mean_token_accuracy": 0.9533221650123597, | |
| "num_tokens": 357311606.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "entropy": 1.45046875, | |
| "epoch": 0.08324763723617844, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 4.992411260709914e-06, | |
| "loss": 0.218, | |
| "mean_token_accuracy": 0.9566865241527558, | |
| "num_tokens": 362184714.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "entropy": 1.44765625, | |
| "epoch": 0.08447186719553401, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 5.0658506731946145e-06, | |
| "loss": 0.2163, | |
| "mean_token_accuracy": 0.9571156585216523, | |
| "num_tokens": 367118033.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "entropy": 1.4721875, | |
| "epoch": 0.08569609715488957, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 5.139290085679315e-06, | |
| "loss": 0.2269, | |
| "mean_token_accuracy": 0.9551365935802459, | |
| "num_tokens": 372554179.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "entropy": 1.43546875, | |
| "epoch": 0.08692032711424515, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 5.212729498164015e-06, | |
| "loss": 0.2235, | |
| "mean_token_accuracy": 0.9559626686573028, | |
| "num_tokens": 377909880.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "entropy": 1.4384375, | |
| "epoch": 0.08814455707360071, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 5.286168910648715e-06, | |
| "loss": 0.2151, | |
| "mean_token_accuracy": 0.9575206100940704, | |
| "num_tokens": 383194488.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "entropy": 1.42265625, | |
| "epoch": 0.08936878703295627, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 5.3596083231334154e-06, | |
| "loss": 0.229, | |
| "mean_token_accuracy": 0.9538651633262635, | |
| "num_tokens": 389073618.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "entropy": 1.429375, | |
| "epoch": 0.09059301699231184, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 5.433047735618115e-06, | |
| "loss": 0.2294, | |
| "mean_token_accuracy": 0.9545065891742707, | |
| "num_tokens": 394553347.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "entropy": 1.42375, | |
| "epoch": 0.0918172469516674, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 5.5064871481028155e-06, | |
| "loss": 0.2085, | |
| "mean_token_accuracy": 0.9575728678703308, | |
| "num_tokens": 399579739.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "entropy": 1.411875, | |
| "epoch": 0.09304147691102296, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 5.579926560587515e-06, | |
| "loss": 0.2211, | |
| "mean_token_accuracy": 0.9557280552387237, | |
| "num_tokens": 404841496.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "entropy": 1.40765625, | |
| "epoch": 0.09426570687037854, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 5.6533659730722156e-06, | |
| "loss": 0.2125, | |
| "mean_token_accuracy": 0.9576599287986756, | |
| "num_tokens": 410023001.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "entropy": 1.42984375, | |
| "epoch": 0.0954899368297341, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 5.726805385556916e-06, | |
| "loss": 0.2279, | |
| "mean_token_accuracy": 0.9547258257865906, | |
| "num_tokens": 415549547.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "entropy": 1.3978125, | |
| "epoch": 0.09671416678908966, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 5.800244798041616e-06, | |
| "loss": 0.2232, | |
| "mean_token_accuracy": 0.9551710951328277, | |
| "num_tokens": 421034105.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "entropy": 1.38796875, | |
| "epoch": 0.09793839674844523, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 5.873684210526316e-06, | |
| "loss": 0.2162, | |
| "mean_token_accuracy": 0.9557711553573608, | |
| "num_tokens": 426688731.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "entropy": 1.3903125, | |
| "epoch": 0.0991626267078008, | |
| "grad_norm": 10.25, | |
| "learning_rate": 5.947123623011016e-06, | |
| "loss": 0.2102, | |
| "mean_token_accuracy": 0.9573217809200287, | |
| "num_tokens": 431945587.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "entropy": 1.37515625, | |
| "epoch": 0.10038685666715635, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 5.9999995181245345e-06, | |
| "loss": 0.2068, | |
| "mean_token_accuracy": 0.9580986511707306, | |
| "num_tokens": 436945746.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "entropy": 1.3790625, | |
| "epoch": 0.10161108662651193, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 5.999989929791556e-06, | |
| "loss": 0.2008, | |
| "mean_token_accuracy": 0.9594962692260742, | |
| "num_tokens": 441913649.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "entropy": 1.39890625, | |
| "epoch": 0.10283531658586749, | |
| "grad_norm": 2.25, | |
| "learning_rate": 5.9999680487622435e-06, | |
| "loss": 0.2158, | |
| "mean_token_accuracy": 0.9564687287807465, | |
| "num_tokens": 447263639.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "entropy": 1.39796875, | |
| "epoch": 0.10405954654522305, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 5.999933875126256e-06, | |
| "loss": 0.2235, | |
| "mean_token_accuracy": 0.9537206184864044, | |
| "num_tokens": 452831245.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "entropy": 1.40046875, | |
| "epoch": 0.10528377650457862, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 5.999887409023625e-06, | |
| "loss": 0.1983, | |
| "mean_token_accuracy": 0.9605963575839996, | |
| "num_tokens": 457920235.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "entropy": 1.37109375, | |
| "epoch": 0.10650800646393419, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 5.9998286506447455e-06, | |
| "loss": 0.1985, | |
| "mean_token_accuracy": 0.9589159560203552, | |
| "num_tokens": 463428491.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "entropy": 1.393125, | |
| "epoch": 0.10773223642328975, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 5.999757600230387e-06, | |
| "loss": 0.2181, | |
| "mean_token_accuracy": 0.9564608442783356, | |
| "num_tokens": 469183579.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "entropy": 1.40828125, | |
| "epoch": 0.10895646638264532, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 5.999674258071684e-06, | |
| "loss": 0.1997, | |
| "mean_token_accuracy": 0.9596063613891601, | |
| "num_tokens": 474548123.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "entropy": 1.38171875, | |
| "epoch": 0.11018069634200088, | |
| "grad_norm": 2.25, | |
| "learning_rate": 5.999578624510137e-06, | |
| "loss": 0.2113, | |
| "mean_token_accuracy": 0.9565052735805512, | |
| "num_tokens": 480099691.0, | |
| "step": 4500 | |
| }, | |
| { | |
| "entropy": 1.39328125, | |
| "epoch": 0.11140492630135644, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 5.9994706999376126e-06, | |
| "loss": 0.2096, | |
| "mean_token_accuracy": 0.9578315222263336, | |
| "num_tokens": 485485141.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "entropy": 1.39828125, | |
| "epoch": 0.11262915626071202, | |
| "grad_norm": 2.125, | |
| "learning_rate": 5.999350484796339e-06, | |
| "loss": 0.1935, | |
| "mean_token_accuracy": 0.9609186232089997, | |
| "num_tokens": 490314941.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "entropy": 1.41859375, | |
| "epoch": 0.11385338622006758, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 5.999217979578909e-06, | |
| "loss": 0.2132, | |
| "mean_token_accuracy": 0.9569031345844269, | |
| "num_tokens": 495604676.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "entropy": 1.41984375, | |
| "epoch": 0.11507761617942314, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 5.999073184828273e-06, | |
| "loss": 0.1948, | |
| "mean_token_accuracy": 0.9596328222751618, | |
| "num_tokens": 500772718.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "entropy": 1.42, | |
| "epoch": 0.11630184613877871, | |
| "grad_norm": 2.75, | |
| "learning_rate": 5.998916101137737e-06, | |
| "loss": 0.2128, | |
| "mean_token_accuracy": 0.9574012553691864, | |
| "num_tokens": 506105312.0, | |
| "step": 4750 | |
| }, | |
| { | |
| "entropy": 1.40890625, | |
| "epoch": 0.11752607609813427, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 5.998746729150967e-06, | |
| "loss": 0.2019, | |
| "mean_token_accuracy": 0.958700270652771, | |
| "num_tokens": 511311990.0, | |
| "step": 4800 | |
| }, | |
| { | |
| "entropy": 1.41671875, | |
| "epoch": 0.11875030605748983, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 5.998565069561976e-06, | |
| "loss": 0.2044, | |
| "mean_token_accuracy": 0.9582890093326568, | |
| "num_tokens": 516615202.0, | |
| "step": 4850 | |
| }, | |
| { | |
| "entropy": 1.4115625, | |
| "epoch": 0.1199745360168454, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 5.998371123115128e-06, | |
| "loss": 0.207, | |
| "mean_token_accuracy": 0.9571990466117859, | |
| "num_tokens": 521934656.0, | |
| "step": 4900 | |
| }, | |
| { | |
| "entropy": 1.396875, | |
| "epoch": 0.12119876597620097, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.9981648906051355e-06, | |
| "loss": 0.2069, | |
| "mean_token_accuracy": 0.9578309345245362, | |
| "num_tokens": 527328179.0, | |
| "step": 4950 | |
| }, | |
| { | |
| "entropy": 1.41046875, | |
| "epoch": 0.12242299593555654, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 5.9979463728770525e-06, | |
| "loss": 0.1965, | |
| "mean_token_accuracy": 0.9601268231868744, | |
| "num_tokens": 532420262.0, | |
| "step": 5000 | |
| }, | |
| { | |
| "entropy": 1.3953125, | |
| "epoch": 0.1236472258949121, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 5.997715570826272e-06, | |
| "loss": 0.1938, | |
| "mean_token_accuracy": 0.9605181181430816, | |
| "num_tokens": 537756232.0, | |
| "step": 5050 | |
| }, | |
| { | |
| "entropy": 1.390625, | |
| "epoch": 0.12487145585426766, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 5.997472485398524e-06, | |
| "loss": 0.2038, | |
| "mean_token_accuracy": 0.9585963201522827, | |
| "num_tokens": 543281806.0, | |
| "step": 5100 | |
| }, | |
| { | |
| "entropy": 1.4215625, | |
| "epoch": 0.12609568581362324, | |
| "grad_norm": 1.75, | |
| "learning_rate": 5.99721711758987e-06, | |
| "loss": 0.1969, | |
| "mean_token_accuracy": 0.9599570655822753, | |
| "num_tokens": 548233812.0, | |
| "step": 5150 | |
| }, | |
| { | |
| "entropy": 1.40515625, | |
| "epoch": 0.1273199157729788, | |
| "grad_norm": 2.375, | |
| "learning_rate": 5.9969494684466985e-06, | |
| "loss": 0.2041, | |
| "mean_token_accuracy": 0.9577370703220367, | |
| "num_tokens": 553736654.0, | |
| "step": 5200 | |
| }, | |
| { | |
| "entropy": 1.3990625, | |
| "epoch": 0.12854414573233436, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.996669539065727e-06, | |
| "loss": 0.1945, | |
| "mean_token_accuracy": 0.9612773549556732, | |
| "num_tokens": 558856334.0, | |
| "step": 5250 | |
| }, | |
| { | |
| "entropy": 1.40203125, | |
| "epoch": 0.12976837569168992, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 5.996377330593983e-06, | |
| "loss": 0.2145, | |
| "mean_token_accuracy": 0.9565242063999176, | |
| "num_tokens": 564032272.0, | |
| "step": 5300 | |
| }, | |
| { | |
| "entropy": 1.39671875, | |
| "epoch": 0.13099260565104548, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 5.9960728442288186e-06, | |
| "loss": 0.1992, | |
| "mean_token_accuracy": 0.958374012708664, | |
| "num_tokens": 569306892.0, | |
| "step": 5350 | |
| }, | |
| { | |
| "entropy": 1.38578125, | |
| "epoch": 0.13221683561040107, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 5.995756081217889e-06, | |
| "loss": 0.1979, | |
| "mean_token_accuracy": 0.9593621265888214, | |
| "num_tokens": 574741752.0, | |
| "step": 5400 | |
| }, | |
| { | |
| "entropy": 1.38234375, | |
| "epoch": 0.13344106556975663, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 5.9954270428591555e-06, | |
| "loss": 0.2003, | |
| "mean_token_accuracy": 0.9591895163059234, | |
| "num_tokens": 580457265.0, | |
| "step": 5450 | |
| }, | |
| { | |
| "entropy": 1.394375, | |
| "epoch": 0.1346652955291122, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 5.995085730500878e-06, | |
| "loss": 0.1896, | |
| "mean_token_accuracy": 0.9607266175746918, | |
| "num_tokens": 585705175.0, | |
| "step": 5500 | |
| }, | |
| { | |
| "entropy": 1.39078125, | |
| "epoch": 0.13588952548846775, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 5.994732145541613e-06, | |
| "loss": 0.2003, | |
| "mean_token_accuracy": 0.9587921166419983, | |
| "num_tokens": 590923544.0, | |
| "step": 5550 | |
| }, | |
| { | |
| "entropy": 1.380625, | |
| "epoch": 0.1371137554478233, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 5.9943662894302e-06, | |
| "loss": 0.1945, | |
| "mean_token_accuracy": 0.9587338602542878, | |
| "num_tokens": 596469221.0, | |
| "step": 5600 | |
| }, | |
| { | |
| "entropy": 1.4028125, | |
| "epoch": 0.1383379854071789, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 5.993988163665767e-06, | |
| "loss": 0.2225, | |
| "mean_token_accuracy": 0.9551014530658722, | |
| "num_tokens": 602167038.0, | |
| "step": 5650 | |
| }, | |
| { | |
| "entropy": 1.3846875, | |
| "epoch": 0.13956221536653446, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 5.9935977697977114e-06, | |
| "loss": 0.201, | |
| "mean_token_accuracy": 0.958451042175293, | |
| "num_tokens": 607292638.0, | |
| "step": 5700 | |
| }, | |
| { | |
| "entropy": 1.3784375, | |
| "epoch": 0.14078644532589002, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 5.993195109425705e-06, | |
| "loss": 0.2112, | |
| "mean_token_accuracy": 0.9564135050773621, | |
| "num_tokens": 613202323.0, | |
| "step": 5750 | |
| }, | |
| { | |
| "entropy": 1.38828125, | |
| "epoch": 0.14201067528524558, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 5.9927801841996784e-06, | |
| "loss": 0.1937, | |
| "mean_token_accuracy": 0.9602376103401185, | |
| "num_tokens": 618640198.0, | |
| "step": 5800 | |
| }, | |
| { | |
| "entropy": 1.385, | |
| "epoch": 0.14323490524460114, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 5.992352995819822e-06, | |
| "loss": 0.2075, | |
| "mean_token_accuracy": 0.9579639828205109, | |
| "num_tokens": 623893423.0, | |
| "step": 5850 | |
| }, | |
| { | |
| "entropy": 1.375625, | |
| "epoch": 0.1444591352039567, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 5.991913546036574e-06, | |
| "loss": 0.2106, | |
| "mean_token_accuracy": 0.9564978110790253, | |
| "num_tokens": 629592369.0, | |
| "step": 5900 | |
| }, | |
| { | |
| "entropy": 1.37296875, | |
| "epoch": 0.1456833651633123, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 5.991461836650615e-06, | |
| "loss": 0.211, | |
| "mean_token_accuracy": 0.9563369131088257, | |
| "num_tokens": 635736307.0, | |
| "step": 5950 | |
| }, | |
| { | |
| "entropy": 1.38203125, | |
| "epoch": 0.14690759512266785, | |
| "grad_norm": 3.0, | |
| "learning_rate": 5.990997869512859e-06, | |
| "loss": 0.1961, | |
| "mean_token_accuracy": 0.9592690026760101, | |
| "num_tokens": 641116233.0, | |
| "step": 6000 | |
| }, | |
| { | |
| "entropy": 1.378125, | |
| "epoch": 0.1481318250820234, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 5.990521646524447e-06, | |
| "loss": 0.2008, | |
| "mean_token_accuracy": 0.9585745882987976, | |
| "num_tokens": 646167116.0, | |
| "step": 6050 | |
| }, | |
| { | |
| "entropy": 1.37140625, | |
| "epoch": 0.14935605504137897, | |
| "grad_norm": 2.25, | |
| "learning_rate": 5.990033169636744e-06, | |
| "loss": 0.1783, | |
| "mean_token_accuracy": 0.962623051404953, | |
| "num_tokens": 651158602.0, | |
| "step": 6100 | |
| }, | |
| { | |
| "entropy": 1.38609375, | |
| "epoch": 0.15058028500073453, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 5.989532440851319e-06, | |
| "loss": 0.1925, | |
| "mean_token_accuracy": 0.9600079596042633, | |
| "num_tokens": 656353157.0, | |
| "step": 6150 | |
| }, | |
| { | |
| "entropy": 1.375625, | |
| "epoch": 0.1518045149600901, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 5.98901946221995e-06, | |
| "loss": 0.1956, | |
| "mean_token_accuracy": 0.9591733336448669, | |
| "num_tokens": 661516084.0, | |
| "step": 6200 | |
| }, | |
| { | |
| "entropy": 1.3775, | |
| "epoch": 0.15302874491944568, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 5.988494235844608e-06, | |
| "loss": 0.1857, | |
| "mean_token_accuracy": 0.9618037152290344, | |
| "num_tokens": 666952800.0, | |
| "step": 6250 | |
| }, | |
| { | |
| "entropy": 1.3721875, | |
| "epoch": 0.15425297487880124, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 5.987956763877448e-06, | |
| "loss": 0.1994, | |
| "mean_token_accuracy": 0.9587778007984161, | |
| "num_tokens": 672306196.0, | |
| "step": 6300 | |
| }, | |
| { | |
| "entropy": 1.390625, | |
| "epoch": 0.1554772048381568, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 5.987407048520806e-06, | |
| "loss": 0.1843, | |
| "mean_token_accuracy": 0.9617053723335266, | |
| "num_tokens": 677399978.0, | |
| "step": 6350 | |
| }, | |
| { | |
| "entropy": 1.38171875, | |
| "epoch": 0.15670143479751236, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 5.986845092027181e-06, | |
| "loss": 0.1937, | |
| "mean_token_accuracy": 0.9602959334850312, | |
| "num_tokens": 682747630.0, | |
| "step": 6400 | |
| }, | |
| { | |
| "entropy": 1.38578125, | |
| "epoch": 0.15792566475686792, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 5.986270896699237e-06, | |
| "loss": 0.177, | |
| "mean_token_accuracy": 0.964161764383316, | |
| "num_tokens": 687573308.0, | |
| "step": 6450 | |
| }, | |
| { | |
| "entropy": 1.394375, | |
| "epoch": 0.15914989471622348, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 5.985684464889784e-06, | |
| "loss": 0.1956, | |
| "mean_token_accuracy": 0.9590267181396485, | |
| "num_tokens": 692719553.0, | |
| "step": 6500 | |
| }, | |
| { | |
| "entropy": 1.4165625, | |
| "epoch": 0.16037412467557907, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 5.985085799001773e-06, | |
| "loss": 0.21, | |
| "mean_token_accuracy": 0.9567484962940216, | |
| "num_tokens": 698446523.0, | |
| "step": 6550 | |
| }, | |
| { | |
| "entropy": 1.39546875, | |
| "epoch": 0.16159835463493463, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 5.984474901488284e-06, | |
| "loss": 0.1936, | |
| "mean_token_accuracy": 0.9587848937511444, | |
| "num_tokens": 703964383.0, | |
| "step": 6600 | |
| }, | |
| { | |
| "entropy": 1.3865625, | |
| "epoch": 0.1628225845942902, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 5.983851774852519e-06, | |
| "loss": 0.1814, | |
| "mean_token_accuracy": 0.9620046615600586, | |
| "num_tokens": 708987822.0, | |
| "step": 6650 | |
| }, | |
| { | |
| "entropy": 1.38390625, | |
| "epoch": 0.16404681455364575, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 5.983216421647789e-06, | |
| "loss": 0.1997, | |
| "mean_token_accuracy": 0.9585830473899841, | |
| "num_tokens": 714405287.0, | |
| "step": 6700 | |
| }, | |
| { | |
| "entropy": 1.37453125, | |
| "epoch": 0.16527104451300131, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 5.982568844477502e-06, | |
| "loss": 0.1944, | |
| "mean_token_accuracy": 0.9597526073455811, | |
| "num_tokens": 719693246.0, | |
| "step": 6750 | |
| }, | |
| { | |
| "entropy": 1.34859375, | |
| "epoch": 0.16649527447235687, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 5.9819090459951595e-06, | |
| "loss": 0.1792, | |
| "mean_token_accuracy": 0.9628249955177307, | |
| "num_tokens": 724856885.0, | |
| "step": 6800 | |
| }, | |
| { | |
| "entropy": 1.37203125, | |
| "epoch": 0.16771950443171246, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 5.981237028904336e-06, | |
| "loss": 0.2106, | |
| "mean_token_accuracy": 0.9559559297561645, | |
| "num_tokens": 730337882.0, | |
| "step": 6850 | |
| }, | |
| { | |
| "entropy": 1.3596875, | |
| "epoch": 0.16894373439106802, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 5.980552795958676e-06, | |
| "loss": 0.1715, | |
| "mean_token_accuracy": 0.964083902835846, | |
| "num_tokens": 735194384.0, | |
| "step": 6900 | |
| }, | |
| { | |
| "entropy": 1.37875, | |
| "epoch": 0.17016796435042358, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 5.979856349961876e-06, | |
| "loss": 0.1884, | |
| "mean_token_accuracy": 0.961032167673111, | |
| "num_tokens": 740456561.0, | |
| "step": 6950 | |
| }, | |
| { | |
| "entropy": 1.34078125, | |
| "epoch": 0.17139219430977914, | |
| "grad_norm": 1.875, | |
| "learning_rate": 5.979147693767682e-06, | |
| "loss": 0.1824, | |
| "mean_token_accuracy": 0.9612845265865326, | |
| "num_tokens": 745438122.0, | |
| "step": 7000 | |
| }, | |
| { | |
| "entropy": 1.35234375, | |
| "epoch": 0.1726164242691347, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 5.978426830279867e-06, | |
| "loss": 0.2001, | |
| "mean_token_accuracy": 0.9585837364196778, | |
| "num_tokens": 750857417.0, | |
| "step": 7050 | |
| }, | |
| { | |
| "entropy": 1.35828125, | |
| "epoch": 0.1738406542284903, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 5.977693762452226e-06, | |
| "loss": 0.2077, | |
| "mean_token_accuracy": 0.956944135427475, | |
| "num_tokens": 756565585.0, | |
| "step": 7100 | |
| }, | |
| { | |
| "entropy": 1.37453125, | |
| "epoch": 0.17506488418784585, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 5.976948493288563e-06, | |
| "loss": 0.1978, | |
| "mean_token_accuracy": 0.9594669210910797, | |
| "num_tokens": 762042483.0, | |
| "step": 7150 | |
| }, | |
| { | |
| "entropy": 1.38609375, | |
| "epoch": 0.17628911414720141, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 5.976191025842678e-06, | |
| "loss": 0.1967, | |
| "mean_token_accuracy": 0.9588606441020966, | |
| "num_tokens": 767082096.0, | |
| "step": 7200 | |
| }, | |
| { | |
| "entropy": 1.3721875, | |
| "epoch": 0.17751334410655698, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 5.975421363218352e-06, | |
| "loss": 0.1896, | |
| "mean_token_accuracy": 0.9610229313373566, | |
| "num_tokens": 772416657.0, | |
| "step": 7250 | |
| }, | |
| { | |
| "entropy": 1.37078125, | |
| "epoch": 0.17873757406591254, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 5.97463950856934e-06, | |
| "loss": 0.187, | |
| "mean_token_accuracy": 0.9611088275909424, | |
| "num_tokens": 777391863.0, | |
| "step": 7300 | |
| }, | |
| { | |
| "entropy": 1.3696875, | |
| "epoch": 0.1799618040252681, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 5.973845465099352e-06, | |
| "loss": 0.196, | |
| "mean_token_accuracy": 0.9594384169578553, | |
| "num_tokens": 782502134.0, | |
| "step": 7350 | |
| }, | |
| { | |
| "entropy": 1.3825, | |
| "epoch": 0.18118603398462368, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 5.973039236062047e-06, | |
| "loss": 0.1826, | |
| "mean_token_accuracy": 0.9621104383468628, | |
| "num_tokens": 787376887.0, | |
| "step": 7400 | |
| }, | |
| { | |
| "entropy": 1.3746875, | |
| "epoch": 0.18241026394397925, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 5.9722208247610095e-06, | |
| "loss": 0.1904, | |
| "mean_token_accuracy": 0.9605046558380127, | |
| "num_tokens": 792554125.0, | |
| "step": 7450 | |
| }, | |
| { | |
| "entropy": 1.39890625, | |
| "epoch": 0.1836344939033348, | |
| "grad_norm": 2.375, | |
| "learning_rate": 5.971390234549746e-06, | |
| "loss": 0.1981, | |
| "mean_token_accuracy": 0.9588062584400177, | |
| "num_tokens": 797990011.0, | |
| "step": 7500 | |
| }, | |
| { | |
| "entropy": 1.39328125, | |
| "epoch": 0.18485872386269037, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 5.970547468831664e-06, | |
| "loss": 0.1827, | |
| "mean_token_accuracy": 0.9626439011096954, | |
| "num_tokens": 802985973.0, | |
| "step": 7550 | |
| }, | |
| { | |
| "entropy": 1.40375, | |
| "epoch": 0.18608295382204593, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.969692531060065e-06, | |
| "loss": 0.1851, | |
| "mean_token_accuracy": 0.9621277391910553, | |
| "num_tokens": 808398744.0, | |
| "step": 7600 | |
| }, | |
| { | |
| "entropy": 1.391875, | |
| "epoch": 0.1873071837814015, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 5.9688254247381225e-06, | |
| "loss": 0.1859, | |
| "mean_token_accuracy": 0.9607931089401245, | |
| "num_tokens": 813549741.0, | |
| "step": 7650 | |
| }, | |
| { | |
| "entropy": 1.3784375, | |
| "epoch": 0.18853141374075708, | |
| "grad_norm": 3.171875, | |
| "learning_rate": 5.967946153418875e-06, | |
| "loss": 0.1862, | |
| "mean_token_accuracy": 0.9606724309921265, | |
| "num_tokens": 818604872.0, | |
| "step": 7700 | |
| }, | |
| { | |
| "entropy": 1.3865625, | |
| "epoch": 0.18975564370011264, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 5.967054720705204e-06, | |
| "loss": 0.1934, | |
| "mean_token_accuracy": 0.9598609590530396, | |
| "num_tokens": 824064581.0, | |
| "step": 7750 | |
| }, | |
| { | |
| "entropy": 1.39875, | |
| "epoch": 0.1909798736594682, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 5.966151130249828e-06, | |
| "loss": 0.1926, | |
| "mean_token_accuracy": 0.9593923246860504, | |
| "num_tokens": 829369830.0, | |
| "step": 7800 | |
| }, | |
| { | |
| "entropy": 1.3865625, | |
| "epoch": 0.19220410361882376, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 5.965235385755279e-06, | |
| "loss": 0.1926, | |
| "mean_token_accuracy": 0.9593356001377106, | |
| "num_tokens": 834877335.0, | |
| "step": 7850 | |
| }, | |
| { | |
| "entropy": 1.39328125, | |
| "epoch": 0.19342833357817932, | |
| "grad_norm": 9.0, | |
| "learning_rate": 5.9643074909738936e-06, | |
| "loss": 0.1847, | |
| "mean_token_accuracy": 0.9613538563251496, | |
| "num_tokens": 840076176.0, | |
| "step": 7900 | |
| }, | |
| { | |
| "entropy": 1.38703125, | |
| "epoch": 0.19465256353753488, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 5.963367449707793e-06, | |
| "loss": 0.1815, | |
| "mean_token_accuracy": 0.9614927160739899, | |
| "num_tokens": 845350867.0, | |
| "step": 7950 | |
| }, | |
| { | |
| "entropy": 1.39875, | |
| "epoch": 0.19587679349689047, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 5.962415265808872e-06, | |
| "loss": 0.1921, | |
| "mean_token_accuracy": 0.9596588695049286, | |
| "num_tokens": 850547684.0, | |
| "step": 8000 | |
| }, | |
| { | |
| "entropy": 1.3890625, | |
| "epoch": 0.19710102345624603, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 5.961450943178779e-06, | |
| "loss": 0.1915, | |
| "mean_token_accuracy": 0.9603916919231414, | |
| "num_tokens": 855721426.0, | |
| "step": 8050 | |
| }, | |
| { | |
| "entropy": 1.37421875, | |
| "epoch": 0.1983252534156016, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 5.960474485768902e-06, | |
| "loss": 0.1722, | |
| "mean_token_accuracy": 0.963141576051712, | |
| "num_tokens": 860509090.0, | |
| "step": 8100 | |
| }, | |
| { | |
| "entropy": 1.34984375, | |
| "epoch": 0.19954948337495715, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 5.959485897580353e-06, | |
| "loss": 0.1799, | |
| "mean_token_accuracy": 0.9624167239665985, | |
| "num_tokens": 865732499.0, | |
| "step": 8150 | |
| }, | |
| { | |
| "entropy": 1.37765625, | |
| "epoch": 0.2007737133343127, | |
| "grad_norm": 2.875, | |
| "learning_rate": 5.95848518266395e-06, | |
| "loss": 0.1955, | |
| "mean_token_accuracy": 0.9592999804019928, | |
| "num_tokens": 870715442.0, | |
| "step": 8200 | |
| }, | |
| { | |
| "entropy": 1.3496875, | |
| "epoch": 0.20199794329366827, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 5.957472345120202e-06, | |
| "loss": 0.1826, | |
| "mean_token_accuracy": 0.9611281609535217, | |
| "num_tokens": 875976771.0, | |
| "step": 8250 | |
| }, | |
| { | |
| "entropy": 1.331875, | |
| "epoch": 0.20322217325302386, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 5.95644738909929e-06, | |
| "loss": 0.1801, | |
| "mean_token_accuracy": 0.9619064545631408, | |
| "num_tokens": 881030532.0, | |
| "step": 8300 | |
| }, | |
| { | |
| "entropy": 1.33828125, | |
| "epoch": 0.20444640321237942, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 5.9554103188010544e-06, | |
| "loss": 0.1844, | |
| "mean_token_accuracy": 0.9607453966140747, | |
| "num_tokens": 886102364.0, | |
| "step": 8350 | |
| }, | |
| { | |
| "entropy": 1.33625, | |
| "epoch": 0.20567063317173498, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 5.9543611384749716e-06, | |
| "loss": 0.1896, | |
| "mean_token_accuracy": 0.9599519455432892, | |
| "num_tokens": 891339628.0, | |
| "step": 8400 | |
| }, | |
| { | |
| "entropy": 1.3515625, | |
| "epoch": 0.20689486313109054, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 5.953299852420142e-06, | |
| "loss": 0.1963, | |
| "mean_token_accuracy": 0.9594342112541199, | |
| "num_tokens": 896598491.0, | |
| "step": 8450 | |
| }, | |
| { | |
| "entropy": 1.3475, | |
| "epoch": 0.2081190930904461, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 5.952226464985268e-06, | |
| "loss": 0.1876, | |
| "mean_token_accuracy": 0.9601819491386414, | |
| "num_tokens": 901857034.0, | |
| "step": 8500 | |
| }, | |
| { | |
| "entropy": 1.34546875, | |
| "epoch": 0.2093433230498017, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 5.951140980568639e-06, | |
| "loss": 0.2025, | |
| "mean_token_accuracy": 0.9580735051631928, | |
| "num_tokens": 907672007.0, | |
| "step": 8550 | |
| }, | |
| { | |
| "entropy": 1.3434375, | |
| "epoch": 0.21056755300915725, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 5.950043403618116e-06, | |
| "loss": 0.182, | |
| "mean_token_accuracy": 0.9620107614994049, | |
| "num_tokens": 912959621.0, | |
| "step": 8600 | |
| }, | |
| { | |
| "entropy": 1.34140625, | |
| "epoch": 0.2117917829685128, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 5.948933738631106e-06, | |
| "loss": 0.182, | |
| "mean_token_accuracy": 0.9617352223396302, | |
| "num_tokens": 918075673.0, | |
| "step": 8650 | |
| }, | |
| { | |
| "entropy": 1.3446875, | |
| "epoch": 0.21301601292786837, | |
| "grad_norm": 2.625, | |
| "learning_rate": 5.9478119901545485e-06, | |
| "loss": 0.1863, | |
| "mean_token_accuracy": 0.960466115474701, | |
| "num_tokens": 923511470.0, | |
| "step": 8700 | |
| }, | |
| { | |
| "entropy": 1.3490625, | |
| "epoch": 0.21424024288722393, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 5.946678162784898e-06, | |
| "loss": 0.1997, | |
| "mean_token_accuracy": 0.9574442803859711, | |
| "num_tokens": 929168035.0, | |
| "step": 8750 | |
| }, | |
| { | |
| "entropy": 1.3559375, | |
| "epoch": 0.2154644728465795, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 5.945532261168101e-06, | |
| "loss": 0.188, | |
| "mean_token_accuracy": 0.9608505368232727, | |
| "num_tokens": 934643696.0, | |
| "step": 8800 | |
| }, | |
| { | |
| "entropy": 1.37, | |
| "epoch": 0.21668870280593508, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 5.9443742899995815e-06, | |
| "loss": 0.1987, | |
| "mean_token_accuracy": 0.9590060126781463, | |
| "num_tokens": 940012909.0, | |
| "step": 8850 | |
| }, | |
| { | |
| "entropy": 1.360625, | |
| "epoch": 0.21791293276529064, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 5.943204254024216e-06, | |
| "loss": 0.1835, | |
| "mean_token_accuracy": 0.9617989957332611, | |
| "num_tokens": 945384360.0, | |
| "step": 8900 | |
| }, | |
| { | |
| "entropy": 1.3675, | |
| "epoch": 0.2191371627246462, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 5.942022158036322e-06, | |
| "loss": 0.1955, | |
| "mean_token_accuracy": 0.9601530432701111, | |
| "num_tokens": 950833742.0, | |
| "step": 8950 | |
| }, | |
| { | |
| "entropy": 1.38125, | |
| "epoch": 0.22036139268400176, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 5.9408280068796286e-06, | |
| "loss": 0.2066, | |
| "mean_token_accuracy": 0.9570643317699432, | |
| "num_tokens": 956401892.0, | |
| "step": 9000 | |
| }, | |
| { | |
| "entropy": 1.37234375, | |
| "epoch": 0.22158562264335732, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 5.939621805447267e-06, | |
| "loss": 0.1804, | |
| "mean_token_accuracy": 0.9623953711986541, | |
| "num_tokens": 961223140.0, | |
| "step": 9050 | |
| }, | |
| { | |
| "entropy": 1.391875, | |
| "epoch": 0.22280985260271288, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 5.938403558681743e-06, | |
| "loss": 0.202, | |
| "mean_token_accuracy": 0.9580870044231414, | |
| "num_tokens": 966771629.0, | |
| "step": 9100 | |
| }, | |
| { | |
| "entropy": 1.36703125, | |
| "epoch": 0.22403408256206847, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 5.9371732715749175e-06, | |
| "loss": 0.1866, | |
| "mean_token_accuracy": 0.9609157121181489, | |
| "num_tokens": 972305399.0, | |
| "step": 9150 | |
| }, | |
| { | |
| "entropy": 1.35140625, | |
| "epoch": 0.22525831252142403, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 5.935930949167991e-06, | |
| "loss": 0.1815, | |
| "mean_token_accuracy": 0.9617423331737518, | |
| "num_tokens": 977370470.0, | |
| "step": 9200 | |
| }, | |
| { | |
| "entropy": 1.36953125, | |
| "epoch": 0.2264825424807796, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.934676596551477e-06, | |
| "loss": 0.1884, | |
| "mean_token_accuracy": 0.9609754991531372, | |
| "num_tokens": 982652269.0, | |
| "step": 9250 | |
| }, | |
| { | |
| "entropy": 1.363125, | |
| "epoch": 0.22770677244013515, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 5.933410218865186e-06, | |
| "loss": 0.1858, | |
| "mean_token_accuracy": 0.9611955726146698, | |
| "num_tokens": 988014138.0, | |
| "step": 9300 | |
| }, | |
| { | |
| "entropy": 1.37265625, | |
| "epoch": 0.2289310023994907, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 5.932131821298198e-06, | |
| "loss": 0.1856, | |
| "mean_token_accuracy": 0.9616758930683136, | |
| "num_tokens": 993370242.0, | |
| "step": 9350 | |
| }, | |
| { | |
| "entropy": 1.38515625, | |
| "epoch": 0.23015523235884627, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 5.930841409088853e-06, | |
| "loss": 0.1906, | |
| "mean_token_accuracy": 0.9603582990169525, | |
| "num_tokens": 998918502.0, | |
| "step": 9400 | |
| }, | |
| { | |
| "entropy": 1.39, | |
| "epoch": 0.23137946231820186, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 5.929538987524712e-06, | |
| "loss": 0.1854, | |
| "mean_token_accuracy": 0.9604568040370941, | |
| "num_tokens": 1004326538.0, | |
| "step": 9450 | |
| }, | |
| { | |
| "entropy": 1.3890625, | |
| "epoch": 0.23260369227755742, | |
| "grad_norm": 2.75, | |
| "learning_rate": 5.928224561942554e-06, | |
| "loss": 0.1812, | |
| "mean_token_accuracy": 0.9616895508766174, | |
| "num_tokens": 1009603548.0, | |
| "step": 9500 | |
| }, | |
| { | |
| "entropy": 1.3871875, | |
| "epoch": 0.23382792223691298, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 5.92689813772834e-06, | |
| "loss": 0.1963, | |
| "mean_token_accuracy": 0.9590861582756043, | |
| "num_tokens": 1015070964.0, | |
| "step": 9550 | |
| }, | |
| { | |
| "entropy": 1.36609375, | |
| "epoch": 0.23505215219626854, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 5.9255597203172e-06, | |
| "loss": 0.1828, | |
| "mean_token_accuracy": 0.9619620275497437, | |
| "num_tokens": 1020492153.0, | |
| "step": 9600 | |
| }, | |
| { | |
| "entropy": 1.38609375, | |
| "epoch": 0.2362763821556241, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 5.924209315193405e-06, | |
| "loss": 0.1845, | |
| "mean_token_accuracy": 0.961515667438507, | |
| "num_tokens": 1025864529.0, | |
| "step": 9650 | |
| }, | |
| { | |
| "entropy": 1.3715625, | |
| "epoch": 0.23750061211497966, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 5.922846927890345e-06, | |
| "loss": 0.1797, | |
| "mean_token_accuracy": 0.9618804860115051, | |
| "num_tokens": 1031024359.0, | |
| "step": 9700 | |
| }, | |
| { | |
| "entropy": 1.36359375, | |
| "epoch": 0.23872484207433525, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 5.9214725639905115e-06, | |
| "loss": 0.1863, | |
| "mean_token_accuracy": 0.9610350334644318, | |
| "num_tokens": 1036377471.0, | |
| "step": 9750 | |
| }, | |
| { | |
| "entropy": 1.3715625, | |
| "epoch": 0.2399490720336908, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 5.92008622912547e-06, | |
| "loss": 0.1831, | |
| "mean_token_accuracy": 0.9612818145751953, | |
| "num_tokens": 1041703688.0, | |
| "step": 9800 | |
| }, | |
| { | |
| "entropy": 1.35671875, | |
| "epoch": 0.24117330199304637, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 5.918687928975836e-06, | |
| "loss": 0.1839, | |
| "mean_token_accuracy": 0.9616091656684875, | |
| "num_tokens": 1046917985.0, | |
| "step": 9850 | |
| }, | |
| { | |
| "entropy": 1.39015625, | |
| "epoch": 0.24239753195240193, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 5.9172776692712575e-06, | |
| "loss": 0.1965, | |
| "mean_token_accuracy": 0.9584881782531738, | |
| "num_tokens": 1052482737.0, | |
| "step": 9900 | |
| }, | |
| { | |
| "entropy": 1.38703125, | |
| "epoch": 0.2436217619117575, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 5.915855455790381e-06, | |
| "loss": 0.1884, | |
| "mean_token_accuracy": 0.9608153140544892, | |
| "num_tokens": 1057868410.0, | |
| "step": 9950 | |
| }, | |
| { | |
| "entropy": 1.395, | |
| "epoch": 0.24484599187111308, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 5.914421294360843e-06, | |
| "loss": 0.1904, | |
| "mean_token_accuracy": 0.9597806739807129, | |
| "num_tokens": 1063175179.0, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.24484599187111308, | |
| "eval_entropy": 1.359765625, | |
| "eval_loss": 0.20250044763088226, | |
| "eval_mean_token_accuracy": 0.9580152039726575, | |
| "eval_num_tokens": 1063175179.0, | |
| "eval_runtime": 600.0597, | |
| "eval_samples_per_second": 16.092, | |
| "eval_steps_per_second": 0.202, | |
| "step": 10000 | |
| }, | |
| { | |
| "entropy": 1.3840625, | |
| "epoch": 0.24607022183046864, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 5.912975190859232e-06, | |
| "loss": 0.195, | |
| "mean_token_accuracy": 0.9596641564369202, | |
| "num_tokens": 1068741854.0, | |
| "step": 10050 | |
| }, | |
| { | |
| "entropy": 1.3790625, | |
| "epoch": 0.2472944517898242, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 5.9115171512110714e-06, | |
| "loss": 0.1854, | |
| "mean_token_accuracy": 0.9604480576515197, | |
| "num_tokens": 1074116479.0, | |
| "step": 10100 | |
| }, | |
| { | |
| "entropy": 1.36453125, | |
| "epoch": 0.24851868174917977, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 5.910047181390794e-06, | |
| "loss": 0.1697, | |
| "mean_token_accuracy": 0.9642793035507202, | |
| "num_tokens": 1079159902.0, | |
| "step": 10150 | |
| }, | |
| { | |
| "entropy": 1.373125, | |
| "epoch": 0.24974291170853533, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 5.908565287421718e-06, | |
| "loss": 0.1861, | |
| "mean_token_accuracy": 0.9611909198760986, | |
| "num_tokens": 1084521049.0, | |
| "step": 10200 | |
| }, | |
| { | |
| "entropy": 1.3578125, | |
| "epoch": 0.2509671416678909, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 5.907071475376021e-06, | |
| "loss": 0.1787, | |
| "mean_token_accuracy": 0.9620854771137237, | |
| "num_tokens": 1089493722.0, | |
| "step": 10250 | |
| }, | |
| { | |
| "entropy": 1.36484375, | |
| "epoch": 0.2521913716272465, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 5.905565751374717e-06, | |
| "loss": 0.1732, | |
| "mean_token_accuracy": 0.9639436435699463, | |
| "num_tokens": 1094338571.0, | |
| "step": 10300 | |
| }, | |
| { | |
| "entropy": 1.37234375, | |
| "epoch": 0.25341560158660204, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 5.904048121587628e-06, | |
| "loss": 0.1772, | |
| "mean_token_accuracy": 0.9625762343406677, | |
| "num_tokens": 1099742354.0, | |
| "step": 10350 | |
| }, | |
| { | |
| "entropy": 1.38359375, | |
| "epoch": 0.2546398315459576, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 5.902518592233363e-06, | |
| "loss": 0.1987, | |
| "mean_token_accuracy": 0.9577878427505493, | |
| "num_tokens": 1105617487.0, | |
| "step": 10400 | |
| }, | |
| { | |
| "entropy": 1.3615625, | |
| "epoch": 0.25586406150531316, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 5.9009771695792905e-06, | |
| "loss": 0.1811, | |
| "mean_token_accuracy": 0.9621189975738526, | |
| "num_tokens": 1110680544.0, | |
| "step": 10450 | |
| }, | |
| { | |
| "entropy": 1.37375, | |
| "epoch": 0.2570882914646687, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.899423859941511e-06, | |
| "loss": 0.1882, | |
| "mean_token_accuracy": 0.9606586790084839, | |
| "num_tokens": 1116178837.0, | |
| "step": 10500 | |
| }, | |
| { | |
| "entropy": 1.37484375, | |
| "epoch": 0.2583125214240243, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 5.897858669684833e-06, | |
| "loss": 0.1893, | |
| "mean_token_accuracy": 0.9598471677303314, | |
| "num_tokens": 1121511467.0, | |
| "step": 10550 | |
| }, | |
| { | |
| "entropy": 1.3609375, | |
| "epoch": 0.25953675138337984, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 5.896281605222749e-06, | |
| "loss": 0.1806, | |
| "mean_token_accuracy": 0.9624120283126831, | |
| "num_tokens": 1126507233.0, | |
| "step": 10600 | |
| }, | |
| { | |
| "entropy": 1.34734375, | |
| "epoch": 0.2607609813427354, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 5.8946926730174045e-06, | |
| "loss": 0.1863, | |
| "mean_token_accuracy": 0.9608824181556702, | |
| "num_tokens": 1131912464.0, | |
| "step": 10650 | |
| }, | |
| { | |
| "entropy": 1.33921875, | |
| "epoch": 0.26198521130209096, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 5.893091879579575e-06, | |
| "loss": 0.1856, | |
| "mean_token_accuracy": 0.9607326745986938, | |
| "num_tokens": 1136882208.0, | |
| "step": 10700 | |
| }, | |
| { | |
| "entropy": 1.343125, | |
| "epoch": 0.2632094412614466, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 5.89147923146864e-06, | |
| "loss": 0.1813, | |
| "mean_token_accuracy": 0.9620126748085022, | |
| "num_tokens": 1142095292.0, | |
| "step": 10750 | |
| }, | |
| { | |
| "entropy": 1.34765625, | |
| "epoch": 0.26443367122080214, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 5.889854735292551e-06, | |
| "loss": 0.1841, | |
| "mean_token_accuracy": 0.9618128108978271, | |
| "num_tokens": 1147363920.0, | |
| "step": 10800 | |
| }, | |
| { | |
| "entropy": 1.356875, | |
| "epoch": 0.2656579011801577, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 5.888218397707811e-06, | |
| "loss": 0.1742, | |
| "mean_token_accuracy": 0.9638459277153015, | |
| "num_tokens": 1152380705.0, | |
| "step": 10850 | |
| }, | |
| { | |
| "entropy": 1.32984375, | |
| "epoch": 0.26688213113951326, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 5.886570225419441e-06, | |
| "loss": 0.1865, | |
| "mean_token_accuracy": 0.9608019030094147, | |
| "num_tokens": 1157839898.0, | |
| "step": 10900 | |
| }, | |
| { | |
| "entropy": 1.34609375, | |
| "epoch": 0.2681063610988688, | |
| "grad_norm": 3.453125, | |
| "learning_rate": 5.88491022518096e-06, | |
| "loss": 0.1918, | |
| "mean_token_accuracy": 0.9609634006023406, | |
| "num_tokens": 1163068506.0, | |
| "step": 10950 | |
| }, | |
| { | |
| "entropy": 1.32734375, | |
| "epoch": 0.2693305910582244, | |
| "grad_norm": 2.125, | |
| "learning_rate": 5.883238403794349e-06, | |
| "loss": 0.1758, | |
| "mean_token_accuracy": 0.9633646559715271, | |
| "num_tokens": 1168287852.0, | |
| "step": 11000 | |
| }, | |
| { | |
| "entropy": 1.34375, | |
| "epoch": 0.27055482101757994, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 5.881554768110028e-06, | |
| "loss": 0.1914, | |
| "mean_token_accuracy": 0.9605349290370941, | |
| "num_tokens": 1173597061.0, | |
| "step": 11050 | |
| }, | |
| { | |
| "entropy": 1.3434375, | |
| "epoch": 0.2717790509769355, | |
| "grad_norm": 3.5, | |
| "learning_rate": 5.879859325026828e-06, | |
| "loss": 0.1864, | |
| "mean_token_accuracy": 0.9604840254783631, | |
| "num_tokens": 1178845621.0, | |
| "step": 11100 | |
| }, | |
| { | |
| "entropy": 1.35984375, | |
| "epoch": 0.27300328093629106, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 5.878152081491963e-06, | |
| "loss": 0.1925, | |
| "mean_token_accuracy": 0.9589577269554138, | |
| "num_tokens": 1184054388.0, | |
| "step": 11150 | |
| }, | |
| { | |
| "entropy": 1.34875, | |
| "epoch": 0.2742275108956466, | |
| "grad_norm": 2.625, | |
| "learning_rate": 5.876433044500996e-06, | |
| "loss": 0.1921, | |
| "mean_token_accuracy": 0.9595346593856812, | |
| "num_tokens": 1189697396.0, | |
| "step": 11200 | |
| }, | |
| { | |
| "entropy": 1.34390625, | |
| "epoch": 0.2754517408550022, | |
| "grad_norm": 2.0, | |
| "learning_rate": 5.874702221097819e-06, | |
| "loss": 0.1882, | |
| "mean_token_accuracy": 0.960370112657547, | |
| "num_tokens": 1195166226.0, | |
| "step": 11250 | |
| }, | |
| { | |
| "entropy": 1.34515625, | |
| "epoch": 0.2766759708143578, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 5.8729596183746175e-06, | |
| "loss": 0.1805, | |
| "mean_token_accuracy": 0.9621370649337768, | |
| "num_tokens": 1200392905.0, | |
| "step": 11300 | |
| }, | |
| { | |
| "entropy": 1.3428125, | |
| "epoch": 0.27790020077371336, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 5.871205243471844e-06, | |
| "loss": 0.1841, | |
| "mean_token_accuracy": 0.9613085889816284, | |
| "num_tokens": 1205618541.0, | |
| "step": 11350 | |
| }, | |
| { | |
| "entropy": 1.35171875, | |
| "epoch": 0.2791244307330689, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 5.869439103578189e-06, | |
| "loss": 0.1852, | |
| "mean_token_accuracy": 0.9616814315319061, | |
| "num_tokens": 1210836329.0, | |
| "step": 11400 | |
| }, | |
| { | |
| "entropy": 1.3453125, | |
| "epoch": 0.2803486606924245, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 5.867661205930549e-06, | |
| "loss": 0.1821, | |
| "mean_token_accuracy": 0.9620612812042236, | |
| "num_tokens": 1215867506.0, | |
| "step": 11450 | |
| }, | |
| { | |
| "entropy": 1.35875, | |
| "epoch": 0.28157289065178004, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 5.865871557814003e-06, | |
| "loss": 0.1915, | |
| "mean_token_accuracy": 0.9604600322246551, | |
| "num_tokens": 1220793244.0, | |
| "step": 11500 | |
| }, | |
| { | |
| "entropy": 1.353125, | |
| "epoch": 0.2827971206111356, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 5.864070166561775e-06, | |
| "loss": 0.1937, | |
| "mean_token_accuracy": 0.9599918603897095, | |
| "num_tokens": 1226305868.0, | |
| "step": 11550 | |
| }, | |
| { | |
| "entropy": 1.394375, | |
| "epoch": 0.28402135057049116, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 5.862257039555207e-06, | |
| "loss": 0.1991, | |
| "mean_token_accuracy": 0.9583842658996582, | |
| "num_tokens": 1232013095.0, | |
| "step": 11600 | |
| }, | |
| { | |
| "entropy": 1.37578125, | |
| "epoch": 0.2852455805298467, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 5.860432184223731e-06, | |
| "loss": 0.1913, | |
| "mean_token_accuracy": 0.9596893274784088, | |
| "num_tokens": 1237458606.0, | |
| "step": 11650 | |
| }, | |
| { | |
| "entropy": 1.35703125, | |
| "epoch": 0.2864698104892023, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 5.858595608044837e-06, | |
| "loss": 0.1835, | |
| "mean_token_accuracy": 0.9611952984333039, | |
| "num_tokens": 1242972251.0, | |
| "step": 11700 | |
| }, | |
| { | |
| "entropy": 1.37078125, | |
| "epoch": 0.28769404044855784, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 5.856747318544041e-06, | |
| "loss": 0.1865, | |
| "mean_token_accuracy": 0.9609648621082306, | |
| "num_tokens": 1248318638.0, | |
| "step": 11750 | |
| }, | |
| { | |
| "entropy": 1.365, | |
| "epoch": 0.2889182704079134, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 5.854887323294856e-06, | |
| "loss": 0.183, | |
| "mean_token_accuracy": 0.9627510058879852, | |
| "num_tokens": 1253680002.0, | |
| "step": 11800 | |
| }, | |
| { | |
| "entropy": 1.37578125, | |
| "epoch": 0.29014250036726896, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 5.853015629918759e-06, | |
| "loss": 0.1862, | |
| "mean_token_accuracy": 0.9614068794250489, | |
| "num_tokens": 1258924764.0, | |
| "step": 11850 | |
| }, | |
| { | |
| "entropy": 1.37796875, | |
| "epoch": 0.2913667303266246, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 5.8511322460851624e-06, | |
| "loss": 0.1832, | |
| "mean_token_accuracy": 0.9620686209201813, | |
| "num_tokens": 1264051390.0, | |
| "step": 11900 | |
| }, | |
| { | |
| "entropy": 1.37328125, | |
| "epoch": 0.29259096028598014, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 5.849237179511381e-06, | |
| "loss": 0.1769, | |
| "mean_token_accuracy": 0.9628199970722199, | |
| "num_tokens": 1269148836.0, | |
| "step": 11950 | |
| }, | |
| { | |
| "entropy": 1.376875, | |
| "epoch": 0.2938151902453357, | |
| "grad_norm": 3.125, | |
| "learning_rate": 5.8473304379626e-06, | |
| "loss": 0.1871, | |
| "mean_token_accuracy": 0.9601672506332397, | |
| "num_tokens": 1274348582.0, | |
| "step": 12000 | |
| }, | |
| { | |
| "entropy": 1.35203125, | |
| "epoch": 0.29503942020469126, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 5.845412029251843e-06, | |
| "loss": 0.1796, | |
| "mean_token_accuracy": 0.9622039210796356, | |
| "num_tokens": 1279184908.0, | |
| "step": 12050 | |
| }, | |
| { | |
| "entropy": 1.35859375, | |
| "epoch": 0.2962636501640468, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 5.843481961239942e-06, | |
| "loss": 0.1772, | |
| "mean_token_accuracy": 0.9627481973171235, | |
| "num_tokens": 1284410532.0, | |
| "step": 12100 | |
| }, | |
| { | |
| "entropy": 1.35953125, | |
| "epoch": 0.2974878801234024, | |
| "grad_norm": 7.40625, | |
| "learning_rate": 5.841540241835504e-06, | |
| "loss": 0.1768, | |
| "mean_token_accuracy": 0.9626896047592163, | |
| "num_tokens": 1289768837.0, | |
| "step": 12150 | |
| }, | |
| { | |
| "entropy": 1.378125, | |
| "epoch": 0.29871211008275794, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 5.8395868789948775e-06, | |
| "loss": 0.1848, | |
| "mean_token_accuracy": 0.9612694227695465, | |
| "num_tokens": 1295005247.0, | |
| "step": 12200 | |
| }, | |
| { | |
| "entropy": 1.37359375, | |
| "epoch": 0.2999363400421135, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 5.837621880722122e-06, | |
| "loss": 0.1909, | |
| "mean_token_accuracy": 0.9603909432888031, | |
| "num_tokens": 1300316507.0, | |
| "step": 12250 | |
| }, | |
| { | |
| "entropy": 1.35953125, | |
| "epoch": 0.30116057000146906, | |
| "grad_norm": 2.75, | |
| "learning_rate": 5.835645255068973e-06, | |
| "loss": 0.1838, | |
| "mean_token_accuracy": 0.9617878496646881, | |
| "num_tokens": 1305931141.0, | |
| "step": 12300 | |
| }, | |
| { | |
| "entropy": 1.34640625, | |
| "epoch": 0.3023847999608246, | |
| "grad_norm": 2.375, | |
| "learning_rate": 5.8336570101348115e-06, | |
| "loss": 0.1651, | |
| "mean_token_accuracy": 0.9648260760307312, | |
| "num_tokens": 1310803906.0, | |
| "step": 12350 | |
| }, | |
| { | |
| "entropy": 1.358125, | |
| "epoch": 0.3036090299201802, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 5.831657154066629e-06, | |
| "loss": 0.1827, | |
| "mean_token_accuracy": 0.9618698525428772, | |
| "num_tokens": 1315973080.0, | |
| "step": 12400 | |
| }, | |
| { | |
| "entropy": 1.35328125, | |
| "epoch": 0.30483325987953575, | |
| "grad_norm": 3.578125, | |
| "learning_rate": 5.829645695058992e-06, | |
| "loss": 0.1747, | |
| "mean_token_accuracy": 0.9627145206928254, | |
| "num_tokens": 1321381888.0, | |
| "step": 12450 | |
| }, | |
| { | |
| "entropy": 1.37859375, | |
| "epoch": 0.30605748983889136, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 5.827622641354014e-06, | |
| "loss": 0.1787, | |
| "mean_token_accuracy": 0.9626282620429992, | |
| "num_tokens": 1326557068.0, | |
| "step": 12500 | |
| }, | |
| { | |
| "entropy": 1.3759375, | |
| "epoch": 0.3072817197982469, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 5.825588001241318e-06, | |
| "loss": 0.1912, | |
| "mean_token_accuracy": 0.9598784649372101, | |
| "num_tokens": 1332216024.0, | |
| "step": 12550 | |
| }, | |
| { | |
| "entropy": 1.35890625, | |
| "epoch": 0.3085059497576025, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 5.823541783058005e-06, | |
| "loss": 0.174, | |
| "mean_token_accuracy": 0.962734831571579, | |
| "num_tokens": 1337390329.0, | |
| "step": 12600 | |
| }, | |
| { | |
| "entropy": 1.37375, | |
| "epoch": 0.30973017971695804, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.821483995188612e-06, | |
| "loss": 0.1881, | |
| "mean_token_accuracy": 0.9605675613880158, | |
| "num_tokens": 1343045143.0, | |
| "step": 12650 | |
| }, | |
| { | |
| "entropy": 1.3415625, | |
| "epoch": 0.3109544096763136, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 5.81941464606509e-06, | |
| "loss": 0.1666, | |
| "mean_token_accuracy": 0.9643463969230652, | |
| "num_tokens": 1348034262.0, | |
| "step": 12700 | |
| }, | |
| { | |
| "entropy": 1.3440625, | |
| "epoch": 0.31217863963566916, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 5.817333744166762e-06, | |
| "loss": 0.1921, | |
| "mean_token_accuracy": 0.9586631393432617, | |
| "num_tokens": 1353723053.0, | |
| "step": 12750 | |
| }, | |
| { | |
| "entropy": 1.3721875, | |
| "epoch": 0.3134028695950247, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 5.815241298020286e-06, | |
| "loss": 0.1846, | |
| "mean_token_accuracy": 0.9600662136077881, | |
| "num_tokens": 1358674728.0, | |
| "step": 12800 | |
| }, | |
| { | |
| "entropy": 1.365625, | |
| "epoch": 0.3146270995543803, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 5.813137316199628e-06, | |
| "loss": 0.1835, | |
| "mean_token_accuracy": 0.961768034696579, | |
| "num_tokens": 1363933473.0, | |
| "step": 12850 | |
| }, | |
| { | |
| "entropy": 1.38015625, | |
| "epoch": 0.31585132951373585, | |
| "grad_norm": 2.5, | |
| "learning_rate": 5.811021807326018e-06, | |
| "loss": 0.1982, | |
| "mean_token_accuracy": 0.9590709102153778, | |
| "num_tokens": 1369281803.0, | |
| "step": 12900 | |
| }, | |
| { | |
| "entropy": 1.37, | |
| "epoch": 0.3170755594730914, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 5.808894780067923e-06, | |
| "loss": 0.1949, | |
| "mean_token_accuracy": 0.9586555528640747, | |
| "num_tokens": 1374853145.0, | |
| "step": 12950 | |
| }, | |
| { | |
| "entropy": 1.36421875, | |
| "epoch": 0.31829978943244697, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 5.8067562431410045e-06, | |
| "loss": 0.171, | |
| "mean_token_accuracy": 0.9631958258152008, | |
| "num_tokens": 1379934830.0, | |
| "step": 13000 | |
| }, | |
| { | |
| "entropy": 1.3609375, | |
| "epoch": 0.3195240193918026, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 5.804606205308088e-06, | |
| "loss": 0.1841, | |
| "mean_token_accuracy": 0.9605684506893158, | |
| "num_tokens": 1385105704.0, | |
| "step": 13050 | |
| }, | |
| { | |
| "entropy": 1.37671875, | |
| "epoch": 0.32074824935115814, | |
| "grad_norm": 2.875, | |
| "learning_rate": 5.802444675379122e-06, | |
| "loss": 0.1947, | |
| "mean_token_accuracy": 0.9595759809017181, | |
| "num_tokens": 1390581041.0, | |
| "step": 13100 | |
| }, | |
| { | |
| "entropy": 1.37828125, | |
| "epoch": 0.3219724793105137, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 5.8002716622111485e-06, | |
| "loss": 0.1858, | |
| "mean_token_accuracy": 0.9617175209522247, | |
| "num_tokens": 1395850769.0, | |
| "step": 13150 | |
| }, | |
| { | |
| "entropy": 1.365, | |
| "epoch": 0.32319670926986926, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 5.79808717470826e-06, | |
| "loss": 0.1676, | |
| "mean_token_accuracy": 0.9655633735656738, | |
| "num_tokens": 1400935540.0, | |
| "step": 13200 | |
| }, | |
| { | |
| "entropy": 1.3709375, | |
| "epoch": 0.3244209392292248, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 5.795891221821569e-06, | |
| "loss": 0.1807, | |
| "mean_token_accuracy": 0.9624592447280884, | |
| "num_tokens": 1406376315.0, | |
| "step": 13250 | |
| }, | |
| { | |
| "entropy": 1.34875, | |
| "epoch": 0.3256451691885804, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 5.793683812549162e-06, | |
| "loss": 0.1727, | |
| "mean_token_accuracy": 0.9637568819522858, | |
| "num_tokens": 1411533562.0, | |
| "step": 13300 | |
| }, | |
| { | |
| "entropy": 1.36421875, | |
| "epoch": 0.32686939914793595, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 5.791464955936077e-06, | |
| "loss": 0.1938, | |
| "mean_token_accuracy": 0.9592576730251312, | |
| "num_tokens": 1417402528.0, | |
| "step": 13350 | |
| }, | |
| { | |
| "entropy": 1.36109375, | |
| "epoch": 0.3280936291072915, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 5.789234661074254e-06, | |
| "loss": 0.1744, | |
| "mean_token_accuracy": 0.9627709448337555, | |
| "num_tokens": 1422622878.0, | |
| "step": 13400 | |
| }, | |
| { | |
| "entropy": 1.3790625, | |
| "epoch": 0.32931785906664707, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 5.786992937102503e-06, | |
| "loss": 0.1959, | |
| "mean_token_accuracy": 0.9586515820026398, | |
| "num_tokens": 1427838914.0, | |
| "step": 13450 | |
| }, | |
| { | |
| "entropy": 1.36, | |
| "epoch": 0.33054208902600263, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 5.784739793206464e-06, | |
| "loss": 0.1794, | |
| "mean_token_accuracy": 0.9625478911399842, | |
| "num_tokens": 1432973891.0, | |
| "step": 13500 | |
| }, | |
| { | |
| "entropy": 1.37546875, | |
| "epoch": 0.3317663189853582, | |
| "grad_norm": 2.875, | |
| "learning_rate": 5.782475238618574e-06, | |
| "loss": 0.1952, | |
| "mean_token_accuracy": 0.958906524181366, | |
| "num_tokens": 1438425313.0, | |
| "step": 13550 | |
| }, | |
| { | |
| "entropy": 1.39109375, | |
| "epoch": 0.33299054894471375, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 5.780199282618026e-06, | |
| "loss": 0.1937, | |
| "mean_token_accuracy": 0.9599265992641449, | |
| "num_tokens": 1443930223.0, | |
| "step": 13600 | |
| }, | |
| { | |
| "entropy": 1.3784375, | |
| "epoch": 0.33421477890406937, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 5.777911934530726e-06, | |
| "loss": 0.1896, | |
| "mean_token_accuracy": 0.9606879663467407, | |
| "num_tokens": 1449235492.0, | |
| "step": 13650 | |
| }, | |
| { | |
| "entropy": 1.3740625, | |
| "epoch": 0.3354390088634249, | |
| "grad_norm": 2.25, | |
| "learning_rate": 5.7756132037292665e-06, | |
| "loss": 0.1845, | |
| "mean_token_accuracy": 0.9607800352573395, | |
| "num_tokens": 1454874971.0, | |
| "step": 13700 | |
| }, | |
| { | |
| "entropy": 1.3565625, | |
| "epoch": 0.3366632388227805, | |
| "grad_norm": 3.0, | |
| "learning_rate": 5.77330309963288e-06, | |
| "loss": 0.1664, | |
| "mean_token_accuracy": 0.9650224351882934, | |
| "num_tokens": 1459910564.0, | |
| "step": 13750 | |
| }, | |
| { | |
| "entropy": 1.3896875, | |
| "epoch": 0.33788746878213605, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 5.7709816317074e-06, | |
| "loss": 0.1852, | |
| "mean_token_accuracy": 0.9610321772098541, | |
| "num_tokens": 1465214852.0, | |
| "step": 13800 | |
| }, | |
| { | |
| "entropy": 1.3609375, | |
| "epoch": 0.3391116987414916, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 5.768648809465223e-06, | |
| "loss": 0.173, | |
| "mean_token_accuracy": 0.9646092760562897, | |
| "num_tokens": 1470405224.0, | |
| "step": 13850 | |
| }, | |
| { | |
| "entropy": 1.3671875, | |
| "epoch": 0.34033592870084717, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 5.766304642465277e-06, | |
| "loss": 0.1684, | |
| "mean_token_accuracy": 0.964150664806366, | |
| "num_tokens": 1475222511.0, | |
| "step": 13900 | |
| }, | |
| { | |
| "entropy": 1.3615625, | |
| "epoch": 0.34156015866020273, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 5.763949140312969e-06, | |
| "loss": 0.1903, | |
| "mean_token_accuracy": 0.9601925635337829, | |
| "num_tokens": 1480884593.0, | |
| "step": 13950 | |
| }, | |
| { | |
| "entropy": 1.35734375, | |
| "epoch": 0.3427843886195583, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 5.7615823126601565e-06, | |
| "loss": 0.1853, | |
| "mean_token_accuracy": 0.9617584705352783, | |
| "num_tokens": 1485873672.0, | |
| "step": 14000 | |
| }, | |
| { | |
| "entropy": 1.37375, | |
| "epoch": 0.34400861857891385, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 5.759204169205102e-06, | |
| "loss": 0.1862, | |
| "mean_token_accuracy": 0.9605587136745453, | |
| "num_tokens": 1490904541.0, | |
| "step": 14050 | |
| }, | |
| { | |
| "entropy": 1.36359375, | |
| "epoch": 0.3452328485382694, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.7568147196924395e-06, | |
| "loss": 0.1891, | |
| "mean_token_accuracy": 0.9609455835819244, | |
| "num_tokens": 1496373059.0, | |
| "step": 14100 | |
| }, | |
| { | |
| "entropy": 1.35421875, | |
| "epoch": 0.34645707849762497, | |
| "grad_norm": 0.0322265625, | |
| "learning_rate": 5.754413973913126e-06, | |
| "loss": 0.1673, | |
| "mean_token_accuracy": 0.9642012619972229, | |
| "num_tokens": 1500901681.0, | |
| "step": 14150 | |
| }, | |
| { | |
| "entropy": 1.343125, | |
| "epoch": 0.3476813084569806, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 5.752001941704407e-06, | |
| "loss": 0.1759, | |
| "mean_token_accuracy": 0.9625442051887512, | |
| "num_tokens": 1506040261.0, | |
| "step": 14200 | |
| }, | |
| { | |
| "entropy": 1.36625, | |
| "epoch": 0.34890553841633615, | |
| "grad_norm": 3.0, | |
| "learning_rate": 5.749578632949776e-06, | |
| "loss": 0.1802, | |
| "mean_token_accuracy": 0.9619328999519348, | |
| "num_tokens": 1511536121.0, | |
| "step": 14250 | |
| }, | |
| { | |
| "entropy": 1.356875, | |
| "epoch": 0.3501297683756917, | |
| "grad_norm": 3.703125, | |
| "learning_rate": 5.747144057578932e-06, | |
| "loss": 0.1843, | |
| "mean_token_accuracy": 0.9613735234737396, | |
| "num_tokens": 1516899260.0, | |
| "step": 14300 | |
| }, | |
| { | |
| "entropy": 1.36203125, | |
| "epoch": 0.35135399833504727, | |
| "grad_norm": 3.671875, | |
| "learning_rate": 5.744698225567742e-06, | |
| "loss": 0.1929, | |
| "mean_token_accuracy": 0.9596503937244415, | |
| "num_tokens": 1522277914.0, | |
| "step": 14350 | |
| }, | |
| { | |
| "entropy": 1.35921875, | |
| "epoch": 0.35257822829440283, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 5.742241146938195e-06, | |
| "loss": 0.18, | |
| "mean_token_accuracy": 0.9617201662063599, | |
| "num_tokens": 1527559983.0, | |
| "step": 14400 | |
| }, | |
| { | |
| "entropy": 1.3353125, | |
| "epoch": 0.3538024582537584, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 5.739772831758365e-06, | |
| "loss": 0.171, | |
| "mean_token_accuracy": 0.9635174345970153, | |
| "num_tokens": 1532501983.0, | |
| "step": 14450 | |
| }, | |
| { | |
| "entropy": 1.37234375, | |
| "epoch": 0.35502668821311395, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 5.737293290142369e-06, | |
| "loss": 0.1957, | |
| "mean_token_accuracy": 0.9595348858833312, | |
| "num_tokens": 1538384868.0, | |
| "step": 14500 | |
| }, | |
| { | |
| "entropy": 1.36453125, | |
| "epoch": 0.3562509181724695, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 5.734802532250327e-06, | |
| "loss": 0.1721, | |
| "mean_token_accuracy": 0.9636399447917938, | |
| "num_tokens": 1543550967.0, | |
| "step": 14550 | |
| }, | |
| { | |
| "entropy": 1.36703125, | |
| "epoch": 0.35747514813182507, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 5.7323005682883144e-06, | |
| "loss": 0.1817, | |
| "mean_token_accuracy": 0.9614765977859497, | |
| "num_tokens": 1548814643.0, | |
| "step": 14600 | |
| }, | |
| { | |
| "entropy": 1.37171875, | |
| "epoch": 0.35869937809118063, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.729787408508328e-06, | |
| "loss": 0.1854, | |
| "mean_token_accuracy": 0.9606961834430695, | |
| "num_tokens": 1554002337.0, | |
| "step": 14650 | |
| }, | |
| { | |
| "entropy": 1.363125, | |
| "epoch": 0.3599236080505362, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 5.7272630632082385e-06, | |
| "loss": 0.1788, | |
| "mean_token_accuracy": 0.9617051208019256, | |
| "num_tokens": 1558888261.0, | |
| "step": 14700 | |
| }, | |
| { | |
| "entropy": 1.3603125, | |
| "epoch": 0.36114783800989175, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 5.7247275427317515e-06, | |
| "loss": 0.1882, | |
| "mean_token_accuracy": 0.9613351905345917, | |
| "num_tokens": 1564034699.0, | |
| "step": 14750 | |
| }, | |
| { | |
| "entropy": 1.38765625, | |
| "epoch": 0.36237206796924737, | |
| "grad_norm": 3.90625, | |
| "learning_rate": 5.722180857468361e-06, | |
| "loss": 0.2015, | |
| "mean_token_accuracy": 0.9581510519981384, | |
| "num_tokens": 1569662314.0, | |
| "step": 14800 | |
| }, | |
| { | |
| "entropy": 1.35671875, | |
| "epoch": 0.36359629792860293, | |
| "grad_norm": 1.875, | |
| "learning_rate": 5.719623017853315e-06, | |
| "loss": 0.1858, | |
| "mean_token_accuracy": 0.9616824269294739, | |
| "num_tokens": 1575167487.0, | |
| "step": 14850 | |
| }, | |
| { | |
| "entropy": 1.36796875, | |
| "epoch": 0.3648205278879585, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 5.7170540343675596e-06, | |
| "loss": 0.1858, | |
| "mean_token_accuracy": 0.9607573926448822, | |
| "num_tokens": 1580657915.0, | |
| "step": 14900 | |
| }, | |
| { | |
| "entropy": 1.3684375, | |
| "epoch": 0.36604475784731405, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 5.714473917537712e-06, | |
| "loss": 0.1771, | |
| "mean_token_accuracy": 0.9625304937362671, | |
| "num_tokens": 1585664001.0, | |
| "step": 14950 | |
| }, | |
| { | |
| "entropy": 1.36109375, | |
| "epoch": 0.3672689878066696, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 5.711882677936003e-06, | |
| "loss": 0.1781, | |
| "mean_token_accuracy": 0.961945322751999, | |
| "num_tokens": 1590920113.0, | |
| "step": 15000 | |
| }, | |
| { | |
| "entropy": 1.3575, | |
| "epoch": 0.36849321776602517, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 5.709280326180242e-06, | |
| "loss": 0.1737, | |
| "mean_token_accuracy": 0.9629940688610077, | |
| "num_tokens": 1596062396.0, | |
| "step": 15050 | |
| }, | |
| { | |
| "entropy": 1.37359375, | |
| "epoch": 0.36971744772538073, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.7066668729337725e-06, | |
| "loss": 0.1782, | |
| "mean_token_accuracy": 0.9626081240177154, | |
| "num_tokens": 1601254217.0, | |
| "step": 15100 | |
| }, | |
| { | |
| "entropy": 1.36609375, | |
| "epoch": 0.3709416776847363, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 5.704042328905426e-06, | |
| "loss": 0.1851, | |
| "mean_token_accuracy": 0.9608933937549591, | |
| "num_tokens": 1606561855.0, | |
| "step": 15150 | |
| }, | |
| { | |
| "entropy": 1.34859375, | |
| "epoch": 0.37216590764409185, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 5.701406704849479e-06, | |
| "loss": 0.1893, | |
| "mean_token_accuracy": 0.9602335524559021, | |
| "num_tokens": 1612223884.0, | |
| "step": 15200 | |
| }, | |
| { | |
| "entropy": 1.36765625, | |
| "epoch": 0.3733901376034474, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 5.69876001156561e-06, | |
| "loss": 0.1837, | |
| "mean_token_accuracy": 0.9612676846981049, | |
| "num_tokens": 1617459423.0, | |
| "step": 15250 | |
| }, | |
| { | |
| "entropy": 1.366875, | |
| "epoch": 0.374614367562803, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 5.696102259898855e-06, | |
| "loss": 0.1895, | |
| "mean_token_accuracy": 0.9605361771583557, | |
| "num_tokens": 1622772691.0, | |
| "step": 15300 | |
| }, | |
| { | |
| "entropy": 1.3678125, | |
| "epoch": 0.37583859752215854, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 5.693433460739561e-06, | |
| "loss": 0.1794, | |
| "mean_token_accuracy": 0.9623438572883606, | |
| "num_tokens": 1627992421.0, | |
| "step": 15350 | |
| }, | |
| { | |
| "entropy": 1.385, | |
| "epoch": 0.37706282748151415, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 5.690753625023344e-06, | |
| "loss": 0.1903, | |
| "mean_token_accuracy": 0.9602718544006348, | |
| "num_tokens": 1633295976.0, | |
| "step": 15400 | |
| }, | |
| { | |
| "entropy": 1.36546875, | |
| "epoch": 0.3782870574408697, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 5.688062763731044e-06, | |
| "loss": 0.2002, | |
| "mean_token_accuracy": 0.9582274675369262, | |
| "num_tokens": 1638988248.0, | |
| "step": 15450 | |
| }, | |
| { | |
| "entropy": 1.35359375, | |
| "epoch": 0.3795112874002253, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 5.685360887888677e-06, | |
| "loss": 0.1789, | |
| "mean_token_accuracy": 0.9629680168628693, | |
| "num_tokens": 1644498341.0, | |
| "step": 15500 | |
| }, | |
| { | |
| "entropy": 1.369375, | |
| "epoch": 0.38073551735958083, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 5.682648008567394e-06, | |
| "loss": 0.1758, | |
| "mean_token_accuracy": 0.9636906123161316, | |
| "num_tokens": 1649900901.0, | |
| "step": 15550 | |
| }, | |
| { | |
| "entropy": 1.36546875, | |
| "epoch": 0.3819597473189364, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 5.679924136883432e-06, | |
| "loss": 0.1916, | |
| "mean_token_accuracy": 0.9601245021820068, | |
| "num_tokens": 1655743468.0, | |
| "step": 15600 | |
| }, | |
| { | |
| "entropy": 1.37828125, | |
| "epoch": 0.38318397727829195, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 5.677189283998073e-06, | |
| "loss": 0.1755, | |
| "mean_token_accuracy": 0.963598461151123, | |
| "num_tokens": 1660916320.0, | |
| "step": 15650 | |
| }, | |
| { | |
| "entropy": 1.35796875, | |
| "epoch": 0.3844082072376475, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 5.674443461117591e-06, | |
| "loss": 0.1778, | |
| "mean_token_accuracy": 0.9613646280765533, | |
| "num_tokens": 1666271922.0, | |
| "step": 15700 | |
| }, | |
| { | |
| "entropy": 1.3571875, | |
| "epoch": 0.3856324371970031, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 5.671686679493215e-06, | |
| "loss": 0.187, | |
| "mean_token_accuracy": 0.9609103786945343, | |
| "num_tokens": 1671766527.0, | |
| "step": 15750 | |
| }, | |
| { | |
| "entropy": 1.36625, | |
| "epoch": 0.38685666715635864, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 5.668918950421074e-06, | |
| "loss": 0.1886, | |
| "mean_token_accuracy": 0.9606494891643524, | |
| "num_tokens": 1677165332.0, | |
| "step": 15800 | |
| }, | |
| { | |
| "entropy": 1.3475, | |
| "epoch": 0.3880808971157142, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 5.666140285242158e-06, | |
| "loss": 0.1801, | |
| "mean_token_accuracy": 0.9625120401382447, | |
| "num_tokens": 1682494165.0, | |
| "step": 15850 | |
| }, | |
| { | |
| "entropy": 1.36125, | |
| "epoch": 0.38930512707506976, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 5.663350695342268e-06, | |
| "loss": 0.1892, | |
| "mean_token_accuracy": 0.9604367816448212, | |
| "num_tokens": 1688253134.0, | |
| "step": 15900 | |
| }, | |
| { | |
| "entropy": 1.35328125, | |
| "epoch": 0.3905293570344254, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 5.660550192151967e-06, | |
| "loss": 0.1845, | |
| "mean_token_accuracy": 0.9621007204055786, | |
| "num_tokens": 1693632232.0, | |
| "step": 15950 | |
| }, | |
| { | |
| "entropy": 1.3690625, | |
| "epoch": 0.39175358699378093, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 5.657738787146543e-06, | |
| "loss": 0.1885, | |
| "mean_token_accuracy": 0.9610405099391938, | |
| "num_tokens": 1698678337.0, | |
| "step": 16000 | |
| }, | |
| { | |
| "entropy": 1.346875, | |
| "epoch": 0.3929778169531365, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 5.654916491845947e-06, | |
| "loss": 0.1733, | |
| "mean_token_accuracy": 0.9640054357051849, | |
| "num_tokens": 1704187251.0, | |
| "step": 16050 | |
| }, | |
| { | |
| "entropy": 1.35375, | |
| "epoch": 0.39420204691249205, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 5.652083317814759e-06, | |
| "loss": 0.1745, | |
| "mean_token_accuracy": 0.9634167146682739, | |
| "num_tokens": 1709408694.0, | |
| "step": 16100 | |
| }, | |
| { | |
| "entropy": 1.34265625, | |
| "epoch": 0.3954262768718476, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 5.649239276662133e-06, | |
| "loss": 0.1724, | |
| "mean_token_accuracy": 0.963241057395935, | |
| "num_tokens": 1714585157.0, | |
| "step": 16150 | |
| }, | |
| { | |
| "entropy": 1.3303125, | |
| "epoch": 0.3966505068312032, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 5.646384380041755e-06, | |
| "loss": 0.1759, | |
| "mean_token_accuracy": 0.9634040462970733, | |
| "num_tokens": 1719749974.0, | |
| "step": 16200 | |
| }, | |
| { | |
| "entropy": 1.33890625, | |
| "epoch": 0.39787473679055874, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 5.643518639651789e-06, | |
| "loss": 0.1754, | |
| "mean_token_accuracy": 0.963290364742279, | |
| "num_tokens": 1724935979.0, | |
| "step": 16250 | |
| }, | |
| { | |
| "entropy": 1.341875, | |
| "epoch": 0.3990989667499143, | |
| "grad_norm": 3.828125, | |
| "learning_rate": 5.640642067234832e-06, | |
| "loss": 0.1869, | |
| "mean_token_accuracy": 0.9608835780620575, | |
| "num_tokens": 1729904911.0, | |
| "step": 16300 | |
| }, | |
| { | |
| "entropy": 1.3525, | |
| "epoch": 0.40032319670926986, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 5.637754674577869e-06, | |
| "loss": 0.193, | |
| "mean_token_accuracy": 0.9592759358882904, | |
| "num_tokens": 1735603402.0, | |
| "step": 16350 | |
| }, | |
| { | |
| "entropy": 1.33984375, | |
| "epoch": 0.4015474266686254, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 5.634856473512218e-06, | |
| "loss": 0.1787, | |
| "mean_token_accuracy": 0.9626182532310485, | |
| "num_tokens": 1740876722.0, | |
| "step": 16400 | |
| }, | |
| { | |
| "entropy": 1.3328125, | |
| "epoch": 0.402771656627981, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 5.631947475913489e-06, | |
| "loss": 0.1951, | |
| "mean_token_accuracy": 0.9596171510219574, | |
| "num_tokens": 1746470991.0, | |
| "step": 16450 | |
| }, | |
| { | |
| "entropy": 1.31375, | |
| "epoch": 0.40399588658733654, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 5.629027693701531e-06, | |
| "loss": 0.1646, | |
| "mean_token_accuracy": 0.9641488230228424, | |
| "num_tokens": 1751600795.0, | |
| "step": 16500 | |
| }, | |
| { | |
| "entropy": 1.3459375, | |
| "epoch": 0.40522011654669216, | |
| "grad_norm": 0.01904296875, | |
| "learning_rate": 5.626097138840379e-06, | |
| "loss": 0.1931, | |
| "mean_token_accuracy": 0.9586203134059906, | |
| "num_tokens": 1757280148.0, | |
| "step": 16550 | |
| }, | |
| { | |
| "entropy": 1.32203125, | |
| "epoch": 0.4064443465060477, | |
| "grad_norm": 3.125, | |
| "learning_rate": 5.623155823338219e-06, | |
| "loss": 0.1845, | |
| "mean_token_accuracy": 0.961804312467575, | |
| "num_tokens": 1762386072.0, | |
| "step": 16600 | |
| }, | |
| { | |
| "entropy": 1.309375, | |
| "epoch": 0.4076685764654033, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 5.62020375924732e-06, | |
| "loss": 0.1679, | |
| "mean_token_accuracy": 0.9640087175369263, | |
| "num_tokens": 1767593608.0, | |
| "step": 16650 | |
| }, | |
| { | |
| "entropy": 1.33890625, | |
| "epoch": 0.40889280642475884, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 5.617240958664e-06, | |
| "loss": 0.1778, | |
| "mean_token_accuracy": 0.9619925379753113, | |
| "num_tokens": 1772859293.0, | |
| "step": 16700 | |
| }, | |
| { | |
| "entropy": 1.3303125, | |
| "epoch": 0.4101170363841144, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 5.614267433728569e-06, | |
| "loss": 0.1784, | |
| "mean_token_accuracy": 0.9621168851852417, | |
| "num_tokens": 1778176957.0, | |
| "step": 16750 | |
| }, | |
| { | |
| "entropy": 1.33359375, | |
| "epoch": 0.41134126634346996, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 5.611283196625281e-06, | |
| "loss": 0.1876, | |
| "mean_token_accuracy": 0.9608843457698822, | |
| "num_tokens": 1783513531.0, | |
| "step": 16800 | |
| }, | |
| { | |
| "entropy": 1.31875, | |
| "epoch": 0.4125654963028255, | |
| "grad_norm": 2.375, | |
| "learning_rate": 5.6082882595822835e-06, | |
| "loss": 0.1743, | |
| "mean_token_accuracy": 0.9634191727638245, | |
| "num_tokens": 1788649179.0, | |
| "step": 16850 | |
| }, | |
| { | |
| "entropy": 1.34703125, | |
| "epoch": 0.4137897262621811, | |
| "grad_norm": 3.0, | |
| "learning_rate": 5.605282634871569e-06, | |
| "loss": 0.1846, | |
| "mean_token_accuracy": 0.9604820072650909, | |
| "num_tokens": 1794020681.0, | |
| "step": 16900 | |
| }, | |
| { | |
| "entropy": 1.341875, | |
| "epoch": 0.41501395622153664, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 5.602266334808922e-06, | |
| "loss": 0.1917, | |
| "mean_token_accuracy": 0.9598517632484436, | |
| "num_tokens": 1799786050.0, | |
| "step": 16950 | |
| }, | |
| { | |
| "entropy": 1.32484375, | |
| "epoch": 0.4162381861808922, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 5.599239371753871e-06, | |
| "loss": 0.1843, | |
| "mean_token_accuracy": 0.9613809895515442, | |
| "num_tokens": 1805308121.0, | |
| "step": 17000 | |
| }, | |
| { | |
| "entropy": 1.3296875, | |
| "epoch": 0.41746241614024776, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 5.596201758109636e-06, | |
| "loss": 0.1971, | |
| "mean_token_accuracy": 0.9585018038749695, | |
| "num_tokens": 1811016191.0, | |
| "step": 17050 | |
| }, | |
| { | |
| "entropy": 1.34390625, | |
| "epoch": 0.4186866460996034, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 5.593153506323082e-06, | |
| "loss": 0.1912, | |
| "mean_token_accuracy": 0.9609514188766479, | |
| "num_tokens": 1816538866.0, | |
| "step": 17100 | |
| }, | |
| { | |
| "entropy": 1.319375, | |
| "epoch": 0.41991087605895894, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 5.59009462888466e-06, | |
| "loss": 0.1692, | |
| "mean_token_accuracy": 0.9638219344615936, | |
| "num_tokens": 1821484676.0, | |
| "step": 17150 | |
| }, | |
| { | |
| "entropy": 1.3296875, | |
| "epoch": 0.4211351060183145, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 5.587025138328363e-06, | |
| "loss": 0.1855, | |
| "mean_token_accuracy": 0.9604250502586364, | |
| "num_tokens": 1826760752.0, | |
| "step": 17200 | |
| }, | |
| { | |
| "entropy": 1.32703125, | |
| "epoch": 0.42235933597767006, | |
| "grad_norm": 4.375, | |
| "learning_rate": 5.583945047231672e-06, | |
| "loss": 0.1756, | |
| "mean_token_accuracy": 0.9626831936836243, | |
| "num_tokens": 1831709955.0, | |
| "step": 17250 | |
| }, | |
| { | |
| "entropy": 1.3278125, | |
| "epoch": 0.4235835659370256, | |
| "grad_norm": 3.578125, | |
| "learning_rate": 5.580854368215504e-06, | |
| "loss": 0.1688, | |
| "mean_token_accuracy": 0.9641677963733674, | |
| "num_tokens": 1836539757.0, | |
| "step": 17300 | |
| }, | |
| { | |
| "entropy": 1.35453125, | |
| "epoch": 0.4248077958963812, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 5.577753113944161e-06, | |
| "loss": 0.1795, | |
| "mean_token_accuracy": 0.9620350849628448, | |
| "num_tokens": 1841748836.0, | |
| "step": 17350 | |
| }, | |
| { | |
| "entropy": 1.35484375, | |
| "epoch": 0.42603202585573674, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 5.574641297125277e-06, | |
| "loss": 0.1903, | |
| "mean_token_accuracy": 0.9602237248420715, | |
| "num_tokens": 1846964872.0, | |
| "step": 17400 | |
| }, | |
| { | |
| "entropy": 1.3465625, | |
| "epoch": 0.4272562558150923, | |
| "grad_norm": 2.375, | |
| "learning_rate": 5.5715189305097705e-06, | |
| "loss": 0.18, | |
| "mean_token_accuracy": 0.9612255036830902, | |
| "num_tokens": 1852195890.0, | |
| "step": 17450 | |
| }, | |
| { | |
| "entropy": 1.34734375, | |
| "epoch": 0.42848048577444786, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 5.568386026891784e-06, | |
| "loss": 0.1852, | |
| "mean_token_accuracy": 0.9614002680778504, | |
| "num_tokens": 1857781986.0, | |
| "step": 17500 | |
| }, | |
| { | |
| "entropy": 1.383125, | |
| "epoch": 0.4297047157338034, | |
| "grad_norm": 3.59375, | |
| "learning_rate": 5.565242599108638e-06, | |
| "loss": 0.1733, | |
| "mean_token_accuracy": 0.9632753264904023, | |
| "num_tokens": 1862697378.0, | |
| "step": 17550 | |
| }, | |
| { | |
| "entropy": 1.37734375, | |
| "epoch": 0.430928945693159, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 5.5620886600407775e-06, | |
| "loss": 0.1793, | |
| "mean_token_accuracy": 0.9618914890289306, | |
| "num_tokens": 1867900164.0, | |
| "step": 17600 | |
| }, | |
| { | |
| "entropy": 1.37453125, | |
| "epoch": 0.43215317565251454, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 5.558924222611718e-06, | |
| "loss": 0.189, | |
| "mean_token_accuracy": 0.9601231980323791, | |
| "num_tokens": 1873349723.0, | |
| "step": 17650 | |
| }, | |
| { | |
| "entropy": 1.3796875, | |
| "epoch": 0.43337740561187016, | |
| "grad_norm": 2.125, | |
| "learning_rate": 5.555749299787992e-06, | |
| "loss": 0.183, | |
| "mean_token_accuracy": 0.9612041318416595, | |
| "num_tokens": 1878516011.0, | |
| "step": 17700 | |
| }, | |
| { | |
| "entropy": 1.36796875, | |
| "epoch": 0.4346016355712257, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 5.552563904579097e-06, | |
| "loss": 0.1666, | |
| "mean_token_accuracy": 0.965571962594986, | |
| "num_tokens": 1883672436.0, | |
| "step": 17750 | |
| }, | |
| { | |
| "entropy": 1.37421875, | |
| "epoch": 0.4358258655305813, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.549368050037442e-06, | |
| "loss": 0.1822, | |
| "mean_token_accuracy": 0.9618594205379486, | |
| "num_tokens": 1889075709.0, | |
| "step": 17800 | |
| }, | |
| { | |
| "entropy": 1.3753125, | |
| "epoch": 0.43705009548993684, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 5.5461617492582955e-06, | |
| "loss": 0.1847, | |
| "mean_token_accuracy": 0.9609970545768738, | |
| "num_tokens": 1894320611.0, | |
| "step": 17850 | |
| }, | |
| { | |
| "entropy": 1.35203125, | |
| "epoch": 0.4382743254492924, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 5.542945015379727e-06, | |
| "loss": 0.1819, | |
| "mean_token_accuracy": 0.9610999655723572, | |
| "num_tokens": 1899502888.0, | |
| "step": 17900 | |
| }, | |
| { | |
| "entropy": 1.3653125, | |
| "epoch": 0.43949855540864796, | |
| "grad_norm": 3.125, | |
| "learning_rate": 5.53971786158256e-06, | |
| "loss": 0.1783, | |
| "mean_token_accuracy": 0.9628078281879425, | |
| "num_tokens": 1904727333.0, | |
| "step": 17950 | |
| }, | |
| { | |
| "entropy": 1.37265625, | |
| "epoch": 0.4407227853680035, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 5.536480301090311e-06, | |
| "loss": 0.1825, | |
| "mean_token_accuracy": 0.9612684857845306, | |
| "num_tokens": 1910269964.0, | |
| "step": 18000 | |
| }, | |
| { | |
| "entropy": 1.36875, | |
| "epoch": 0.4419470153273591, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 5.533232347169142e-06, | |
| "loss": 0.1769, | |
| "mean_token_accuracy": 0.9630991363525391, | |
| "num_tokens": 1915481678.0, | |
| "step": 18050 | |
| }, | |
| { | |
| "entropy": 1.37703125, | |
| "epoch": 0.44317124528671464, | |
| "grad_norm": 0.007720947265625, | |
| "learning_rate": 5.5299740131278e-06, | |
| "loss": 0.1776, | |
| "mean_token_accuracy": 0.9631426560878754, | |
| "num_tokens": 1920892313.0, | |
| "step": 18100 | |
| }, | |
| { | |
| "entropy": 1.3784375, | |
| "epoch": 0.4443954752460702, | |
| "grad_norm": 2.25, | |
| "learning_rate": 5.5267053123175685e-06, | |
| "loss": 0.1793, | |
| "mean_token_accuracy": 0.9618562459945679, | |
| "num_tokens": 1925855441.0, | |
| "step": 18150 | |
| }, | |
| { | |
| "entropy": 1.40484375, | |
| "epoch": 0.44561970520542576, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 5.523426258132208e-06, | |
| "loss": 0.1895, | |
| "mean_token_accuracy": 0.9602830135822296, | |
| "num_tokens": 1931433927.0, | |
| "step": 18200 | |
| }, | |
| { | |
| "entropy": 1.381875, | |
| "epoch": 0.4468439351647813, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.520136864007901e-06, | |
| "loss": 0.179, | |
| "mean_token_accuracy": 0.9617183935642243, | |
| "num_tokens": 1937093589.0, | |
| "step": 18250 | |
| }, | |
| { | |
| "entropy": 1.3784375, | |
| "epoch": 0.44806816512413694, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 5.516837143423201e-06, | |
| "loss": 0.1807, | |
| "mean_token_accuracy": 0.9620720791816711, | |
| "num_tokens": 1942266157.0, | |
| "step": 18300 | |
| }, | |
| { | |
| "entropy": 1.3815625, | |
| "epoch": 0.4492923950834925, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 5.5135271098989745e-06, | |
| "loss": 0.1739, | |
| "mean_token_accuracy": 0.9636857545375824, | |
| "num_tokens": 1947254229.0, | |
| "step": 18350 | |
| }, | |
| { | |
| "entropy": 1.39609375, | |
| "epoch": 0.45051662504284806, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 5.510206776998347e-06, | |
| "loss": 0.2004, | |
| "mean_token_accuracy": 0.9576922535896302, | |
| "num_tokens": 1953541405.0, | |
| "step": 18400 | |
| }, | |
| { | |
| "entropy": 1.38515625, | |
| "epoch": 0.4517408550022036, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 5.5068761583266446e-06, | |
| "loss": 0.1815, | |
| "mean_token_accuracy": 0.9612382733821869, | |
| "num_tokens": 1958947967.0, | |
| "step": 18450 | |
| }, | |
| { | |
| "entropy": 1.38546875, | |
| "epoch": 0.4529650849615592, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 5.503535267531341e-06, | |
| "loss": 0.1756, | |
| "mean_token_accuracy": 0.9630067098140717, | |
| "num_tokens": 1964172588.0, | |
| "step": 18500 | |
| }, | |
| { | |
| "entropy": 1.37171875, | |
| "epoch": 0.45418931492091474, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 5.500184118302001e-06, | |
| "loss": 0.1737, | |
| "mean_token_accuracy": 0.9629046404361725, | |
| "num_tokens": 1969146021.0, | |
| "step": 18550 | |
| }, | |
| { | |
| "entropy": 1.35796875, | |
| "epoch": 0.4554135448802703, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 5.496822724370225e-06, | |
| "loss": 0.1726, | |
| "mean_token_accuracy": 0.9641622114181518, | |
| "num_tokens": 1974171622.0, | |
| "step": 18600 | |
| }, | |
| { | |
| "entropy": 1.35109375, | |
| "epoch": 0.45663777483962587, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 5.493451099509589e-06, | |
| "loss": 0.1797, | |
| "mean_token_accuracy": 0.9615970349311829, | |
| "num_tokens": 1979453512.0, | |
| "step": 18650 | |
| }, | |
| { | |
| "entropy": 1.3515625, | |
| "epoch": 0.4578620047989814, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 5.490069257535595e-06, | |
| "loss": 0.1786, | |
| "mean_token_accuracy": 0.9625794899463653, | |
| "num_tokens": 1984570640.0, | |
| "step": 18700 | |
| }, | |
| { | |
| "entropy": 1.37140625, | |
| "epoch": 0.459086234758337, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 5.4866772123056055e-06, | |
| "loss": 0.1928, | |
| "mean_token_accuracy": 0.9605653440952301, | |
| "num_tokens": 1990199710.0, | |
| "step": 18750 | |
| }, | |
| { | |
| "entropy": 1.375625, | |
| "epoch": 0.46031046471769255, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 5.483274977718797e-06, | |
| "loss": 0.1885, | |
| "mean_token_accuracy": 0.9597025084495544, | |
| "num_tokens": 1995518980.0, | |
| "step": 18800 | |
| }, | |
| { | |
| "entropy": 1.37984375, | |
| "epoch": 0.46153469467704816, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 5.479862567716095e-06, | |
| "loss": 0.1703, | |
| "mean_token_accuracy": 0.9633987152576446, | |
| "num_tokens": 2000479352.0, | |
| "step": 18850 | |
| }, | |
| { | |
| "entropy": 1.38640625, | |
| "epoch": 0.4627589246364037, | |
| "grad_norm": 3.671875, | |
| "learning_rate": 5.476439996280118e-06, | |
| "loss": 0.1941, | |
| "mean_token_accuracy": 0.959332902431488, | |
| "num_tokens": 2005933401.0, | |
| "step": 18900 | |
| }, | |
| { | |
| "entropy": 1.3975, | |
| "epoch": 0.4639831545957593, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 5.473007277435125e-06, | |
| "loss": 0.1731, | |
| "mean_token_accuracy": 0.9638979506492614, | |
| "num_tokens": 2010666027.0, | |
| "step": 18950 | |
| }, | |
| { | |
| "entropy": 1.38140625, | |
| "epoch": 0.46520738455511484, | |
| "grad_norm": 3.640625, | |
| "learning_rate": 5.469564425246953e-06, | |
| "loss": 0.1852, | |
| "mean_token_accuracy": 0.9617711079120635, | |
| "num_tokens": 2016049085.0, | |
| "step": 19000 | |
| }, | |
| { | |
| "entropy": 1.37015625, | |
| "epoch": 0.4664316145144704, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 5.46611145382296e-06, | |
| "loss": 0.1678, | |
| "mean_token_accuracy": 0.9642109513282776, | |
| "num_tokens": 2021148599.0, | |
| "step": 19050 | |
| }, | |
| { | |
| "entropy": 1.35875, | |
| "epoch": 0.46765584447382597, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 5.462648377311973e-06, | |
| "loss": 0.1785, | |
| "mean_token_accuracy": 0.9610287690162659, | |
| "num_tokens": 2026306056.0, | |
| "step": 19100 | |
| }, | |
| { | |
| "entropy": 1.34953125, | |
| "epoch": 0.4688800744331815, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 5.459175209904221e-06, | |
| "loss": 0.1769, | |
| "mean_token_accuracy": 0.9627043080329895, | |
| "num_tokens": 2031493225.0, | |
| "step": 19150 | |
| }, | |
| { | |
| "entropy": 1.34484375, | |
| "epoch": 0.4701043043925371, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 5.455691965831281e-06, | |
| "loss": 0.1758, | |
| "mean_token_accuracy": 0.9625547790527343, | |
| "num_tokens": 2036730518.0, | |
| "step": 19200 | |
| }, | |
| { | |
| "entropy": 1.3490625, | |
| "epoch": 0.47132853435189265, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 5.452198659366023e-06, | |
| "loss": 0.167, | |
| "mean_token_accuracy": 0.9653509867191314, | |
| "num_tokens": 2041648821.0, | |
| "step": 19250 | |
| }, | |
| { | |
| "entropy": 1.33796875, | |
| "epoch": 0.4725527643112482, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 5.448695304822545e-06, | |
| "loss": 0.1733, | |
| "mean_token_accuracy": 0.9637433886528015, | |
| "num_tokens": 2046695948.0, | |
| "step": 19300 | |
| }, | |
| { | |
| "entropy": 1.35109375, | |
| "epoch": 0.47377699427060377, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 5.445181916556123e-06, | |
| "loss": 0.1712, | |
| "mean_token_accuracy": 0.96383709192276, | |
| "num_tokens": 2051915262.0, | |
| "step": 19350 | |
| }, | |
| { | |
| "entropy": 1.3453125, | |
| "epoch": 0.47500122422995933, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 5.4416585089631414e-06, | |
| "loss": 0.163, | |
| "mean_token_accuracy": 0.9646891450881958, | |
| "num_tokens": 2056999566.0, | |
| "step": 19400 | |
| }, | |
| { | |
| "entropy": 1.36125, | |
| "epoch": 0.47622545418931495, | |
| "grad_norm": 2.875, | |
| "learning_rate": 5.438125096481043e-06, | |
| "loss": 0.1833, | |
| "mean_token_accuracy": 0.96080885887146, | |
| "num_tokens": 2062335975.0, | |
| "step": 19450 | |
| }, | |
| { | |
| "entropy": 1.368125, | |
| "epoch": 0.4774496841486705, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 5.434581693588263e-06, | |
| "loss": 0.175, | |
| "mean_token_accuracy": 0.9632956290245056, | |
| "num_tokens": 2067247038.0, | |
| "step": 19500 | |
| }, | |
| { | |
| "entropy": 1.36484375, | |
| "epoch": 0.47867391410802607, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 5.4310283148041775e-06, | |
| "loss": 0.185, | |
| "mean_token_accuracy": 0.9606440508365631, | |
| "num_tokens": 2072775995.0, | |
| "step": 19550 | |
| }, | |
| { | |
| "entropy": 1.36171875, | |
| "epoch": 0.4798981440673816, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 5.427464974689038e-06, | |
| "loss": 0.1772, | |
| "mean_token_accuracy": 0.963237328529358, | |
| "num_tokens": 2078139054.0, | |
| "step": 19600 | |
| }, | |
| { | |
| "entropy": 1.35703125, | |
| "epoch": 0.4811223740267372, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 5.42389168784391e-06, | |
| "loss": 0.1726, | |
| "mean_token_accuracy": 0.9635715174674988, | |
| "num_tokens": 2083527202.0, | |
| "step": 19650 | |
| }, | |
| { | |
| "entropy": 1.37875, | |
| "epoch": 0.48234660398609275, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 5.4203084689106225e-06, | |
| "loss": 0.1927, | |
| "mean_token_accuracy": 0.9599621570110322, | |
| "num_tokens": 2089385771.0, | |
| "step": 19700 | |
| }, | |
| { | |
| "entropy": 1.34265625, | |
| "epoch": 0.4835708339454483, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 5.4167153325716976e-06, | |
| "loss": 0.1663, | |
| "mean_token_accuracy": 0.9641843712329865, | |
| "num_tokens": 2094456460.0, | |
| "step": 19750 | |
| }, | |
| { | |
| "entropy": 1.3609375, | |
| "epoch": 0.48479506390480387, | |
| "grad_norm": 3.734375, | |
| "learning_rate": 5.413112293550296e-06, | |
| "loss": 0.181, | |
| "mean_token_accuracy": 0.9612398469448089, | |
| "num_tokens": 2099504284.0, | |
| "step": 19800 | |
| }, | |
| { | |
| "entropy": 1.3709375, | |
| "epoch": 0.48601929386415943, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 5.409499366610154e-06, | |
| "loss": 0.1699, | |
| "mean_token_accuracy": 0.9642571318149566, | |
| "num_tokens": 2104524371.0, | |
| "step": 19850 | |
| }, | |
| { | |
| "entropy": 1.378125, | |
| "epoch": 0.487243523823515, | |
| "grad_norm": 5.53125, | |
| "learning_rate": 5.405876566555529e-06, | |
| "loss": 0.181, | |
| "mean_token_accuracy": 0.9618199968338013, | |
| "num_tokens": 2109740174.0, | |
| "step": 19900 | |
| }, | |
| { | |
| "entropy": 1.40078125, | |
| "epoch": 0.48846775378287055, | |
| "grad_norm": 2.0, | |
| "learning_rate": 5.402243908231129e-06, | |
| "loss": 0.1804, | |
| "mean_token_accuracy": 0.962717422246933, | |
| "num_tokens": 2115362415.0, | |
| "step": 19950 | |
| }, | |
| { | |
| "entropy": 1.37703125, | |
| "epoch": 0.48969198374222617, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 5.398601406522059e-06, | |
| "loss": 0.19, | |
| "mean_token_accuracy": 0.9599020183086395, | |
| "num_tokens": 2121188022.0, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.48969198374222617, | |
| "eval_entropy": 1.366015625, | |
| "eval_loss": 0.1947789192199707, | |
| "eval_mean_token_accuracy": 0.9590674425164859, | |
| "eval_num_tokens": 2121188022.0, | |
| "eval_runtime": 605.3557, | |
| "eval_samples_per_second": 15.951, | |
| "eval_steps_per_second": 0.2, | |
| "step": 20000 | |
| }, | |
| { | |
| "entropy": 1.36578125, | |
| "epoch": 0.4909162137015817, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 5.3949490763537594e-06, | |
| "loss": 0.1838, | |
| "mean_token_accuracy": 0.9606946921348571, | |
| "num_tokens": 2126472622.0, | |
| "step": 20050 | |
| }, | |
| { | |
| "entropy": 1.36359375, | |
| "epoch": 0.4921404436609373, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 5.391286932691941e-06, | |
| "loss": 0.1717, | |
| "mean_token_accuracy": 0.963376579284668, | |
| "num_tokens": 2131377659.0, | |
| "step": 20100 | |
| }, | |
| { | |
| "entropy": 1.37875, | |
| "epoch": 0.49336467362029285, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 5.38761499054253e-06, | |
| "loss": 0.1855, | |
| "mean_token_accuracy": 0.9612623798847199, | |
| "num_tokens": 2136546167.0, | |
| "step": 20150 | |
| }, | |
| { | |
| "entropy": 1.37296875, | |
| "epoch": 0.4945889035796484, | |
| "grad_norm": 4.40625, | |
| "learning_rate": 5.383933264951596e-06, | |
| "loss": 0.1826, | |
| "mean_token_accuracy": 0.9621403360366821, | |
| "num_tokens": 2141814792.0, | |
| "step": 20200 | |
| }, | |
| { | |
| "entropy": 1.37328125, | |
| "epoch": 0.49581313353900397, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 5.3802417710053056e-06, | |
| "loss": 0.1804, | |
| "mean_token_accuracy": 0.9616746437549591, | |
| "num_tokens": 2147071830.0, | |
| "step": 20250 | |
| }, | |
| { | |
| "entropy": 1.38625, | |
| "epoch": 0.49703736349835953, | |
| "grad_norm": 3.375, | |
| "learning_rate": 5.376540523829846e-06, | |
| "loss": 0.1782, | |
| "mean_token_accuracy": 0.9625440466403962, | |
| "num_tokens": 2152428456.0, | |
| "step": 20300 | |
| }, | |
| { | |
| "entropy": 1.3896875, | |
| "epoch": 0.4982615934577151, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 5.372829538591368e-06, | |
| "loss": 0.1876, | |
| "mean_token_accuracy": 0.9597011947631836, | |
| "num_tokens": 2157932348.0, | |
| "step": 20350 | |
| }, | |
| { | |
| "entropy": 1.38671875, | |
| "epoch": 0.49948582341707065, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 5.369108830495932e-06, | |
| "loss": 0.1791, | |
| "mean_token_accuracy": 0.9618503451347351, | |
| "num_tokens": 2163273400.0, | |
| "step": 20400 | |
| }, | |
| { | |
| "entropy": 1.39640625, | |
| "epoch": 0.5007100533764263, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 5.365378414789431e-06, | |
| "loss": 0.1744, | |
| "mean_token_accuracy": 0.9630714511871338, | |
| "num_tokens": 2168498693.0, | |
| "step": 20450 | |
| }, | |
| { | |
| "entropy": 1.38453125, | |
| "epoch": 0.5019342833357818, | |
| "grad_norm": 5.0625, | |
| "learning_rate": 5.361638306757539e-06, | |
| "loss": 0.1757, | |
| "mean_token_accuracy": 0.963210039138794, | |
| "num_tokens": 2173679268.0, | |
| "step": 20500 | |
| }, | |
| { | |
| "entropy": 1.40171875, | |
| "epoch": 0.5031585132951374, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 5.357888521725646e-06, | |
| "loss": 0.1827, | |
| "mean_token_accuracy": 0.9613598906993865, | |
| "num_tokens": 2178826743.0, | |
| "step": 20550 | |
| }, | |
| { | |
| "entropy": 1.3775, | |
| "epoch": 0.504382743254493, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 5.354129075058793e-06, | |
| "loss": 0.1786, | |
| "mean_token_accuracy": 0.9626466917991638, | |
| "num_tokens": 2184130873.0, | |
| "step": 20600 | |
| }, | |
| { | |
| "entropy": 1.35796875, | |
| "epoch": 0.5056069732138485, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 5.35035998216161e-06, | |
| "loss": 0.1699, | |
| "mean_token_accuracy": 0.9637439405918121, | |
| "num_tokens": 2189388837.0, | |
| "step": 20650 | |
| }, | |
| { | |
| "entropy": 1.38328125, | |
| "epoch": 0.5068312031732041, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 5.3465812584782545e-06, | |
| "loss": 0.1964, | |
| "mean_token_accuracy": 0.9594271278381348, | |
| "num_tokens": 2195050047.0, | |
| "step": 20700 | |
| }, | |
| { | |
| "entropy": 1.34203125, | |
| "epoch": 0.5080554331325596, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 5.342792919492344e-06, | |
| "loss": 0.1749, | |
| "mean_token_accuracy": 0.9626959478855133, | |
| "num_tokens": 2200302347.0, | |
| "step": 20750 | |
| }, | |
| { | |
| "entropy": 1.356875, | |
| "epoch": 0.5092796630919152, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 5.338994980726901e-06, | |
| "loss": 0.1794, | |
| "mean_token_accuracy": 0.9620554232597351, | |
| "num_tokens": 2205512738.0, | |
| "step": 20800 | |
| }, | |
| { | |
| "entropy": 1.3575, | |
| "epoch": 0.5105038930512708, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 5.335187457744277e-06, | |
| "loss": 0.1823, | |
| "mean_token_accuracy": 0.9618464136123657, | |
| "num_tokens": 2210651777.0, | |
| "step": 20850 | |
| }, | |
| { | |
| "entropy": 1.33390625, | |
| "epoch": 0.5117281230106263, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 5.3313703661461e-06, | |
| "loss": 0.1819, | |
| "mean_token_accuracy": 0.9613965570926666, | |
| "num_tokens": 2215880518.0, | |
| "step": 20900 | |
| }, | |
| { | |
| "entropy": 1.3253125, | |
| "epoch": 0.5129523529699819, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 5.327543721573206e-06, | |
| "loss": 0.1752, | |
| "mean_token_accuracy": 0.9638756012916565, | |
| "num_tokens": 2221245311.0, | |
| "step": 20950 | |
| }, | |
| { | |
| "entropy": 1.32234375, | |
| "epoch": 0.5141765829293374, | |
| "grad_norm": 3.28125, | |
| "learning_rate": 5.323707539705574e-06, | |
| "loss": 0.1748, | |
| "mean_token_accuracy": 0.963612312078476, | |
| "num_tokens": 2226359631.0, | |
| "step": 21000 | |
| }, | |
| { | |
| "entropy": 1.30609375, | |
| "epoch": 0.515400812888693, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 5.3198618362622614e-06, | |
| "loss": 0.1702, | |
| "mean_token_accuracy": 0.9639462912082672, | |
| "num_tokens": 2231563334.0, | |
| "step": 21050 | |
| }, | |
| { | |
| "entropy": 1.31953125, | |
| "epoch": 0.5166250428480486, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 5.316006627001344e-06, | |
| "loss": 0.1805, | |
| "mean_token_accuracy": 0.961728732585907, | |
| "num_tokens": 2236847732.0, | |
| "step": 21100 | |
| }, | |
| { | |
| "entropy": 1.32125, | |
| "epoch": 0.5178492728074041, | |
| "grad_norm": 2.375, | |
| "learning_rate": 5.312141927719849e-06, | |
| "loss": 0.172, | |
| "mean_token_accuracy": 0.9636801743507385, | |
| "num_tokens": 2242148614.0, | |
| "step": 21150 | |
| }, | |
| { | |
| "entropy": 1.3134375, | |
| "epoch": 0.5190735027667597, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 5.308267754253684e-06, | |
| "loss": 0.1755, | |
| "mean_token_accuracy": 0.9632048571109771, | |
| "num_tokens": 2247694541.0, | |
| "step": 21200 | |
| }, | |
| { | |
| "entropy": 1.36203125, | |
| "epoch": 0.5202977327261152, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 5.304384122477584e-06, | |
| "loss": 0.1983, | |
| "mean_token_accuracy": 0.9583926129341126, | |
| "num_tokens": 2253386473.0, | |
| "step": 21250 | |
| }, | |
| { | |
| "entropy": 1.34703125, | |
| "epoch": 0.5215219626854708, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.300491048305037e-06, | |
| "loss": 0.1753, | |
| "mean_token_accuracy": 0.9633457577228546, | |
| "num_tokens": 2258591416.0, | |
| "step": 21300 | |
| }, | |
| { | |
| "entropy": 1.3553125, | |
| "epoch": 0.5227461926448264, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 5.296588547688221e-06, | |
| "loss": 0.1809, | |
| "mean_token_accuracy": 0.9621423208713531, | |
| "num_tokens": 2263908714.0, | |
| "step": 21350 | |
| }, | |
| { | |
| "entropy": 1.35140625, | |
| "epoch": 0.5239704226041819, | |
| "grad_norm": 2.5, | |
| "learning_rate": 5.292676636617946e-06, | |
| "loss": 0.1746, | |
| "mean_token_accuracy": 0.9637291979789734, | |
| "num_tokens": 2269014561.0, | |
| "step": 21400 | |
| }, | |
| { | |
| "entropy": 1.3440625, | |
| "epoch": 0.5251946525635376, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 5.2887553311235736e-06, | |
| "loss": 0.1753, | |
| "mean_token_accuracy": 0.963253127336502, | |
| "num_tokens": 2274143387.0, | |
| "step": 21450 | |
| }, | |
| { | |
| "entropy": 1.34984375, | |
| "epoch": 0.5264188825228932, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 5.284824647272965e-06, | |
| "loss": 0.1751, | |
| "mean_token_accuracy": 0.9633476626873017, | |
| "num_tokens": 2279551937.0, | |
| "step": 21500 | |
| }, | |
| { | |
| "entropy": 1.3815625, | |
| "epoch": 0.5276431124822487, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 5.280884601172408e-06, | |
| "loss": 0.1901, | |
| "mean_token_accuracy": 0.9609255039691925, | |
| "num_tokens": 2284998091.0, | |
| "step": 21550 | |
| }, | |
| { | |
| "entropy": 1.37375, | |
| "epoch": 0.5288673424416043, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 5.276935208966554e-06, | |
| "loss": 0.1805, | |
| "mean_token_accuracy": 0.9621355581283569, | |
| "num_tokens": 2290404419.0, | |
| "step": 21600 | |
| }, | |
| { | |
| "entropy": 1.35875, | |
| "epoch": 0.5300915724009598, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 5.272976486838349e-06, | |
| "loss": 0.1839, | |
| "mean_token_accuracy": 0.9618707728385926, | |
| "num_tokens": 2295855308.0, | |
| "step": 21650 | |
| }, | |
| { | |
| "entropy": 1.34296875, | |
| "epoch": 0.5313158023603154, | |
| "grad_norm": 3.84375, | |
| "learning_rate": 5.269008451008974e-06, | |
| "loss": 0.1683, | |
| "mean_token_accuracy": 0.9649140095710754, | |
| "num_tokens": 2300888682.0, | |
| "step": 21700 | |
| }, | |
| { | |
| "entropy": 1.3709375, | |
| "epoch": 0.532540032319671, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 5.265031117737765e-06, | |
| "loss": 0.1856, | |
| "mean_token_accuracy": 0.9606757354736328, | |
| "num_tokens": 2306530067.0, | |
| "step": 21750 | |
| }, | |
| { | |
| "entropy": 1.3528125, | |
| "epoch": 0.5337642622790265, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 5.261044503322165e-06, | |
| "loss": 0.1826, | |
| "mean_token_accuracy": 0.9615514528751373, | |
| "num_tokens": 2312022301.0, | |
| "step": 21800 | |
| }, | |
| { | |
| "entropy": 1.35828125, | |
| "epoch": 0.5349884922383821, | |
| "grad_norm": 2.5, | |
| "learning_rate": 5.257048624097639e-06, | |
| "loss": 0.1826, | |
| "mean_token_accuracy": 0.9617948019504547, | |
| "num_tokens": 2317336429.0, | |
| "step": 21850 | |
| }, | |
| { | |
| "entropy": 1.365625, | |
| "epoch": 0.5362127221977376, | |
| "grad_norm": 3.25, | |
| "learning_rate": 5.253043496437619e-06, | |
| "loss": 0.1875, | |
| "mean_token_accuracy": 0.9604008531570435, | |
| "num_tokens": 2322605855.0, | |
| "step": 21900 | |
| }, | |
| { | |
| "entropy": 1.3403125, | |
| "epoch": 0.5374369521570932, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 5.249029136753436e-06, | |
| "loss": 0.1757, | |
| "mean_token_accuracy": 0.9632094752788544, | |
| "num_tokens": 2328163176.0, | |
| "step": 21950 | |
| }, | |
| { | |
| "entropy": 1.3684375, | |
| "epoch": 0.5386611821164488, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 5.245005561494242e-06, | |
| "loss": 0.1804, | |
| "mean_token_accuracy": 0.9627390444278717, | |
| "num_tokens": 2333245056.0, | |
| "step": 22000 | |
| }, | |
| { | |
| "entropy": 1.384375, | |
| "epoch": 0.5398854120758043, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 5.2409727871469585e-06, | |
| "loss": 0.1926, | |
| "mean_token_accuracy": 0.9592073571681976, | |
| "num_tokens": 2338758359.0, | |
| "step": 22050 | |
| }, | |
| { | |
| "entropy": 1.35546875, | |
| "epoch": 0.5411096420351599, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 5.236930830236195e-06, | |
| "loss": 0.179, | |
| "mean_token_accuracy": 0.9627534210681915, | |
| "num_tokens": 2344276248.0, | |
| "step": 22100 | |
| }, | |
| { | |
| "entropy": 1.34953125, | |
| "epoch": 0.5423338719945154, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 5.232879707324194e-06, | |
| "loss": 0.1634, | |
| "mean_token_accuracy": 0.965645101070404, | |
| "num_tokens": 2349615408.0, | |
| "step": 22150 | |
| }, | |
| { | |
| "entropy": 1.37578125, | |
| "epoch": 0.543558101953871, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 5.228819435010749e-06, | |
| "loss": 0.1678, | |
| "mean_token_accuracy": 0.9645935368537902, | |
| "num_tokens": 2354669027.0, | |
| "step": 22200 | |
| }, | |
| { | |
| "entropy": 1.3884375, | |
| "epoch": 0.5447823319132266, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 5.224750029933149e-06, | |
| "loss": 0.1811, | |
| "mean_token_accuracy": 0.9621996486186981, | |
| "num_tokens": 2359585884.0, | |
| "step": 22250 | |
| }, | |
| { | |
| "entropy": 1.38390625, | |
| "epoch": 0.5460065618725821, | |
| "grad_norm": 2.375, | |
| "learning_rate": 5.220671508766104e-06, | |
| "loss": 0.1716, | |
| "mean_token_accuracy": 0.9631420743465423, | |
| "num_tokens": 2364818902.0, | |
| "step": 22300 | |
| }, | |
| { | |
| "entropy": 1.40234375, | |
| "epoch": 0.5472307918319377, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 5.216583888221676e-06, | |
| "loss": 0.1888, | |
| "mean_token_accuracy": 0.9602623808383942, | |
| "num_tokens": 2370249320.0, | |
| "step": 22350 | |
| }, | |
| { | |
| "entropy": 1.3871875, | |
| "epoch": 0.5484550217912932, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 5.212487185049215e-06, | |
| "loss": 0.1656, | |
| "mean_token_accuracy": 0.9649445843696595, | |
| "num_tokens": 2375353386.0, | |
| "step": 22400 | |
| }, | |
| { | |
| "entropy": 1.415625, | |
| "epoch": 0.5496792517506488, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 5.208381416035286e-06, | |
| "loss": 0.1863, | |
| "mean_token_accuracy": 0.9609400224685669, | |
| "num_tokens": 2380836963.0, | |
| "step": 22450 | |
| }, | |
| { | |
| "entropy": 1.395, | |
| "epoch": 0.5509034817100044, | |
| "grad_norm": 0.00396728515625, | |
| "learning_rate": 5.204266598003604e-06, | |
| "loss": 0.1759, | |
| "mean_token_accuracy": 0.9629833257198334, | |
| "num_tokens": 2385836401.0, | |
| "step": 22500 | |
| }, | |
| { | |
| "entropy": 1.39046875, | |
| "epoch": 0.5521277116693599, | |
| "grad_norm": 3.671875, | |
| "learning_rate": 5.20014274781496e-06, | |
| "loss": 0.176, | |
| "mean_token_accuracy": 0.9624341118335724, | |
| "num_tokens": 2391023729.0, | |
| "step": 22550 | |
| }, | |
| { | |
| "entropy": 1.410625, | |
| "epoch": 0.5533519416287156, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 5.196009882367158e-06, | |
| "loss": 0.175, | |
| "mean_token_accuracy": 0.9633600628376007, | |
| "num_tokens": 2396091073.0, | |
| "step": 22600 | |
| }, | |
| { | |
| "entropy": 1.40546875, | |
| "epoch": 0.5545761715880712, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 5.191868018594941e-06, | |
| "loss": 0.1828, | |
| "mean_token_accuracy": 0.9620015740394592, | |
| "num_tokens": 2401188218.0, | |
| "step": 22650 | |
| }, | |
| { | |
| "entropy": 1.4009375, | |
| "epoch": 0.5558004015474267, | |
| "grad_norm": 3.328125, | |
| "learning_rate": 5.187717173469924e-06, | |
| "loss": 0.1711, | |
| "mean_token_accuracy": 0.9637360453605652, | |
| "num_tokens": 2406245988.0, | |
| "step": 22700 | |
| }, | |
| { | |
| "entropy": 1.39234375, | |
| "epoch": 0.5570246315067823, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 5.183557364000523e-06, | |
| "loss": 0.1737, | |
| "mean_token_accuracy": 0.9634659576416016, | |
| "num_tokens": 2411368109.0, | |
| "step": 22750 | |
| }, | |
| { | |
| "entropy": 1.40296875, | |
| "epoch": 0.5582488614661378, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 5.179388607231889e-06, | |
| "loss": 0.1728, | |
| "mean_token_accuracy": 0.9633192873001098, | |
| "num_tokens": 2416689928.0, | |
| "step": 22800 | |
| }, | |
| { | |
| "entropy": 1.410625, | |
| "epoch": 0.5594730914254934, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 5.17521092024583e-06, | |
| "loss": 0.1867, | |
| "mean_token_accuracy": 0.9608077311515808, | |
| "num_tokens": 2422352742.0, | |
| "step": 22850 | |
| }, | |
| { | |
| "entropy": 1.39109375, | |
| "epoch": 0.560697321384849, | |
| "grad_norm": 0.08642578125, | |
| "learning_rate": 5.171024320160752e-06, | |
| "loss": 0.1667, | |
| "mean_token_accuracy": 0.9654168891906738, | |
| "num_tokens": 2427576584.0, | |
| "step": 22900 | |
| }, | |
| { | |
| "entropy": 1.38734375, | |
| "epoch": 0.5619215513442045, | |
| "grad_norm": 2.75, | |
| "learning_rate": 5.166828824131578e-06, | |
| "loss": 0.1696, | |
| "mean_token_accuracy": 0.9640141320228577, | |
| "num_tokens": 2432765937.0, | |
| "step": 22950 | |
| }, | |
| { | |
| "entropy": 1.3884375, | |
| "epoch": 0.5631457813035601, | |
| "grad_norm": 2.75, | |
| "learning_rate": 5.162624449349686e-06, | |
| "loss": 0.1801, | |
| "mean_token_accuracy": 0.9613782787322998, | |
| "num_tokens": 2437980184.0, | |
| "step": 23000 | |
| }, | |
| { | |
| "entropy": 1.3728125, | |
| "epoch": 0.5643700112629156, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 5.158411213042835e-06, | |
| "loss": 0.1675, | |
| "mean_token_accuracy": 0.9656554198265076, | |
| "num_tokens": 2443001633.0, | |
| "step": 23050 | |
| }, | |
| { | |
| "entropy": 1.39265625, | |
| "epoch": 0.5655942412222712, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.154189132475095e-06, | |
| "loss": 0.1826, | |
| "mean_token_accuracy": 0.9614216196537018, | |
| "num_tokens": 2448599009.0, | |
| "step": 23100 | |
| }, | |
| { | |
| "entropy": 1.3725, | |
| "epoch": 0.5668184711816268, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 5.149958224946776e-06, | |
| "loss": 0.1871, | |
| "mean_token_accuracy": 0.9604478991031646, | |
| "num_tokens": 2454134698.0, | |
| "step": 23150 | |
| }, | |
| { | |
| "entropy": 1.3503125, | |
| "epoch": 0.5680427011409823, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 5.145718507794354e-06, | |
| "loss": 0.1725, | |
| "mean_token_accuracy": 0.9635867273807526, | |
| "num_tokens": 2459430485.0, | |
| "step": 23200 | |
| }, | |
| { | |
| "entropy": 1.3696875, | |
| "epoch": 0.5692669311003379, | |
| "grad_norm": 2.0, | |
| "learning_rate": 5.141469998390408e-06, | |
| "loss": 0.1778, | |
| "mean_token_accuracy": 0.9624897265434265, | |
| "num_tokens": 2464814573.0, | |
| "step": 23250 | |
| }, | |
| { | |
| "entropy": 1.34359375, | |
| "epoch": 0.5704911610596934, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 5.1372127141435415e-06, | |
| "loss": 0.1866, | |
| "mean_token_accuracy": 0.961111787557602, | |
| "num_tokens": 2470288053.0, | |
| "step": 23300 | |
| }, | |
| { | |
| "entropy": 1.36140625, | |
| "epoch": 0.571715391019049, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 5.132946672498313e-06, | |
| "loss": 0.1847, | |
| "mean_token_accuracy": 0.9609505522251129, | |
| "num_tokens": 2475912972.0, | |
| "step": 23350 | |
| }, | |
| { | |
| "entropy": 1.3640625, | |
| "epoch": 0.5729396209784046, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 5.128671890935168e-06, | |
| "loss": 0.1868, | |
| "mean_token_accuracy": 0.9606727063655853, | |
| "num_tokens": 2481260397.0, | |
| "step": 23400 | |
| }, | |
| { | |
| "entropy": 1.36171875, | |
| "epoch": 0.5741638509377601, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 5.12438838697036e-06, | |
| "loss": 0.1667, | |
| "mean_token_accuracy": 0.9649614369869233, | |
| "num_tokens": 2486480334.0, | |
| "step": 23450 | |
| }, | |
| { | |
| "entropy": 1.34078125, | |
| "epoch": 0.5753880808971157, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 5.120096178155887e-06, | |
| "loss": 0.1739, | |
| "mean_token_accuracy": 0.9637984907627106, | |
| "num_tokens": 2491784273.0, | |
| "step": 23500 | |
| }, | |
| { | |
| "entropy": 1.37375, | |
| "epoch": 0.5766123108564712, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 5.115795282079414e-06, | |
| "loss": 0.1825, | |
| "mean_token_accuracy": 0.9622078704833984, | |
| "num_tokens": 2496936761.0, | |
| "step": 23550 | |
| }, | |
| { | |
| "entropy": 1.37890625, | |
| "epoch": 0.5778365408158268, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 5.111485716364204e-06, | |
| "loss": 0.1713, | |
| "mean_token_accuracy": 0.9633621573448181, | |
| "num_tokens": 2502372671.0, | |
| "step": 23600 | |
| }, | |
| { | |
| "entropy": 1.37671875, | |
| "epoch": 0.5790607707751824, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 5.107167498669044e-06, | |
| "loss": 0.1888, | |
| "mean_token_accuracy": 0.9600040495395661, | |
| "num_tokens": 2508248084.0, | |
| "step": 23650 | |
| }, | |
| { | |
| "entropy": 1.3646875, | |
| "epoch": 0.5802850007345379, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 5.102840646688173e-06, | |
| "loss": 0.1778, | |
| "mean_token_accuracy": 0.9631288397312164, | |
| "num_tokens": 2513722383.0, | |
| "step": 23700 | |
| }, | |
| { | |
| "entropy": 1.3534375, | |
| "epoch": 0.5815092306938935, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 5.0985051781512076e-06, | |
| "loss": 0.1853, | |
| "mean_token_accuracy": 0.9618443667888641, | |
| "num_tokens": 2518947610.0, | |
| "step": 23750 | |
| }, | |
| { | |
| "entropy": 1.34390625, | |
| "epoch": 0.5827334606532492, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 5.094161110823076e-06, | |
| "loss": 0.178, | |
| "mean_token_accuracy": 0.963310706615448, | |
| "num_tokens": 2524269424.0, | |
| "step": 23800 | |
| }, | |
| { | |
| "entropy": 1.35328125, | |
| "epoch": 0.5839576906126047, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 5.089808462503938e-06, | |
| "loss": 0.1839, | |
| "mean_token_accuracy": 0.9614792597293854, | |
| "num_tokens": 2529803600.0, | |
| "step": 23850 | |
| }, | |
| { | |
| "entropy": 1.3525, | |
| "epoch": 0.5851819205719603, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 5.085447251029113e-06, | |
| "loss": 0.1721, | |
| "mean_token_accuracy": 0.963988184928894, | |
| "num_tokens": 2534916174.0, | |
| "step": 23900 | |
| }, | |
| { | |
| "entropy": 1.35859375, | |
| "epoch": 0.5864061505313158, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.081077494269013e-06, | |
| "loss": 0.1857, | |
| "mean_token_accuracy": 0.9612233006954193, | |
| "num_tokens": 2540205630.0, | |
| "step": 23950 | |
| }, | |
| { | |
| "entropy": 1.35015625, | |
| "epoch": 0.5876303804906714, | |
| "grad_norm": 2.125, | |
| "learning_rate": 5.076699210129059e-06, | |
| "loss": 0.1741, | |
| "mean_token_accuracy": 0.9633960282802582, | |
| "num_tokens": 2545114709.0, | |
| "step": 24000 | |
| }, | |
| { | |
| "entropy": 1.346875, | |
| "epoch": 0.588854610450027, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 5.072312416549619e-06, | |
| "loss": 0.171, | |
| "mean_token_accuracy": 0.9637422835826874, | |
| "num_tokens": 2550645548.0, | |
| "step": 24050 | |
| }, | |
| { | |
| "entropy": 1.35140625, | |
| "epoch": 0.5900788404093825, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 5.067917131505928e-06, | |
| "loss": 0.186, | |
| "mean_token_accuracy": 0.9609566831588745, | |
| "num_tokens": 2556096356.0, | |
| "step": 24100 | |
| }, | |
| { | |
| "entropy": 1.34828125, | |
| "epoch": 0.5913030703687381, | |
| "grad_norm": 2.375, | |
| "learning_rate": 5.063513373008014e-06, | |
| "loss": 0.1874, | |
| "mean_token_accuracy": 0.9602975726127625, | |
| "num_tokens": 2561716691.0, | |
| "step": 24150 | |
| }, | |
| { | |
| "entropy": 1.36828125, | |
| "epoch": 0.5925273003280936, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 5.059101159100625e-06, | |
| "loss": 0.1911, | |
| "mean_token_accuracy": 0.9601788830757141, | |
| "num_tokens": 2566995725.0, | |
| "step": 24200 | |
| }, | |
| { | |
| "entropy": 1.36234375, | |
| "epoch": 0.5937515302874492, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 5.054680507863158e-06, | |
| "loss": 0.196, | |
| "mean_token_accuracy": 0.9593268644809723, | |
| "num_tokens": 2572823278.0, | |
| "step": 24250 | |
| }, | |
| { | |
| "entropy": 1.36125, | |
| "epoch": 0.5949757602468048, | |
| "grad_norm": 2.375, | |
| "learning_rate": 5.050251437409581e-06, | |
| "loss": 0.1746, | |
| "mean_token_accuracy": 0.9630362141132355, | |
| "num_tokens": 2577835467.0, | |
| "step": 24300 | |
| }, | |
| { | |
| "entropy": 1.365625, | |
| "epoch": 0.5961999902061603, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 5.045813965888362e-06, | |
| "loss": 0.184, | |
| "mean_token_accuracy": 0.9621260786056518, | |
| "num_tokens": 2582930120.0, | |
| "step": 24350 | |
| }, | |
| { | |
| "entropy": 1.355625, | |
| "epoch": 0.5974242201655159, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 5.04136811148239e-06, | |
| "loss": 0.1697, | |
| "mean_token_accuracy": 0.963900375366211, | |
| "num_tokens": 2587853502.0, | |
| "step": 24400 | |
| }, | |
| { | |
| "entropy": 1.36140625, | |
| "epoch": 0.5986484501248714, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 5.036913892408908e-06, | |
| "loss": 0.1837, | |
| "mean_token_accuracy": 0.9621051216125488, | |
| "num_tokens": 2593227737.0, | |
| "step": 24450 | |
| }, | |
| { | |
| "entropy": 1.3525, | |
| "epoch": 0.599872680084227, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 5.032451326919429e-06, | |
| "loss": 0.1799, | |
| "mean_token_accuracy": 0.962098822593689, | |
| "num_tokens": 2598591436.0, | |
| "step": 24500 | |
| }, | |
| { | |
| "entropy": 1.34015625, | |
| "epoch": 0.6010969100435826, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 5.027980433299671e-06, | |
| "loss": 0.1758, | |
| "mean_token_accuracy": 0.9619297671318054, | |
| "num_tokens": 2604000565.0, | |
| "step": 24550 | |
| }, | |
| { | |
| "entropy": 1.3484375, | |
| "epoch": 0.6023211400029381, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 5.023501229869474e-06, | |
| "loss": 0.1737, | |
| "mean_token_accuracy": 0.9643021488189697, | |
| "num_tokens": 2608991683.0, | |
| "step": 24600 | |
| }, | |
| { | |
| "entropy": 1.33015625, | |
| "epoch": 0.6035453699622937, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 5.0190137349827266e-06, | |
| "loss": 0.1665, | |
| "mean_token_accuracy": 0.9643359172344208, | |
| "num_tokens": 2614123184.0, | |
| "step": 24650 | |
| }, | |
| { | |
| "entropy": 1.344375, | |
| "epoch": 0.6047695999216492, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 5.014517967027297e-06, | |
| "loss": 0.1805, | |
| "mean_token_accuracy": 0.962350081205368, | |
| "num_tokens": 2619309044.0, | |
| "step": 24700 | |
| }, | |
| { | |
| "entropy": 1.3540625, | |
| "epoch": 0.6059938298810048, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 5.01001394442495e-06, | |
| "loss": 0.1776, | |
| "mean_token_accuracy": 0.9621638679504394, | |
| "num_tokens": 2624919047.0, | |
| "step": 24750 | |
| }, | |
| { | |
| "entropy": 1.34859375, | |
| "epoch": 0.6072180598403604, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 5.005501685631273e-06, | |
| "loss": 0.1733, | |
| "mean_token_accuracy": 0.9635497546195984, | |
| "num_tokens": 2630407723.0, | |
| "step": 24800 | |
| }, | |
| { | |
| "entropy": 1.3534375, | |
| "epoch": 0.6084422897997159, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 5.000981209135607e-06, | |
| "loss": 0.1781, | |
| "mean_token_accuracy": 0.9629986727237702, | |
| "num_tokens": 2635671685.0, | |
| "step": 24850 | |
| }, | |
| { | |
| "entropy": 1.3459375, | |
| "epoch": 0.6096665197590715, | |
| "grad_norm": 3.71875, | |
| "learning_rate": 4.9964525334609604e-06, | |
| "loss": 0.174, | |
| "mean_token_accuracy": 0.9627162063121796, | |
| "num_tokens": 2641068693.0, | |
| "step": 24900 | |
| }, | |
| { | |
| "entropy": 1.35453125, | |
| "epoch": 0.6108907497184272, | |
| "grad_norm": 2.75, | |
| "learning_rate": 4.99191567716394e-06, | |
| "loss": 0.1796, | |
| "mean_token_accuracy": 0.9617865860462189, | |
| "num_tokens": 2646610014.0, | |
| "step": 24950 | |
| }, | |
| { | |
| "entropy": 1.37453125, | |
| "epoch": 0.6121149796777827, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 4.987370658834675e-06, | |
| "loss": 0.1833, | |
| "mean_token_accuracy": 0.9610668885707855, | |
| "num_tokens": 2651951764.0, | |
| "step": 25000 | |
| }, | |
| { | |
| "entropy": 1.40046875, | |
| "epoch": 0.6133392096371383, | |
| "grad_norm": 3.828125, | |
| "learning_rate": 4.982817497096737e-06, | |
| "loss": 0.1758, | |
| "mean_token_accuracy": 0.9631572890281678, | |
| "num_tokens": 2657065776.0, | |
| "step": 25050 | |
| }, | |
| { | |
| "entropy": 1.38859375, | |
| "epoch": 0.6145634395964938, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 4.978256210607068e-06, | |
| "loss": 0.1738, | |
| "mean_token_accuracy": 0.9639844071865081, | |
| "num_tokens": 2662222291.0, | |
| "step": 25100 | |
| }, | |
| { | |
| "entropy": 1.3496875, | |
| "epoch": 0.6157876695558494, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 4.973686818055901e-06, | |
| "loss": 0.1684, | |
| "mean_token_accuracy": 0.9642084753513336, | |
| "num_tokens": 2667209443.0, | |
| "step": 25150 | |
| }, | |
| { | |
| "entropy": 1.36375, | |
| "epoch": 0.617011899515205, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 4.969109338166683e-06, | |
| "loss": 0.1719, | |
| "mean_token_accuracy": 0.9646093189716339, | |
| "num_tokens": 2672346139.0, | |
| "step": 25200 | |
| }, | |
| { | |
| "entropy": 1.38625, | |
| "epoch": 0.6182361294745605, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 4.964523789695999e-06, | |
| "loss": 0.1855, | |
| "mean_token_accuracy": 0.9612112033367157, | |
| "num_tokens": 2677709139.0, | |
| "step": 25250 | |
| }, | |
| { | |
| "entropy": 1.38171875, | |
| "epoch": 0.6194603594339161, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 4.959930191433498e-06, | |
| "loss": 0.1832, | |
| "mean_token_accuracy": 0.9613463747501373, | |
| "num_tokens": 2682889432.0, | |
| "step": 25300 | |
| }, | |
| { | |
| "entropy": 1.39375, | |
| "epoch": 0.6206845893932716, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 4.955328562201814e-06, | |
| "loss": 0.1953, | |
| "mean_token_accuracy": 0.959397931098938, | |
| "num_tokens": 2688531671.0, | |
| "step": 25350 | |
| }, | |
| { | |
| "entropy": 1.396875, | |
| "epoch": 0.6219088193526272, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 4.950718920856486e-06, | |
| "loss": 0.1882, | |
| "mean_token_accuracy": 0.9605313742160797, | |
| "num_tokens": 2693586026.0, | |
| "step": 25400 | |
| }, | |
| { | |
| "entropy": 1.38203125, | |
| "epoch": 0.6231330493119828, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 4.946101286285884e-06, | |
| "loss": 0.1708, | |
| "mean_token_accuracy": 0.9638578796386719, | |
| "num_tokens": 2698728829.0, | |
| "step": 25450 | |
| }, | |
| { | |
| "entropy": 1.3803125, | |
| "epoch": 0.6243572792713383, | |
| "grad_norm": 3.53125, | |
| "learning_rate": 4.9414756774111335e-06, | |
| "loss": 0.167, | |
| "mean_token_accuracy": 0.9648666107654571, | |
| "num_tokens": 2703894118.0, | |
| "step": 25500 | |
| }, | |
| { | |
| "entropy": 1.4071875, | |
| "epoch": 0.6255815092306939, | |
| "grad_norm": 3.46875, | |
| "learning_rate": 4.93684211318603e-06, | |
| "loss": 0.1782, | |
| "mean_token_accuracy": 0.962544618844986, | |
| "num_tokens": 2709087928.0, | |
| "step": 25550 | |
| }, | |
| { | |
| "entropy": 1.40078125, | |
| "epoch": 0.6268057391900494, | |
| "grad_norm": 3.28125, | |
| "learning_rate": 4.932200612596974e-06, | |
| "loss": 0.1757, | |
| "mean_token_accuracy": 0.963033629655838, | |
| "num_tokens": 2714244664.0, | |
| "step": 25600 | |
| }, | |
| { | |
| "entropy": 1.401875, | |
| "epoch": 0.628029969149405, | |
| "grad_norm": 3.859375, | |
| "learning_rate": 4.927551194662878e-06, | |
| "loss": 0.1701, | |
| "mean_token_accuracy": 0.9642516016960144, | |
| "num_tokens": 2719276387.0, | |
| "step": 25650 | |
| }, | |
| { | |
| "entropy": 1.4296875, | |
| "epoch": 0.6292541991087606, | |
| "grad_norm": 2.625, | |
| "learning_rate": 4.922893878435101e-06, | |
| "loss": 0.1877, | |
| "mean_token_accuracy": 0.9612637603282929, | |
| "num_tokens": 2724924886.0, | |
| "step": 25700 | |
| }, | |
| { | |
| "entropy": 1.40390625, | |
| "epoch": 0.6304784290681161, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 4.918228682997367e-06, | |
| "loss": 0.1751, | |
| "mean_token_accuracy": 0.9626137948036194, | |
| "num_tokens": 2730190384.0, | |
| "step": 25750 | |
| }, | |
| { | |
| "entropy": 1.4384375, | |
| "epoch": 0.6317026590274717, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 4.9135556274656825e-06, | |
| "loss": 0.1921, | |
| "mean_token_accuracy": 0.9599238002300262, | |
| "num_tokens": 2735642568.0, | |
| "step": 25800 | |
| }, | |
| { | |
| "entropy": 1.43296875, | |
| "epoch": 0.6329268889868273, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 4.908874730988262e-06, | |
| "loss": 0.1859, | |
| "mean_token_accuracy": 0.9601176917552948, | |
| "num_tokens": 2741009627.0, | |
| "step": 25850 | |
| }, | |
| { | |
| "entropy": 1.42296875, | |
| "epoch": 0.6341511189461828, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 4.904186012745451e-06, | |
| "loss": 0.1836, | |
| "mean_token_accuracy": 0.9604202997684479, | |
| "num_tokens": 2746576865.0, | |
| "step": 25900 | |
| }, | |
| { | |
| "entropy": 1.42078125, | |
| "epoch": 0.6353753489055384, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 4.899489491949643e-06, | |
| "loss": 0.1678, | |
| "mean_token_accuracy": 0.9639356219768525, | |
| "num_tokens": 2751636571.0, | |
| "step": 25950 | |
| }, | |
| { | |
| "entropy": 1.43125, | |
| "epoch": 0.6365995788648939, | |
| "grad_norm": 3.328125, | |
| "learning_rate": 4.894785187845203e-06, | |
| "loss": 0.1763, | |
| "mean_token_accuracy": 0.9626227140426635, | |
| "num_tokens": 2756749043.0, | |
| "step": 26000 | |
| }, | |
| { | |
| "entropy": 1.41953125, | |
| "epoch": 0.6378238088242495, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 4.890073119708392e-06, | |
| "loss": 0.1716, | |
| "mean_token_accuracy": 0.9636380136013031, | |
| "num_tokens": 2761887971.0, | |
| "step": 26050 | |
| }, | |
| { | |
| "entropy": 1.42109375, | |
| "epoch": 0.6390480387836052, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 4.88535330684728e-06, | |
| "loss": 0.1754, | |
| "mean_token_accuracy": 0.9623912250995637, | |
| "num_tokens": 2767051370.0, | |
| "step": 26100 | |
| }, | |
| { | |
| "entropy": 1.4259375, | |
| "epoch": 0.6402722687429607, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 4.880625768601674e-06, | |
| "loss": 0.1781, | |
| "mean_token_accuracy": 0.9622378349304199, | |
| "num_tokens": 2772481902.0, | |
| "step": 26150 | |
| }, | |
| { | |
| "entropy": 1.4315625, | |
| "epoch": 0.6414964987023163, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 4.87589052434304e-06, | |
| "loss": 0.1874, | |
| "mean_token_accuracy": 0.9602720224857331, | |
| "num_tokens": 2777927527.0, | |
| "step": 26200 | |
| }, | |
| { | |
| "entropy": 1.4140625, | |
| "epoch": 0.6427207286616718, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 4.871147593474412e-06, | |
| "loss": 0.184, | |
| "mean_token_accuracy": 0.9599432504177093, | |
| "num_tokens": 2783446389.0, | |
| "step": 26250 | |
| }, | |
| { | |
| "entropy": 1.4053125, | |
| "epoch": 0.6439449586210274, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 4.866396995430328e-06, | |
| "loss": 0.1786, | |
| "mean_token_accuracy": 0.9628067684173583, | |
| "num_tokens": 2788980882.0, | |
| "step": 26300 | |
| }, | |
| { | |
| "entropy": 1.38875, | |
| "epoch": 0.645169188580383, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 4.861638749676737e-06, | |
| "loss": 0.1677, | |
| "mean_token_accuracy": 0.9639978551864624, | |
| "num_tokens": 2793955184.0, | |
| "step": 26350 | |
| }, | |
| { | |
| "entropy": 1.4034375, | |
| "epoch": 0.6463934185397385, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 4.85687287571093e-06, | |
| "loss": 0.1721, | |
| "mean_token_accuracy": 0.9636970722675323, | |
| "num_tokens": 2799185455.0, | |
| "step": 26400 | |
| }, | |
| { | |
| "entropy": 1.40828125, | |
| "epoch": 0.6476176484990941, | |
| "grad_norm": 3.640625, | |
| "learning_rate": 4.852099393061452e-06, | |
| "loss": 0.1818, | |
| "mean_token_accuracy": 0.962208844423294, | |
| "num_tokens": 2804463803.0, | |
| "step": 26450 | |
| }, | |
| { | |
| "entropy": 1.38484375, | |
| "epoch": 0.6488418784584497, | |
| "grad_norm": 1.75, | |
| "learning_rate": 4.847318321288027e-06, | |
| "loss": 0.165, | |
| "mean_token_accuracy": 0.9649109244346619, | |
| "num_tokens": 2809874779.0, | |
| "step": 26500 | |
| }, | |
| { | |
| "entropy": 1.37953125, | |
| "epoch": 0.6500661084178052, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 4.842529679981474e-06, | |
| "loss": 0.1694, | |
| "mean_token_accuracy": 0.9632159042358398, | |
| "num_tokens": 2814714128.0, | |
| "step": 26550 | |
| }, | |
| { | |
| "entropy": 1.39625, | |
| "epoch": 0.6512903383771608, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 4.8377334887636305e-06, | |
| "loss": 0.1697, | |
| "mean_token_accuracy": 0.9637495183944702, | |
| "num_tokens": 2819740494.0, | |
| "step": 26600 | |
| }, | |
| { | |
| "entropy": 1.39109375, | |
| "epoch": 0.6525145683365163, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 4.8329297672872695e-06, | |
| "loss": 0.1816, | |
| "mean_token_accuracy": 0.9610202670097351, | |
| "num_tokens": 2824966205.0, | |
| "step": 26650 | |
| }, | |
| { | |
| "entropy": 1.37796875, | |
| "epoch": 0.6537387982958719, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 4.828118535236023e-06, | |
| "loss": 0.1742, | |
| "mean_token_accuracy": 0.9625972366333008, | |
| "num_tokens": 2830034251.0, | |
| "step": 26700 | |
| }, | |
| { | |
| "entropy": 1.3953125, | |
| "epoch": 0.6549630282552275, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 4.823299812324291e-06, | |
| "loss": 0.1847, | |
| "mean_token_accuracy": 0.9611959600448609, | |
| "num_tokens": 2835494370.0, | |
| "step": 26750 | |
| }, | |
| { | |
| "entropy": 1.38203125, | |
| "epoch": 0.656187258214583, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 4.818473618297175e-06, | |
| "loss": 0.1728, | |
| "mean_token_accuracy": 0.9636625552177429, | |
| "num_tokens": 2840744565.0, | |
| "step": 26800 | |
| }, | |
| { | |
| "entropy": 1.3696875, | |
| "epoch": 0.6574114881739386, | |
| "grad_norm": 3.671875, | |
| "learning_rate": 4.8136399729303875e-06, | |
| "loss": 0.1599, | |
| "mean_token_accuracy": 0.9664247930049896, | |
| "num_tokens": 2845515500.0, | |
| "step": 26850 | |
| }, | |
| { | |
| "entropy": 1.39671875, | |
| "epoch": 0.6586357181332941, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 4.808798896030171e-06, | |
| "loss": 0.182, | |
| "mean_token_accuracy": 0.9610953998565673, | |
| "num_tokens": 2850746030.0, | |
| "step": 26900 | |
| }, | |
| { | |
| "entropy": 1.38609375, | |
| "epoch": 0.6598599480926497, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 4.803950407433224e-06, | |
| "loss": 0.1774, | |
| "mean_token_accuracy": 0.9627044332027436, | |
| "num_tokens": 2856071580.0, | |
| "step": 26950 | |
| }, | |
| { | |
| "entropy": 1.38640625, | |
| "epoch": 0.6610841780520053, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 4.799094527006611e-06, | |
| "loss": 0.1747, | |
| "mean_token_accuracy": 0.9633591079711914, | |
| "num_tokens": 2861236205.0, | |
| "step": 27000 | |
| }, | |
| { | |
| "entropy": 1.38140625, | |
| "epoch": 0.6623084080113608, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 4.794231274647687e-06, | |
| "loss": 0.175, | |
| "mean_token_accuracy": 0.9629326021671295, | |
| "num_tokens": 2866317531.0, | |
| "step": 27050 | |
| }, | |
| { | |
| "entropy": 1.37421875, | |
| "epoch": 0.6635326379707164, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 4.789360670284014e-06, | |
| "loss": 0.178, | |
| "mean_token_accuracy": 0.962060467004776, | |
| "num_tokens": 2871541131.0, | |
| "step": 27100 | |
| }, | |
| { | |
| "entropy": 1.4078125, | |
| "epoch": 0.6647568679300719, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 4.784482733873279e-06, | |
| "loss": 0.1962, | |
| "mean_token_accuracy": 0.959048901796341, | |
| "num_tokens": 2877146197.0, | |
| "step": 27150 | |
| }, | |
| { | |
| "entropy": 1.3890625, | |
| "epoch": 0.6659810978894275, | |
| "grad_norm": 2.125, | |
| "learning_rate": 4.7795974854032114e-06, | |
| "loss": 0.1823, | |
| "mean_token_accuracy": 0.9619522738456726, | |
| "num_tokens": 2882596630.0, | |
| "step": 27200 | |
| }, | |
| { | |
| "entropy": 1.3603125, | |
| "epoch": 0.6672053278487832, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 4.774704944891505e-06, | |
| "loss": 0.175, | |
| "mean_token_accuracy": 0.9625801253318786, | |
| "num_tokens": 2887948438.0, | |
| "step": 27250 | |
| }, | |
| { | |
| "entropy": 1.39546875, | |
| "epoch": 0.6684295578081387, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 4.769805132385734e-06, | |
| "loss": 0.1879, | |
| "mean_token_accuracy": 0.9613603317737579, | |
| "num_tokens": 2893501173.0, | |
| "step": 27300 | |
| }, | |
| { | |
| "entropy": 1.40875, | |
| "epoch": 0.6696537877674943, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 4.764898067963265e-06, | |
| "loss": 0.1873, | |
| "mean_token_accuracy": 0.9604850566387176, | |
| "num_tokens": 2898869944.0, | |
| "step": 27350 | |
| }, | |
| { | |
| "entropy": 1.37859375, | |
| "epoch": 0.6708780177268499, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 4.759983771731184e-06, | |
| "loss": 0.1679, | |
| "mean_token_accuracy": 0.965053141117096, | |
| "num_tokens": 2903596870.0, | |
| "step": 27400 | |
| }, | |
| { | |
| "entropy": 1.37453125, | |
| "epoch": 0.6721022476862054, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 4.75506226382621e-06, | |
| "loss": 0.1862, | |
| "mean_token_accuracy": 0.9613700366020203, | |
| "num_tokens": 2909474929.0, | |
| "step": 27450 | |
| }, | |
| { | |
| "entropy": 1.36875, | |
| "epoch": 0.673326477645561, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 4.750133564414611e-06, | |
| "loss": 0.1667, | |
| "mean_token_accuracy": 0.9644119250774383, | |
| "num_tokens": 2914673564.0, | |
| "step": 27500 | |
| }, | |
| { | |
| "entropy": 1.396875, | |
| "epoch": 0.6745507076049165, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 4.745197693692121e-06, | |
| "loss": 0.1852, | |
| "mean_token_accuracy": 0.9608116745948792, | |
| "num_tokens": 2920176865.0, | |
| "step": 27550 | |
| }, | |
| { | |
| "entropy": 1.41515625, | |
| "epoch": 0.6757749375642721, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 4.740254671883864e-06, | |
| "loss": 0.1912, | |
| "mean_token_accuracy": 0.9596376729011535, | |
| "num_tokens": 2925586459.0, | |
| "step": 27600 | |
| }, | |
| { | |
| "entropy": 1.3996875, | |
| "epoch": 0.6769991675236277, | |
| "grad_norm": 3.65625, | |
| "learning_rate": 4.735304519244263e-06, | |
| "loss": 0.1745, | |
| "mean_token_accuracy": 0.9637066113948822, | |
| "num_tokens": 2930825954.0, | |
| "step": 27650 | |
| }, | |
| { | |
| "entropy": 1.3809375, | |
| "epoch": 0.6782233974829832, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 4.73034725605696e-06, | |
| "loss": 0.1658, | |
| "mean_token_accuracy": 0.9653242897987365, | |
| "num_tokens": 2935862959.0, | |
| "step": 27700 | |
| }, | |
| { | |
| "entropy": 1.38953125, | |
| "epoch": 0.6794476274423388, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 4.725382902634733e-06, | |
| "loss": 0.1681, | |
| "mean_token_accuracy": 0.9643997454643249, | |
| "num_tokens": 2940725166.0, | |
| "step": 27750 | |
| }, | |
| { | |
| "entropy": 1.40421875, | |
| "epoch": 0.6806718574016943, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 4.720411479319414e-06, | |
| "loss": 0.1725, | |
| "mean_token_accuracy": 0.9641519057750702, | |
| "num_tokens": 2946188027.0, | |
| "step": 27800 | |
| }, | |
| { | |
| "entropy": 1.40796875, | |
| "epoch": 0.6818960873610499, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 4.7154330064818045e-06, | |
| "loss": 0.1841, | |
| "mean_token_accuracy": 0.9606011056900025, | |
| "num_tokens": 2951612651.0, | |
| "step": 27850 | |
| }, | |
| { | |
| "entropy": 1.395625, | |
| "epoch": 0.6831203173204055, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 4.710447504521588e-06, | |
| "loss": 0.1647, | |
| "mean_token_accuracy": 0.9641698563098907, | |
| "num_tokens": 2956787623.0, | |
| "step": 27900 | |
| }, | |
| { | |
| "entropy": 1.40359375, | |
| "epoch": 0.684344547279761, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 4.705454993867257e-06, | |
| "loss": 0.1751, | |
| "mean_token_accuracy": 0.9634602963924408, | |
| "num_tokens": 2961925459.0, | |
| "step": 27950 | |
| }, | |
| { | |
| "entropy": 1.3925, | |
| "epoch": 0.6855687772391166, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 4.700455494976019e-06, | |
| "loss": 0.1751, | |
| "mean_token_accuracy": 0.9632600677013398, | |
| "num_tokens": 2967274024.0, | |
| "step": 28000 | |
| }, | |
| { | |
| "entropy": 1.3640625, | |
| "epoch": 0.6867930071984721, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 4.695449028333715e-06, | |
| "loss": 0.1581, | |
| "mean_token_accuracy": 0.965574380159378, | |
| "num_tokens": 2972439136.0, | |
| "step": 28050 | |
| }, | |
| { | |
| "entropy": 1.37203125, | |
| "epoch": 0.6880172371578277, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 4.6904356144547405e-06, | |
| "loss": 0.1833, | |
| "mean_token_accuracy": 0.9605630087852478, | |
| "num_tokens": 2977717715.0, | |
| "step": 28100 | |
| }, | |
| { | |
| "entropy": 1.38703125, | |
| "epoch": 0.6892414671171833, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 4.685415273881955e-06, | |
| "loss": 0.1849, | |
| "mean_token_accuracy": 0.9602934348583222, | |
| "num_tokens": 2983019999.0, | |
| "step": 28150 | |
| }, | |
| { | |
| "entropy": 1.36609375, | |
| "epoch": 0.6904656970765388, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 4.6803880271866e-06, | |
| "loss": 0.1635, | |
| "mean_token_accuracy": 0.9659206521511078, | |
| "num_tokens": 2987974089.0, | |
| "step": 28200 | |
| }, | |
| { | |
| "entropy": 1.38875, | |
| "epoch": 0.6916899270358944, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 4.675353894968219e-06, | |
| "loss": 0.1956, | |
| "mean_token_accuracy": 0.958441025018692, | |
| "num_tokens": 2993587967.0, | |
| "step": 28250 | |
| }, | |
| { | |
| "entropy": 1.3828125, | |
| "epoch": 0.6929141569952499, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 4.670312897854568e-06, | |
| "loss": 0.1822, | |
| "mean_token_accuracy": 0.9611673438549042, | |
| "num_tokens": 2999047067.0, | |
| "step": 28300 | |
| }, | |
| { | |
| "entropy": 1.36875, | |
| "epoch": 0.6941383869546055, | |
| "grad_norm": 2.375, | |
| "learning_rate": 4.665265056501529e-06, | |
| "loss": 0.1743, | |
| "mean_token_accuracy": 0.9631416380405426, | |
| "num_tokens": 3004064576.0, | |
| "step": 28350 | |
| }, | |
| { | |
| "entropy": 1.34109375, | |
| "epoch": 0.6953626169139612, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 4.660210391593035e-06, | |
| "loss": 0.1593, | |
| "mean_token_accuracy": 0.9659523034095764, | |
| "num_tokens": 3009178123.0, | |
| "step": 28400 | |
| }, | |
| { | |
| "entropy": 1.36859375, | |
| "epoch": 0.6965868468733167, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 4.655148923840974e-06, | |
| "loss": 0.1848, | |
| "mean_token_accuracy": 0.9613404250144959, | |
| "num_tokens": 3014406061.0, | |
| "step": 28450 | |
| }, | |
| { | |
| "entropy": 1.36828125, | |
| "epoch": 0.6978110768326723, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 4.6500806739851114e-06, | |
| "loss": 0.1754, | |
| "mean_token_accuracy": 0.9632516479492188, | |
| "num_tokens": 3019405252.0, | |
| "step": 28500 | |
| }, | |
| { | |
| "entropy": 1.36640625, | |
| "epoch": 0.6990353067920279, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 4.645005662793002e-06, | |
| "loss": 0.1765, | |
| "mean_token_accuracy": 0.9634008550643921, | |
| "num_tokens": 3024715395.0, | |
| "step": 28550 | |
| }, | |
| { | |
| "entropy": 1.386875, | |
| "epoch": 0.7002595367513834, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 4.639923911059907e-06, | |
| "loss": 0.1792, | |
| "mean_token_accuracy": 0.9633400416374207, | |
| "num_tokens": 3030214594.0, | |
| "step": 28600 | |
| }, | |
| { | |
| "entropy": 1.36390625, | |
| "epoch": 0.701483766710739, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 4.634835439608706e-06, | |
| "loss": 0.1712, | |
| "mean_token_accuracy": 0.9632709419727326, | |
| "num_tokens": 3035472593.0, | |
| "step": 28650 | |
| }, | |
| { | |
| "entropy": 1.34984375, | |
| "epoch": 0.7027079966700945, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 4.629740269289813e-06, | |
| "loss": 0.1634, | |
| "mean_token_accuracy": 0.9657196223735809, | |
| "num_tokens": 3040576077.0, | |
| "step": 28700 | |
| }, | |
| { | |
| "entropy": 1.37296875, | |
| "epoch": 0.7039322266294501, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 4.6246384209810935e-06, | |
| "loss": 0.1857, | |
| "mean_token_accuracy": 0.9612914025783539, | |
| "num_tokens": 3046057341.0, | |
| "step": 28750 | |
| }, | |
| { | |
| "entropy": 1.35765625, | |
| "epoch": 0.7051564565888057, | |
| "grad_norm": 3.5, | |
| "learning_rate": 4.6195299155877746e-06, | |
| "loss": 0.1752, | |
| "mean_token_accuracy": 0.9628597724437714, | |
| "num_tokens": 3051406159.0, | |
| "step": 28800 | |
| }, | |
| { | |
| "entropy": 1.34625, | |
| "epoch": 0.7063806865481612, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 4.61441477404236e-06, | |
| "loss": 0.1736, | |
| "mean_token_accuracy": 0.963384006023407, | |
| "num_tokens": 3056663844.0, | |
| "step": 28850 | |
| }, | |
| { | |
| "entropy": 1.35421875, | |
| "epoch": 0.7076049165075168, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 4.60929301730455e-06, | |
| "loss": 0.1857, | |
| "mean_token_accuracy": 0.9611174511909485, | |
| "num_tokens": 3062180594.0, | |
| "step": 28900 | |
| }, | |
| { | |
| "entropy": 1.3396875, | |
| "epoch": 0.7088291464668723, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 4.604164666361146e-06, | |
| "loss": 0.1771, | |
| "mean_token_accuracy": 0.9630412280559539, | |
| "num_tokens": 3067629529.0, | |
| "step": 28950 | |
| }, | |
| { | |
| "entropy": 1.3521875, | |
| "epoch": 0.7100533764262279, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 4.599029742225975e-06, | |
| "loss": 0.1854, | |
| "mean_token_accuracy": 0.9603700506687164, | |
| "num_tokens": 3072962675.0, | |
| "step": 29000 | |
| }, | |
| { | |
| "entropy": 1.34265625, | |
| "epoch": 0.7112776063855835, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 4.593888265939793e-06, | |
| "loss": 0.1668, | |
| "mean_token_accuracy": 0.9641862511634827, | |
| "num_tokens": 3078457917.0, | |
| "step": 29050 | |
| }, | |
| { | |
| "entropy": 1.3565625, | |
| "epoch": 0.712501836344939, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 4.5887402585702056e-06, | |
| "loss": 0.1741, | |
| "mean_token_accuracy": 0.9627685403823852, | |
| "num_tokens": 3083722495.0, | |
| "step": 29100 | |
| }, | |
| { | |
| "entropy": 1.3690625, | |
| "epoch": 0.7137260663042946, | |
| "grad_norm": 2.0, | |
| "learning_rate": 4.583585741211583e-06, | |
| "loss": 0.1782, | |
| "mean_token_accuracy": 0.9620171189308167, | |
| "num_tokens": 3089097439.0, | |
| "step": 29150 | |
| }, | |
| { | |
| "entropy": 1.3615625, | |
| "epoch": 0.7149502962636501, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 4.5784247349849666e-06, | |
| "loss": 0.183, | |
| "mean_token_accuracy": 0.9622057628631592, | |
| "num_tokens": 3094373355.0, | |
| "step": 29200 | |
| }, | |
| { | |
| "entropy": 1.3421875, | |
| "epoch": 0.7161745262230057, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 4.57325726103799e-06, | |
| "loss": 0.1771, | |
| "mean_token_accuracy": 0.9627100145816803, | |
| "num_tokens": 3099619006.0, | |
| "step": 29250 | |
| }, | |
| { | |
| "entropy": 1.33015625, | |
| "epoch": 0.7173987561823613, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 4.568083340544785e-06, | |
| "loss": 0.1738, | |
| "mean_token_accuracy": 0.9631901240348816, | |
| "num_tokens": 3104769496.0, | |
| "step": 29300 | |
| }, | |
| { | |
| "entropy": 1.32921875, | |
| "epoch": 0.7186229861417168, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 4.562902994705902e-06, | |
| "loss": 0.1689, | |
| "mean_token_accuracy": 0.9646138906478882, | |
| "num_tokens": 3110079410.0, | |
| "step": 29350 | |
| }, | |
| { | |
| "entropy": 1.3515625, | |
| "epoch": 0.7198472161010724, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 4.557716244748217e-06, | |
| "loss": 0.186, | |
| "mean_token_accuracy": 0.9605904114246369, | |
| "num_tokens": 3115590754.0, | |
| "step": 29400 | |
| }, | |
| { | |
| "entropy": 1.33421875, | |
| "epoch": 0.721071446060428, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 4.55252311192485e-06, | |
| "loss": 0.1727, | |
| "mean_token_accuracy": 0.9634395956993103, | |
| "num_tokens": 3120943769.0, | |
| "step": 29450 | |
| }, | |
| { | |
| "entropy": 1.3384375, | |
| "epoch": 0.7222956760197835, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 4.547323617515073e-06, | |
| "loss": 0.1754, | |
| "mean_token_accuracy": 0.9623040866851806, | |
| "num_tokens": 3126534469.0, | |
| "step": 29500 | |
| }, | |
| { | |
| "entropy": 1.306875, | |
| "epoch": 0.7235199059791391, | |
| "grad_norm": 3.5, | |
| "learning_rate": 4.542117782824228e-06, | |
| "loss": 0.1649, | |
| "mean_token_accuracy": 0.9650185751914978, | |
| "num_tokens": 3131829007.0, | |
| "step": 29550 | |
| }, | |
| { | |
| "entropy": 1.31984375, | |
| "epoch": 0.7247441359384947, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 4.536905629183632e-06, | |
| "loss": 0.1844, | |
| "mean_token_accuracy": 0.9605432045459747, | |
| "num_tokens": 3137395527.0, | |
| "step": 29600 | |
| }, | |
| { | |
| "entropy": 1.3121875, | |
| "epoch": 0.7259683658978503, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 4.5316871779505e-06, | |
| "loss": 0.1663, | |
| "mean_token_accuracy": 0.9653282749652863, | |
| "num_tokens": 3142501686.0, | |
| "step": 29650 | |
| }, | |
| { | |
| "entropy": 1.33921875, | |
| "epoch": 0.7271925958572059, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 4.5264624505078485e-06, | |
| "loss": 0.1796, | |
| "mean_token_accuracy": 0.9623512411117554, | |
| "num_tokens": 3147984109.0, | |
| "step": 29700 | |
| }, | |
| { | |
| "entropy": 1.3259375, | |
| "epoch": 0.7284168258165614, | |
| "grad_norm": 3.671875, | |
| "learning_rate": 4.521231468264411e-06, | |
| "loss": 0.173, | |
| "mean_token_accuracy": 0.9634522151947021, | |
| "num_tokens": 3153428961.0, | |
| "step": 29750 | |
| }, | |
| { | |
| "entropy": 1.339375, | |
| "epoch": 0.729641055775917, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 4.515994252654552e-06, | |
| "loss": 0.1846, | |
| "mean_token_accuracy": 0.9607186770439148, | |
| "num_tokens": 3158828246.0, | |
| "step": 29800 | |
| }, | |
| { | |
| "entropy": 1.29671875, | |
| "epoch": 0.7308652857352725, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 4.510750825138178e-06, | |
| "loss": 0.1608, | |
| "mean_token_accuracy": 0.9657926094532013, | |
| "num_tokens": 3163804439.0, | |
| "step": 29850 | |
| }, | |
| { | |
| "entropy": 1.3315625, | |
| "epoch": 0.7320895156946281, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 4.505501207200649e-06, | |
| "loss": 0.1818, | |
| "mean_token_accuracy": 0.9619475591182709, | |
| "num_tokens": 3169333412.0, | |
| "step": 29900 | |
| }, | |
| { | |
| "entropy": 1.324375, | |
| "epoch": 0.7333137456539837, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 4.500245420352687e-06, | |
| "loss": 0.1733, | |
| "mean_token_accuracy": 0.963250036239624, | |
| "num_tokens": 3174683947.0, | |
| "step": 29950 | |
| }, | |
| { | |
| "entropy": 1.32015625, | |
| "epoch": 0.7345379756133392, | |
| "grad_norm": 3.171875, | |
| "learning_rate": 4.494983486130298e-06, | |
| "loss": 0.1755, | |
| "mean_token_accuracy": 0.9633795261383057, | |
| "num_tokens": 3179817804.0, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.7345379756133392, | |
| "eval_entropy": 1.3244140625, | |
| "eval_loss": 0.1920091211795807, | |
| "eval_mean_token_accuracy": 0.9597868000467619, | |
| "eval_num_tokens": 3179817804.0, | |
| "eval_runtime": 606.2695, | |
| "eval_samples_per_second": 15.927, | |
| "eval_steps_per_second": 0.2, | |
| "step": 30000 | |
| }, | |
| { | |
| "entropy": 1.34265625, | |
| "epoch": 0.7357622055726948, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 4.489715426094674e-06, | |
| "loss": 0.1971, | |
| "mean_token_accuracy": 0.9590841460227967, | |
| "num_tokens": 3185695558.0, | |
| "step": 30050 | |
| }, | |
| { | |
| "entropy": 1.33234375, | |
| "epoch": 0.7369864355320503, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 4.484441261832107e-06, | |
| "loss": 0.1767, | |
| "mean_token_accuracy": 0.9629596638679504, | |
| "num_tokens": 3191177099.0, | |
| "step": 30100 | |
| }, | |
| { | |
| "entropy": 1.3253125, | |
| "epoch": 0.7382106654914059, | |
| "grad_norm": 2.75, | |
| "learning_rate": 4.479161014953903e-06, | |
| "loss": 0.1795, | |
| "mean_token_accuracy": 0.9617591965198516, | |
| "num_tokens": 3196688072.0, | |
| "step": 30150 | |
| }, | |
| { | |
| "entropy": 1.3171875, | |
| "epoch": 0.7394348954507615, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 4.473874707096293e-06, | |
| "loss": 0.185, | |
| "mean_token_accuracy": 0.9615085804462433, | |
| "num_tokens": 3202252950.0, | |
| "step": 30200 | |
| }, | |
| { | |
| "entropy": 1.3203125, | |
| "epoch": 0.740659125410117, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 4.46858235992034e-06, | |
| "loss": 0.1716, | |
| "mean_token_accuracy": 0.9639656889438629, | |
| "num_tokens": 3207720004.0, | |
| "step": 30250 | |
| }, | |
| { | |
| "entropy": 1.33046875, | |
| "epoch": 0.7418833553694726, | |
| "grad_norm": 3.4375, | |
| "learning_rate": 4.463283995111858e-06, | |
| "loss": 0.1909, | |
| "mean_token_accuracy": 0.9597360849380493, | |
| "num_tokens": 3213270190.0, | |
| "step": 30300 | |
| }, | |
| { | |
| "entropy": 1.32171875, | |
| "epoch": 0.7431075853288281, | |
| "grad_norm": 3.671875, | |
| "learning_rate": 4.4579796343813155e-06, | |
| "loss": 0.1746, | |
| "mean_token_accuracy": 0.9631195080280304, | |
| "num_tokens": 3218354333.0, | |
| "step": 30350 | |
| }, | |
| { | |
| "entropy": 1.3359375, | |
| "epoch": 0.7443318152881837, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 4.452669299463749e-06, | |
| "loss": 0.172, | |
| "mean_token_accuracy": 0.963985036611557, | |
| "num_tokens": 3223570126.0, | |
| "step": 30400 | |
| }, | |
| { | |
| "entropy": 1.32640625, | |
| "epoch": 0.7455560452475393, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 4.44735301211868e-06, | |
| "loss": 0.1807, | |
| "mean_token_accuracy": 0.9622200524806976, | |
| "num_tokens": 3228934737.0, | |
| "step": 30450 | |
| }, | |
| { | |
| "entropy": 1.34375, | |
| "epoch": 0.7467802752068948, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 4.442030794130013e-06, | |
| "loss": 0.1719, | |
| "mean_token_accuracy": 0.9641703021526337, | |
| "num_tokens": 3234092609.0, | |
| "step": 30500 | |
| }, | |
| { | |
| "entropy": 1.3525, | |
| "epoch": 0.7480045051662504, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 4.43670266730596e-06, | |
| "loss": 0.1871, | |
| "mean_token_accuracy": 0.9610934937000275, | |
| "num_tokens": 3239470570.0, | |
| "step": 30550 | |
| }, | |
| { | |
| "entropy": 1.35859375, | |
| "epoch": 0.749228735125606, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 4.431368653478943e-06, | |
| "loss": 0.1799, | |
| "mean_token_accuracy": 0.9625358593463897, | |
| "num_tokens": 3245129970.0, | |
| "step": 30600 | |
| }, | |
| { | |
| "entropy": 1.36859375, | |
| "epoch": 0.7504529650849615, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 4.426028774505504e-06, | |
| "loss": 0.1895, | |
| "mean_token_accuracy": 0.9608589220046997, | |
| "num_tokens": 3250417534.0, | |
| "step": 30650 | |
| }, | |
| { | |
| "entropy": 1.37203125, | |
| "epoch": 0.7516771950443171, | |
| "grad_norm": 3.125, | |
| "learning_rate": 4.420683052266223e-06, | |
| "loss": 0.1962, | |
| "mean_token_accuracy": 0.9591640889644623, | |
| "num_tokens": 3256202020.0, | |
| "step": 30700 | |
| }, | |
| { | |
| "entropy": 1.35421875, | |
| "epoch": 0.7529014250036727, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 4.415331508665619e-06, | |
| "loss": 0.1723, | |
| "mean_token_accuracy": 0.9638619077205658, | |
| "num_tokens": 3261559010.0, | |
| "step": 30750 | |
| }, | |
| { | |
| "entropy": 1.36328125, | |
| "epoch": 0.7541256549630283, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 4.409974165632064e-06, | |
| "loss": 0.1819, | |
| "mean_token_accuracy": 0.9618020045757294, | |
| "num_tokens": 3267151095.0, | |
| "step": 30800 | |
| }, | |
| { | |
| "entropy": 1.3546875, | |
| "epoch": 0.7553498849223839, | |
| "grad_norm": 3.484375, | |
| "learning_rate": 4.404611045117696e-06, | |
| "loss": 0.1792, | |
| "mean_token_accuracy": 0.9617926621437073, | |
| "num_tokens": 3272412916.0, | |
| "step": 30850 | |
| }, | |
| { | |
| "entropy": 1.3534375, | |
| "epoch": 0.7565741148817394, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 4.399242169098329e-06, | |
| "loss": 0.1745, | |
| "mean_token_accuracy": 0.9625967741012573, | |
| "num_tokens": 3277577448.0, | |
| "step": 30900 | |
| }, | |
| { | |
| "entropy": 1.35625, | |
| "epoch": 0.757798344841095, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 4.393867559573354e-06, | |
| "loss": 0.1744, | |
| "mean_token_accuracy": 0.9626732635498046, | |
| "num_tokens": 3282706579.0, | |
| "step": 30950 | |
| }, | |
| { | |
| "entropy": 1.36421875, | |
| "epoch": 0.7590225748004505, | |
| "grad_norm": 3.0, | |
| "learning_rate": 4.388487238565661e-06, | |
| "loss": 0.1784, | |
| "mean_token_accuracy": 0.9623777115345001, | |
| "num_tokens": 3287949862.0, | |
| "step": 31000 | |
| }, | |
| { | |
| "entropy": 1.36, | |
| "epoch": 0.7602468047598061, | |
| "grad_norm": 0.0230712890625, | |
| "learning_rate": 4.383101228121541e-06, | |
| "loss": 0.1788, | |
| "mean_token_accuracy": 0.9617887794971466, | |
| "num_tokens": 3293406088.0, | |
| "step": 31050 | |
| }, | |
| { | |
| "entropy": 1.35609375, | |
| "epoch": 0.7614710347191617, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 4.377709550310598e-06, | |
| "loss": 0.1699, | |
| "mean_token_accuracy": 0.9636480760574341, | |
| "num_tokens": 3298608896.0, | |
| "step": 31100 | |
| }, | |
| { | |
| "entropy": 1.35375, | |
| "epoch": 0.7626952646785172, | |
| "grad_norm": 3.65625, | |
| "learning_rate": 4.37231222722566e-06, | |
| "loss": 0.1643, | |
| "mean_token_accuracy": 0.9644955229759217, | |
| "num_tokens": 3303290550.0, | |
| "step": 31150 | |
| }, | |
| { | |
| "entropy": 1.37390625, | |
| "epoch": 0.7639194946378728, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 4.366909280982685e-06, | |
| "loss": 0.1766, | |
| "mean_token_accuracy": 0.9628056597709655, | |
| "num_tokens": 3308295645.0, | |
| "step": 31200 | |
| }, | |
| { | |
| "entropy": 1.36515625, | |
| "epoch": 0.7651437245972283, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 4.361500733720674e-06, | |
| "loss": 0.1662, | |
| "mean_token_accuracy": 0.9649233341217041, | |
| "num_tokens": 3313438478.0, | |
| "step": 31250 | |
| }, | |
| { | |
| "entropy": 1.3575, | |
| "epoch": 0.7663679545565839, | |
| "grad_norm": 4.46875, | |
| "learning_rate": 4.356086607601575e-06, | |
| "loss": 0.1749, | |
| "mean_token_accuracy": 0.9627750849723816, | |
| "num_tokens": 3319025887.0, | |
| "step": 31300 | |
| }, | |
| { | |
| "entropy": 1.34359375, | |
| "epoch": 0.7675921845159395, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 4.350666924810203e-06, | |
| "loss": 0.1647, | |
| "mean_token_accuracy": 0.9644002187252044, | |
| "num_tokens": 3323975976.0, | |
| "step": 31350 | |
| }, | |
| { | |
| "entropy": 1.35203125, | |
| "epoch": 0.768816414475295, | |
| "grad_norm": 3.765625, | |
| "learning_rate": 4.345241707554134e-06, | |
| "loss": 0.1674, | |
| "mean_token_accuracy": 0.9647248589992523, | |
| "num_tokens": 3329356054.0, | |
| "step": 31400 | |
| }, | |
| { | |
| "entropy": 1.36625, | |
| "epoch": 0.7700406444346506, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 4.339810978063626e-06, | |
| "loss": 0.1776, | |
| "mean_token_accuracy": 0.9627327370643616, | |
| "num_tokens": 3334739313.0, | |
| "step": 31450 | |
| }, | |
| { | |
| "entropy": 1.35125, | |
| "epoch": 0.7712648743940062, | |
| "grad_norm": 1.875, | |
| "learning_rate": 4.334374758591524e-06, | |
| "loss": 0.1896, | |
| "mean_token_accuracy": 0.9596246099472046, | |
| "num_tokens": 3340200973.0, | |
| "step": 31500 | |
| }, | |
| { | |
| "entropy": 1.36171875, | |
| "epoch": 0.7724891043533617, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 4.328933071413168e-06, | |
| "loss": 0.1731, | |
| "mean_token_accuracy": 0.9636253571510315, | |
| "num_tokens": 3345689303.0, | |
| "step": 31550 | |
| }, | |
| { | |
| "entropy": 1.36078125, | |
| "epoch": 0.7737133343127173, | |
| "grad_norm": 3.4375, | |
| "learning_rate": 4.323485938826302e-06, | |
| "loss": 0.1896, | |
| "mean_token_accuracy": 0.9603872370719909, | |
| "num_tokens": 3350984033.0, | |
| "step": 31600 | |
| }, | |
| { | |
| "entropy": 1.3403125, | |
| "epoch": 0.7749375642720728, | |
| "grad_norm": 2.5, | |
| "learning_rate": 4.318033383150981e-06, | |
| "loss": 0.1735, | |
| "mean_token_accuracy": 0.9628359317779541, | |
| "num_tokens": 3356162417.0, | |
| "step": 31650 | |
| }, | |
| { | |
| "entropy": 1.34640625, | |
| "epoch": 0.7761617942314284, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 4.312575426729486e-06, | |
| "loss": 0.1848, | |
| "mean_token_accuracy": 0.9605207931995392, | |
| "num_tokens": 3361647453.0, | |
| "step": 31700 | |
| }, | |
| { | |
| "entropy": 1.33171875, | |
| "epoch": 0.777386024190784, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 4.307112091926226e-06, | |
| "loss": 0.1637, | |
| "mean_token_accuracy": 0.965142446756363, | |
| "num_tokens": 3366481444.0, | |
| "step": 31750 | |
| }, | |
| { | |
| "entropy": 1.37390625, | |
| "epoch": 0.7786102541501395, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 4.301643401127647e-06, | |
| "loss": 0.1778, | |
| "mean_token_accuracy": 0.9628903007507325, | |
| "num_tokens": 3371649682.0, | |
| "step": 31800 | |
| }, | |
| { | |
| "entropy": 1.3721875, | |
| "epoch": 0.7798344841094951, | |
| "grad_norm": 2.625, | |
| "learning_rate": 4.2961693767421435e-06, | |
| "loss": 0.1645, | |
| "mean_token_accuracy": 0.9658307003974914, | |
| "num_tokens": 3376382887.0, | |
| "step": 31850 | |
| }, | |
| { | |
| "entropy": 1.358125, | |
| "epoch": 0.7810587140688507, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 4.290690041199963e-06, | |
| "loss": 0.179, | |
| "mean_token_accuracy": 0.9622143077850341, | |
| "num_tokens": 3381791030.0, | |
| "step": 31900 | |
| }, | |
| { | |
| "entropy": 1.37015625, | |
| "epoch": 0.7822829440282063, | |
| "grad_norm": 2.125, | |
| "learning_rate": 4.285205416953118e-06, | |
| "loss": 0.1876, | |
| "mean_token_accuracy": 0.9609373700618744, | |
| "num_tokens": 3387334981.0, | |
| "step": 31950 | |
| }, | |
| { | |
| "entropy": 1.34765625, | |
| "epoch": 0.7835071739875619, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 4.279715526475289e-06, | |
| "loss": 0.1762, | |
| "mean_token_accuracy": 0.962603681087494, | |
| "num_tokens": 3392713314.0, | |
| "step": 32000 | |
| }, | |
| { | |
| "entropy": 1.3678125, | |
| "epoch": 0.7847314039469174, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 4.274220392261738e-06, | |
| "loss": 0.1887, | |
| "mean_token_accuracy": 0.9606349515914917, | |
| "num_tokens": 3398537796.0, | |
| "step": 32050 | |
| }, | |
| { | |
| "entropy": 1.33734375, | |
| "epoch": 0.785955633906273, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 4.268720036829214e-06, | |
| "loss": 0.1748, | |
| "mean_token_accuracy": 0.964071912765503, | |
| "num_tokens": 3403920236.0, | |
| "step": 32100 | |
| }, | |
| { | |
| "entropy": 1.37, | |
| "epoch": 0.7871798638656285, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 4.263214482715857e-06, | |
| "loss": 0.1654, | |
| "mean_token_accuracy": 0.9644496822357178, | |
| "num_tokens": 3409108918.0, | |
| "step": 32150 | |
| }, | |
| { | |
| "entropy": 1.35046875, | |
| "epoch": 0.7884040938249841, | |
| "grad_norm": 3.125, | |
| "learning_rate": 4.2577037524811104e-06, | |
| "loss": 0.1714, | |
| "mean_token_accuracy": 0.9636311101913452, | |
| "num_tokens": 3414387238.0, | |
| "step": 32200 | |
| }, | |
| { | |
| "entropy": 1.34359375, | |
| "epoch": 0.7896283237843397, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 4.25218786870563e-06, | |
| "loss": 0.1552, | |
| "mean_token_accuracy": 0.965884006023407, | |
| "num_tokens": 3419148471.0, | |
| "step": 32250 | |
| }, | |
| { | |
| "entropy": 1.34875, | |
| "epoch": 0.7908525537436952, | |
| "grad_norm": 0.004241943359375, | |
| "learning_rate": 4.246666853991186e-06, | |
| "loss": 0.1676, | |
| "mean_token_accuracy": 0.9639466750621796, | |
| "num_tokens": 3424295496.0, | |
| "step": 32300 | |
| }, | |
| { | |
| "entropy": 1.364375, | |
| "epoch": 0.7920767837030508, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 4.241140730960573e-06, | |
| "loss": 0.1829, | |
| "mean_token_accuracy": 0.9615444934368134, | |
| "num_tokens": 3429846223.0, | |
| "step": 32350 | |
| }, | |
| { | |
| "entropy": 1.33828125, | |
| "epoch": 0.7933010136624064, | |
| "grad_norm": 3.53125, | |
| "learning_rate": 4.235609522257517e-06, | |
| "loss": 0.178, | |
| "mean_token_accuracy": 0.9621382772922515, | |
| "num_tokens": 3434814232.0, | |
| "step": 32400 | |
| }, | |
| { | |
| "entropy": 1.37265625, | |
| "epoch": 0.7945252436217619, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 4.230073250546585e-06, | |
| "loss": 0.1854, | |
| "mean_token_accuracy": 0.9616455745697021, | |
| "num_tokens": 3440013747.0, | |
| "step": 32450 | |
| }, | |
| { | |
| "entropy": 1.33484375, | |
| "epoch": 0.7957494735811175, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 4.224531938513088e-06, | |
| "loss": 0.175, | |
| "mean_token_accuracy": 0.9632323062419892, | |
| "num_tokens": 3445299571.0, | |
| "step": 32500 | |
| }, | |
| { | |
| "entropy": 1.34203125, | |
| "epoch": 0.796973703540473, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 4.218985608862992e-06, | |
| "loss": 0.1814, | |
| "mean_token_accuracy": 0.9623367011547088, | |
| "num_tokens": 3450664579.0, | |
| "step": 32550 | |
| }, | |
| { | |
| "entropy": 1.3540625, | |
| "epoch": 0.7981979334998286, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 4.213434284322819e-06, | |
| "loss": 0.1729, | |
| "mean_token_accuracy": 0.9627703261375428, | |
| "num_tokens": 3455979121.0, | |
| "step": 32600 | |
| }, | |
| { | |
| "entropy": 1.33734375, | |
| "epoch": 0.7994221634591842, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 4.207877987639566e-06, | |
| "loss": 0.1764, | |
| "mean_token_accuracy": 0.9627932643890381, | |
| "num_tokens": 3461283678.0, | |
| "step": 32650 | |
| }, | |
| { | |
| "entropy": 1.3596875, | |
| "epoch": 0.8006463934185397, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 4.202316741580594e-06, | |
| "loss": 0.1854, | |
| "mean_token_accuracy": 0.9612032771110535, | |
| "num_tokens": 3467126201.0, | |
| "step": 32700 | |
| }, | |
| { | |
| "entropy": 1.344375, | |
| "epoch": 0.8018706233778953, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 4.196750568933551e-06, | |
| "loss": 0.1721, | |
| "mean_token_accuracy": 0.9638476753234864, | |
| "num_tokens": 3472599559.0, | |
| "step": 32750 | |
| }, | |
| { | |
| "entropy": 1.3415625, | |
| "epoch": 0.8030948533372508, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 4.191179492506271e-06, | |
| "loss": 0.1754, | |
| "mean_token_accuracy": 0.9628195893764496, | |
| "num_tokens": 3477994415.0, | |
| "step": 32800 | |
| }, | |
| { | |
| "entropy": 1.34953125, | |
| "epoch": 0.8043190832966064, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 4.18560353512668e-06, | |
| "loss": 0.1778, | |
| "mean_token_accuracy": 0.9618653762340545, | |
| "num_tokens": 3483437386.0, | |
| "step": 32850 | |
| }, | |
| { | |
| "entropy": 1.34390625, | |
| "epoch": 0.805543313255962, | |
| "grad_norm": 2.875, | |
| "learning_rate": 4.1800227196427055e-06, | |
| "loss": 0.1751, | |
| "mean_token_accuracy": 0.9623115694522858, | |
| "num_tokens": 3488795577.0, | |
| "step": 32900 | |
| }, | |
| { | |
| "entropy": 1.32609375, | |
| "epoch": 0.8067675432153175, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 4.17443706892218e-06, | |
| "loss": 0.1766, | |
| "mean_token_accuracy": 0.9626455020904541, | |
| "num_tokens": 3494139245.0, | |
| "step": 32950 | |
| }, | |
| { | |
| "entropy": 1.34953125, | |
| "epoch": 0.8079917731746731, | |
| "grad_norm": 3.640625, | |
| "learning_rate": 4.168846605852751e-06, | |
| "loss": 0.1811, | |
| "mean_token_accuracy": 0.9624789762496948, | |
| "num_tokens": 3499294686.0, | |
| "step": 33000 | |
| }, | |
| { | |
| "entropy": 1.34546875, | |
| "epoch": 0.8092160031340287, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 4.1632513533417825e-06, | |
| "loss": 0.1629, | |
| "mean_token_accuracy": 0.9650925529003144, | |
| "num_tokens": 3504042622.0, | |
| "step": 33050 | |
| }, | |
| { | |
| "entropy": 1.3675, | |
| "epoch": 0.8104402330933843, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 4.157651334316264e-06, | |
| "loss": 0.159, | |
| "mean_token_accuracy": 0.9659399092197418, | |
| "num_tokens": 3509103882.0, | |
| "step": 33100 | |
| }, | |
| { | |
| "entropy": 1.35625, | |
| "epoch": 0.8116644630527399, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 4.1520465717227206e-06, | |
| "loss": 0.1782, | |
| "mean_token_accuracy": 0.9628897225856781, | |
| "num_tokens": 3514150747.0, | |
| "step": 33150 | |
| }, | |
| { | |
| "entropy": 1.3603125, | |
| "epoch": 0.8128886930120954, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 4.146437088527108e-06, | |
| "loss": 0.1811, | |
| "mean_token_accuracy": 0.9617001414299011, | |
| "num_tokens": 3519220750.0, | |
| "step": 33200 | |
| }, | |
| { | |
| "entropy": 1.36859375, | |
| "epoch": 0.814112922971451, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 4.140822907714728e-06, | |
| "loss": 0.1885, | |
| "mean_token_accuracy": 0.9607588303089142, | |
| "num_tokens": 3524668178.0, | |
| "step": 33250 | |
| }, | |
| { | |
| "entropy": 1.35484375, | |
| "epoch": 0.8153371529308066, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 4.135204052290131e-06, | |
| "loss": 0.1645, | |
| "mean_token_accuracy": 0.9654926788806916, | |
| "num_tokens": 3529737924.0, | |
| "step": 33300 | |
| }, | |
| { | |
| "entropy": 1.33109375, | |
| "epoch": 0.8165613828901621, | |
| "grad_norm": 3.0, | |
| "learning_rate": 4.129580545277023e-06, | |
| "loss": 0.1637, | |
| "mean_token_accuracy": 0.9648844826221467, | |
| "num_tokens": 3534673592.0, | |
| "step": 33350 | |
| }, | |
| { | |
| "entropy": 1.33046875, | |
| "epoch": 0.8177856128495177, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 4.123952409718169e-06, | |
| "loss": 0.1705, | |
| "mean_token_accuracy": 0.963813624382019, | |
| "num_tokens": 3539705624.0, | |
| "step": 33400 | |
| }, | |
| { | |
| "entropy": 1.3225, | |
| "epoch": 0.8190098428088732, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 4.118319668675301e-06, | |
| "loss": 0.1607, | |
| "mean_token_accuracy": 0.9656564962863922, | |
| "num_tokens": 3544723634.0, | |
| "step": 33450 | |
| }, | |
| { | |
| "entropy": 1.34328125, | |
| "epoch": 0.8202340727682288, | |
| "grad_norm": 3.625, | |
| "learning_rate": 4.112682345229019e-06, | |
| "loss": 0.1858, | |
| "mean_token_accuracy": 0.9613649821281434, | |
| "num_tokens": 3550196451.0, | |
| "step": 33500 | |
| }, | |
| { | |
| "entropy": 1.34546875, | |
| "epoch": 0.8214583027275844, | |
| "grad_norm": 2.375, | |
| "learning_rate": 4.107040462478706e-06, | |
| "loss": 0.1698, | |
| "mean_token_accuracy": 0.9640332353115082, | |
| "num_tokens": 3555769583.0, | |
| "step": 33550 | |
| }, | |
| { | |
| "entropy": 1.35515625, | |
| "epoch": 0.8226825326869399, | |
| "grad_norm": 4.6875, | |
| "learning_rate": 4.101394043542421e-06, | |
| "loss": 0.1781, | |
| "mean_token_accuracy": 0.9626898431777954, | |
| "num_tokens": 3560775725.0, | |
| "step": 33600 | |
| }, | |
| { | |
| "entropy": 1.37046875, | |
| "epoch": 0.8239067626462955, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 4.095743111556813e-06, | |
| "loss": 0.1822, | |
| "mean_token_accuracy": 0.9615408968925476, | |
| "num_tokens": 3566233997.0, | |
| "step": 33650 | |
| }, | |
| { | |
| "entropy": 1.3565625, | |
| "epoch": 0.825130992605651, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 4.090087689677025e-06, | |
| "loss": 0.1798, | |
| "mean_token_accuracy": 0.9622524130344391, | |
| "num_tokens": 3571629994.0, | |
| "step": 33700 | |
| }, | |
| { | |
| "entropy": 1.35453125, | |
| "epoch": 0.8263552225650066, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 4.084427801076592e-06, | |
| "loss": 0.1631, | |
| "mean_token_accuracy": 0.965935331583023, | |
| "num_tokens": 3576662114.0, | |
| "step": 33750 | |
| }, | |
| { | |
| "entropy": 1.36453125, | |
| "epoch": 0.8275794525243622, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 4.0787634689473605e-06, | |
| "loss": 0.1704, | |
| "mean_token_accuracy": 0.9641584491729737, | |
| "num_tokens": 3581699530.0, | |
| "step": 33800 | |
| }, | |
| { | |
| "entropy": 1.33421875, | |
| "epoch": 0.8288036824837177, | |
| "grad_norm": 3.71875, | |
| "learning_rate": 4.0730947164993775e-06, | |
| "loss": 0.1746, | |
| "mean_token_accuracy": 0.9626482093334198, | |
| "num_tokens": 3586891414.0, | |
| "step": 33850 | |
| }, | |
| { | |
| "entropy": 1.34828125, | |
| "epoch": 0.8300279124430733, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 4.067421566960805e-06, | |
| "loss": 0.173, | |
| "mean_token_accuracy": 0.9637481319904327, | |
| "num_tokens": 3591845863.0, | |
| "step": 33900 | |
| }, | |
| { | |
| "entropy": 1.32796875, | |
| "epoch": 0.8312521424024288, | |
| "grad_norm": 4.3125, | |
| "learning_rate": 4.061744043577822e-06, | |
| "loss": 0.1826, | |
| "mean_token_accuracy": 0.960258857011795, | |
| "num_tokens": 3597325814.0, | |
| "step": 33950 | |
| }, | |
| { | |
| "entropy": 1.343125, | |
| "epoch": 0.8324763723617844, | |
| "grad_norm": 3.65625, | |
| "learning_rate": 4.056062169614533e-06, | |
| "loss": 0.1788, | |
| "mean_token_accuracy": 0.9624998271465302, | |
| "num_tokens": 3602589177.0, | |
| "step": 34000 | |
| }, | |
| { | |
| "entropy": 1.33171875, | |
| "epoch": 0.83370060232114, | |
| "grad_norm": 5.5, | |
| "learning_rate": 4.050375968352865e-06, | |
| "loss": 0.1749, | |
| "mean_token_accuracy": 0.9635978293418884, | |
| "num_tokens": 3607686315.0, | |
| "step": 34050 | |
| }, | |
| { | |
| "entropy": 1.35046875, | |
| "epoch": 0.8349248322804955, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 4.044685463092477e-06, | |
| "loss": 0.1823, | |
| "mean_token_accuracy": 0.9619014573097229, | |
| "num_tokens": 3613032357.0, | |
| "step": 34100 | |
| }, | |
| { | |
| "entropy": 1.3278125, | |
| "epoch": 0.8361490622398511, | |
| "grad_norm": 3.796875, | |
| "learning_rate": 4.0389906771506666e-06, | |
| "loss": 0.1567, | |
| "mean_token_accuracy": 0.9672730362415314, | |
| "num_tokens": 3617947758.0, | |
| "step": 34150 | |
| }, | |
| { | |
| "entropy": 1.3509375, | |
| "epoch": 0.8373732921992068, | |
| "grad_norm": 3.0, | |
| "learning_rate": 4.03329163386227e-06, | |
| "loss": 0.1821, | |
| "mean_token_accuracy": 0.9615289163589478, | |
| "num_tokens": 3623324648.0, | |
| "step": 34200 | |
| }, | |
| { | |
| "entropy": 1.36625, | |
| "epoch": 0.8385975221585623, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 4.027588356579567e-06, | |
| "loss": 0.1807, | |
| "mean_token_accuracy": 0.962299063205719, | |
| "num_tokens": 3628936189.0, | |
| "step": 34250 | |
| }, | |
| { | |
| "entropy": 1.34484375, | |
| "epoch": 0.8398217521179179, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 4.0218808686721884e-06, | |
| "loss": 0.1766, | |
| "mean_token_accuracy": 0.9632388269901275, | |
| "num_tokens": 3634256824.0, | |
| "step": 34300 | |
| }, | |
| { | |
| "entropy": 1.3365625, | |
| "epoch": 0.8410459820772734, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 4.01616919352702e-06, | |
| "loss": 0.1653, | |
| "mean_token_accuracy": 0.9652460610866547, | |
| "num_tokens": 3639058717.0, | |
| "step": 34350 | |
| }, | |
| { | |
| "entropy": 1.3490625, | |
| "epoch": 0.842270212036629, | |
| "grad_norm": 3.53125, | |
| "learning_rate": 4.010453354548101e-06, | |
| "loss": 0.1587, | |
| "mean_token_accuracy": 0.9665447866916657, | |
| "num_tokens": 3644031006.0, | |
| "step": 34400 | |
| }, | |
| { | |
| "entropy": 1.36546875, | |
| "epoch": 0.8434944419959846, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 4.004733375156534e-06, | |
| "loss": 0.1862, | |
| "mean_token_accuracy": 0.9608346676826477, | |
| "num_tokens": 3649652142.0, | |
| "step": 34450 | |
| }, | |
| { | |
| "entropy": 1.36640625, | |
| "epoch": 0.8447186719553401, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 3.999009278790389e-06, | |
| "loss": 0.1692, | |
| "mean_token_accuracy": 0.9642466914653778, | |
| "num_tokens": 3654831381.0, | |
| "step": 34500 | |
| }, | |
| { | |
| "entropy": 1.35890625, | |
| "epoch": 0.8459429019146957, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 3.993281088904603e-06, | |
| "loss": 0.1659, | |
| "mean_token_accuracy": 0.9651599872112274, | |
| "num_tokens": 3659811312.0, | |
| "step": 34550 | |
| }, | |
| { | |
| "entropy": 1.36734375, | |
| "epoch": 0.8471671318740512, | |
| "grad_norm": 3.578125, | |
| "learning_rate": 3.9875488289708895e-06, | |
| "loss": 0.1693, | |
| "mean_token_accuracy": 0.9640548026561737, | |
| "num_tokens": 3665088140.0, | |
| "step": 34600 | |
| }, | |
| { | |
| "entropy": 1.35578125, | |
| "epoch": 0.8483913618334068, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 3.981812522477634e-06, | |
| "loss": 0.1683, | |
| "mean_token_accuracy": 0.9642880761623382, | |
| "num_tokens": 3670199765.0, | |
| "step": 34650 | |
| }, | |
| { | |
| "entropy": 1.371875, | |
| "epoch": 0.8496155917927624, | |
| "grad_norm": 2.125, | |
| "learning_rate": 3.976072192929812e-06, | |
| "loss": 0.1859, | |
| "mean_token_accuracy": 0.961214131116867, | |
| "num_tokens": 3675973370.0, | |
| "step": 34700 | |
| }, | |
| { | |
| "entropy": 1.335, | |
| "epoch": 0.8508398217521179, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 3.970327863848874e-06, | |
| "loss": 0.163, | |
| "mean_token_accuracy": 0.9652379751205444, | |
| "num_tokens": 3680935151.0, | |
| "step": 34750 | |
| }, | |
| { | |
| "entropy": 1.35953125, | |
| "epoch": 0.8520640517114735, | |
| "grad_norm": 3.984375, | |
| "learning_rate": 3.964579558772665e-06, | |
| "loss": 0.1686, | |
| "mean_token_accuracy": 0.9643210101127625, | |
| "num_tokens": 3686151191.0, | |
| "step": 34800 | |
| }, | |
| { | |
| "entropy": 1.35, | |
| "epoch": 0.853288281670829, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 3.95882730125532e-06, | |
| "loss": 0.1755, | |
| "mean_token_accuracy": 0.9624910676479339, | |
| "num_tokens": 3691478654.0, | |
| "step": 34850 | |
| }, | |
| { | |
| "entropy": 1.338125, | |
| "epoch": 0.8545125116301846, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 3.953071114867171e-06, | |
| "loss": 0.1711, | |
| "mean_token_accuracy": 0.9633730280399323, | |
| "num_tokens": 3696633906.0, | |
| "step": 34900 | |
| }, | |
| { | |
| "entropy": 1.34890625, | |
| "epoch": 0.8557367415895402, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 3.947311023194645e-06, | |
| "loss": 0.1804, | |
| "mean_token_accuracy": 0.9618865346908569, | |
| "num_tokens": 3701978753.0, | |
| "step": 34950 | |
| }, | |
| { | |
| "entropy": 1.3384375, | |
| "epoch": 0.8569609715488957, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 3.941547049840176e-06, | |
| "loss": 0.1645, | |
| "mean_token_accuracy": 0.9649497640132904, | |
| "num_tokens": 3706915348.0, | |
| "step": 35000 | |
| }, | |
| { | |
| "entropy": 1.32359375, | |
| "epoch": 0.8581852015082513, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 3.9357792184221005e-06, | |
| "loss": 0.1739, | |
| "mean_token_accuracy": 0.9632923007011414, | |
| "num_tokens": 3712046907.0, | |
| "step": 35050 | |
| }, | |
| { | |
| "entropy": 1.3240625, | |
| "epoch": 0.8594094314676068, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 3.930007552574564e-06, | |
| "loss": 0.1763, | |
| "mean_token_accuracy": 0.9626149117946625, | |
| "num_tokens": 3717274859.0, | |
| "step": 35100 | |
| }, | |
| { | |
| "entropy": 1.33484375, | |
| "epoch": 0.8606336614269624, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 3.924232075947427e-06, | |
| "loss": 0.186, | |
| "mean_token_accuracy": 0.9613423335552216, | |
| "num_tokens": 3722674538.0, | |
| "step": 35150 | |
| }, | |
| { | |
| "entropy": 1.33484375, | |
| "epoch": 0.861857891386318, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 3.918452812206159e-06, | |
| "loss": 0.1777, | |
| "mean_token_accuracy": 0.9628440749645233, | |
| "num_tokens": 3727975730.0, | |
| "step": 35200 | |
| }, | |
| { | |
| "entropy": 1.34125, | |
| "epoch": 0.8630821213456735, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 3.9126697850317525e-06, | |
| "loss": 0.1761, | |
| "mean_token_accuracy": 0.963371901512146, | |
| "num_tokens": 3733241093.0, | |
| "step": 35250 | |
| }, | |
| { | |
| "entropy": 1.34328125, | |
| "epoch": 0.8643063513050291, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 3.906883018120619e-06, | |
| "loss": 0.1707, | |
| "mean_token_accuracy": 0.9642481172084808, | |
| "num_tokens": 3738164559.0, | |
| "step": 35300 | |
| }, | |
| { | |
| "entropy": 1.3203125, | |
| "epoch": 0.8655305812643848, | |
| "grad_norm": 3.546875, | |
| "learning_rate": 3.901092535184496e-06, | |
| "loss": 0.1713, | |
| "mean_token_accuracy": 0.9637637650966644, | |
| "num_tokens": 3743459921.0, | |
| "step": 35350 | |
| }, | |
| { | |
| "entropy": 1.35578125, | |
| "epoch": 0.8667548112237403, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 3.895298359950343e-06, | |
| "loss": 0.1829, | |
| "mean_token_accuracy": 0.9605180990695953, | |
| "num_tokens": 3748868327.0, | |
| "step": 35400 | |
| }, | |
| { | |
| "entropy": 1.34265625, | |
| "epoch": 0.8679790411830959, | |
| "grad_norm": 2.125, | |
| "learning_rate": 3.889500516160254e-06, | |
| "loss": 0.1715, | |
| "mean_token_accuracy": 0.9643005490303039, | |
| "num_tokens": 3753748677.0, | |
| "step": 35450 | |
| }, | |
| { | |
| "entropy": 1.3384375, | |
| "epoch": 0.8692032711424514, | |
| "grad_norm": 2.375, | |
| "learning_rate": 3.883699027571352e-06, | |
| "loss": 0.1668, | |
| "mean_token_accuracy": 0.965086680650711, | |
| "num_tokens": 3759201853.0, | |
| "step": 35500 | |
| }, | |
| { | |
| "entropy": 1.34390625, | |
| "epoch": 0.870427501101807, | |
| "grad_norm": 3.25, | |
| "learning_rate": 3.8778939179556976e-06, | |
| "loss": 0.1694, | |
| "mean_token_accuracy": 0.9643353164196015, | |
| "num_tokens": 3764158638.0, | |
| "step": 35550 | |
| }, | |
| { | |
| "entropy": 1.33015625, | |
| "epoch": 0.8716517310611626, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 3.872085211100185e-06, | |
| "loss": 0.1621, | |
| "mean_token_accuracy": 0.9657464909553528, | |
| "num_tokens": 3769226815.0, | |
| "step": 35600 | |
| }, | |
| { | |
| "entropy": 1.35078125, | |
| "epoch": 0.8728759610205181, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 3.86627293080645e-06, | |
| "loss": 0.1836, | |
| "mean_token_accuracy": 0.9611875438690185, | |
| "num_tokens": 3774861819.0, | |
| "step": 35650 | |
| }, | |
| { | |
| "entropy": 1.34953125, | |
| "epoch": 0.8741001909798737, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 3.860457100890776e-06, | |
| "loss": 0.1795, | |
| "mean_token_accuracy": 0.9616686987876892, | |
| "num_tokens": 3780181646.0, | |
| "step": 35700 | |
| }, | |
| { | |
| "entropy": 1.34359375, | |
| "epoch": 0.8753244209392292, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 3.854637745183983e-06, | |
| "loss": 0.1762, | |
| "mean_token_accuracy": 0.9630369508266449, | |
| "num_tokens": 3785489246.0, | |
| "step": 35750 | |
| }, | |
| { | |
| "entropy": 1.3425, | |
| "epoch": 0.8765486508985848, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 3.848814887531342e-06, | |
| "loss": 0.1865, | |
| "mean_token_accuracy": 0.9609660315513611, | |
| "num_tokens": 3790970702.0, | |
| "step": 35800 | |
| }, | |
| { | |
| "entropy": 1.3375, | |
| "epoch": 0.8777728808579404, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 3.842988551792473e-06, | |
| "loss": 0.1666, | |
| "mean_token_accuracy": 0.9646478390693665, | |
| "num_tokens": 3796002667.0, | |
| "step": 35850 | |
| }, | |
| { | |
| "entropy": 1.33828125, | |
| "epoch": 0.8789971108172959, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 3.83715876184125e-06, | |
| "loss": 0.1727, | |
| "mean_token_accuracy": 0.9642738771438598, | |
| "num_tokens": 3801134844.0, | |
| "step": 35900 | |
| }, | |
| { | |
| "entropy": 1.33859375, | |
| "epoch": 0.8802213407766515, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 3.831325541565699e-06, | |
| "loss": 0.1714, | |
| "mean_token_accuracy": 0.9640265047550202, | |
| "num_tokens": 3806453829.0, | |
| "step": 35950 | |
| }, | |
| { | |
| "entropy": 1.34015625, | |
| "epoch": 0.881445570736007, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 3.825488914867901e-06, | |
| "loss": 0.1762, | |
| "mean_token_accuracy": 0.9627239561080932, | |
| "num_tokens": 3811628461.0, | |
| "step": 36000 | |
| }, | |
| { | |
| "entropy": 1.35203125, | |
| "epoch": 0.8826698006953626, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 3.8196489056638965e-06, | |
| "loss": 0.1849, | |
| "mean_token_accuracy": 0.9613272595405579, | |
| "num_tokens": 3816892701.0, | |
| "step": 36050 | |
| }, | |
| { | |
| "entropy": 1.34703125, | |
| "epoch": 0.8838940306547182, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 3.813805537883585e-06, | |
| "loss": 0.1711, | |
| "mean_token_accuracy": 0.9637981843948364, | |
| "num_tokens": 3822028448.0, | |
| "step": 36100 | |
| }, | |
| { | |
| "entropy": 1.34875, | |
| "epoch": 0.8851182606140737, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 3.80795883547063e-06, | |
| "loss": 0.1672, | |
| "mean_token_accuracy": 0.9647044801712036, | |
| "num_tokens": 3827213092.0, | |
| "step": 36150 | |
| }, | |
| { | |
| "entropy": 1.34578125, | |
| "epoch": 0.8863424905734293, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 3.8021088223823558e-06, | |
| "loss": 0.1927, | |
| "mean_token_accuracy": 0.9597675764560699, | |
| "num_tokens": 3832709039.0, | |
| "step": 36200 | |
| }, | |
| { | |
| "entropy": 1.33359375, | |
| "epoch": 0.8875667205327848, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 3.7962555225896563e-06, | |
| "loss": 0.177, | |
| "mean_token_accuracy": 0.9623324680328369, | |
| "num_tokens": 3837879687.0, | |
| "step": 36250 | |
| }, | |
| { | |
| "entropy": 1.33890625, | |
| "epoch": 0.8887909504921404, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 3.790398960076891e-06, | |
| "loss": 0.1769, | |
| "mean_token_accuracy": 0.9629685461521149, | |
| "num_tokens": 3843045671.0, | |
| "step": 36300 | |
| }, | |
| { | |
| "entropy": 1.32703125, | |
| "epoch": 0.890015180451496, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 3.7845391588417876e-06, | |
| "loss": 0.173, | |
| "mean_token_accuracy": 0.9636087584495544, | |
| "num_tokens": 3848206427.0, | |
| "step": 36350 | |
| }, | |
| { | |
| "entropy": 1.32984375, | |
| "epoch": 0.8912394104108515, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 3.778676142895346e-06, | |
| "loss": 0.1734, | |
| "mean_token_accuracy": 0.9632059478759766, | |
| "num_tokens": 3853828427.0, | |
| "step": 36400 | |
| }, | |
| { | |
| "entropy": 1.32390625, | |
| "epoch": 0.8924636403702071, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 3.772809936261739e-06, | |
| "loss": 0.1894, | |
| "mean_token_accuracy": 0.9601573574543, | |
| "num_tokens": 3859273920.0, | |
| "step": 36450 | |
| }, | |
| { | |
| "entropy": 1.3265625, | |
| "epoch": 0.8936878703295627, | |
| "grad_norm": 2.875, | |
| "learning_rate": 3.766940562978211e-06, | |
| "loss": 0.1763, | |
| "mean_token_accuracy": 0.9631186270713806, | |
| "num_tokens": 3864494355.0, | |
| "step": 36500 | |
| }, | |
| { | |
| "entropy": 1.33109375, | |
| "epoch": 0.8949121002889183, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 3.761068047094987e-06, | |
| "loss": 0.1736, | |
| "mean_token_accuracy": 0.963892787694931, | |
| "num_tokens": 3869689661.0, | |
| "step": 36550 | |
| }, | |
| { | |
| "entropy": 1.3115625, | |
| "epoch": 0.8961363302482739, | |
| "grad_norm": 2.625, | |
| "learning_rate": 3.7551924126751624e-06, | |
| "loss": 0.1832, | |
| "mean_token_accuracy": 0.9618776285648346, | |
| "num_tokens": 3875053980.0, | |
| "step": 36600 | |
| }, | |
| { | |
| "entropy": 1.3021875, | |
| "epoch": 0.8973605602076294, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 3.7493136837946177e-06, | |
| "loss": 0.1749, | |
| "mean_token_accuracy": 0.962455780506134, | |
| "num_tokens": 3880568995.0, | |
| "step": 36650 | |
| }, | |
| { | |
| "entropy": 1.3209375, | |
| "epoch": 0.898584790166985, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 3.743431884541909e-06, | |
| "loss": 0.1835, | |
| "mean_token_accuracy": 0.9612640655040741, | |
| "num_tokens": 3885898540.0, | |
| "step": 36700 | |
| }, | |
| { | |
| "entropy": 1.31390625, | |
| "epoch": 0.8998090201263406, | |
| "grad_norm": 2.75, | |
| "learning_rate": 3.737547039018173e-06, | |
| "loss": 0.1664, | |
| "mean_token_accuracy": 0.9649625384807586, | |
| "num_tokens": 3891014489.0, | |
| "step": 36750 | |
| }, | |
| { | |
| "entropy": 1.323125, | |
| "epoch": 0.9010332500856961, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 3.7316591713370315e-06, | |
| "loss": 0.1774, | |
| "mean_token_accuracy": 0.9622565031051635, | |
| "num_tokens": 3896408077.0, | |
| "step": 36800 | |
| }, | |
| { | |
| "entropy": 1.34515625, | |
| "epoch": 0.9022574800450517, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 3.7257683056244895e-06, | |
| "loss": 0.178, | |
| "mean_token_accuracy": 0.9631640148162842, | |
| "num_tokens": 3901699376.0, | |
| "step": 36850 | |
| }, | |
| { | |
| "entropy": 1.32171875, | |
| "epoch": 0.9034817100044072, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 3.7198744660188347e-06, | |
| "loss": 0.1578, | |
| "mean_token_accuracy": 0.966994469165802, | |
| "num_tokens": 3906644235.0, | |
| "step": 36900 | |
| }, | |
| { | |
| "entropy": 1.3284375, | |
| "epoch": 0.9047059399637628, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 3.7139776766705433e-06, | |
| "loss": 0.161, | |
| "mean_token_accuracy": 0.9657053291797638, | |
| "num_tokens": 3911529877.0, | |
| "step": 36950 | |
| }, | |
| { | |
| "entropy": 1.320625, | |
| "epoch": 0.9059301699231184, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 3.7080779617421733e-06, | |
| "loss": 0.1663, | |
| "mean_token_accuracy": 0.9647897446155548, | |
| "num_tokens": 3917023608.0, | |
| "step": 37000 | |
| }, | |
| { | |
| "entropy": 1.3315625, | |
| "epoch": 0.9071543998824739, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 3.7021753454082772e-06, | |
| "loss": 0.1851, | |
| "mean_token_accuracy": 0.9609014749526977, | |
| "num_tokens": 3922789580.0, | |
| "step": 37050 | |
| }, | |
| { | |
| "entropy": 1.31453125, | |
| "epoch": 0.9083786298418295, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 3.696269851855292e-06, | |
| "loss": 0.1738, | |
| "mean_token_accuracy": 0.9629218196868896, | |
| "num_tokens": 3927904246.0, | |
| "step": 37100 | |
| }, | |
| { | |
| "entropy": 1.29828125, | |
| "epoch": 0.909602859801185, | |
| "grad_norm": 2.875, | |
| "learning_rate": 3.6903615052814444e-06, | |
| "loss": 0.1723, | |
| "mean_token_accuracy": 0.96382728099823, | |
| "num_tokens": 3933096610.0, | |
| "step": 37150 | |
| }, | |
| { | |
| "entropy": 1.275, | |
| "epoch": 0.9108270897605406, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 3.684450329896653e-06, | |
| "loss": 0.1538, | |
| "mean_token_accuracy": 0.9664675867557526, | |
| "num_tokens": 3938208531.0, | |
| "step": 37200 | |
| }, | |
| { | |
| "entropy": 1.2990625, | |
| "epoch": 0.9120513197198962, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 3.6785363499224266e-06, | |
| "loss": 0.1676, | |
| "mean_token_accuracy": 0.9638699948787689, | |
| "num_tokens": 3943507764.0, | |
| "step": 37250 | |
| }, | |
| { | |
| "entropy": 1.29953125, | |
| "epoch": 0.9132755496792517, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 3.672619589591768e-06, | |
| "loss": 0.1737, | |
| "mean_token_accuracy": 0.9631060230731964, | |
| "num_tokens": 3948883174.0, | |
| "step": 37300 | |
| }, | |
| { | |
| "entropy": 1.3128125, | |
| "epoch": 0.9144997796386073, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 3.6667000731490695e-06, | |
| "loss": 0.1769, | |
| "mean_token_accuracy": 0.9630844449996948, | |
| "num_tokens": 3954228445.0, | |
| "step": 37350 | |
| }, | |
| { | |
| "entropy": 1.31328125, | |
| "epoch": 0.9157240095979629, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 3.660777824850019e-06, | |
| "loss": 0.178, | |
| "mean_token_accuracy": 0.9625172114372254, | |
| "num_tokens": 3959522338.0, | |
| "step": 37400 | |
| }, | |
| { | |
| "entropy": 1.3109375, | |
| "epoch": 0.9169482395573184, | |
| "grad_norm": 3.875, | |
| "learning_rate": 3.6548528689614985e-06, | |
| "loss": 0.1615, | |
| "mean_token_accuracy": 0.9651338791847229, | |
| "num_tokens": 3964674293.0, | |
| "step": 37450 | |
| }, | |
| { | |
| "entropy": 1.3209375, | |
| "epoch": 0.918172469516674, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 3.6489252297614833e-06, | |
| "loss": 0.1743, | |
| "mean_token_accuracy": 0.9630649185180664, | |
| "num_tokens": 3970201603.0, | |
| "step": 37500 | |
| }, | |
| { | |
| "entropy": 1.33578125, | |
| "epoch": 0.9193966994760295, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 3.6429949315389455e-06, | |
| "loss": 0.1792, | |
| "mean_token_accuracy": 0.9619642412662506, | |
| "num_tokens": 3975729221.0, | |
| "step": 37550 | |
| }, | |
| { | |
| "entropy": 1.3121875, | |
| "epoch": 0.9206209294353851, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 3.6370619985937513e-06, | |
| "loss": 0.1658, | |
| "mean_token_accuracy": 0.9639672470092774, | |
| "num_tokens": 3980440332.0, | |
| "step": 37600 | |
| }, | |
| { | |
| "entropy": 1.314375, | |
| "epoch": 0.9218451593947407, | |
| "grad_norm": 2.0, | |
| "learning_rate": 3.6311264552365634e-06, | |
| "loss": 0.1748, | |
| "mean_token_accuracy": 0.9630878198146821, | |
| "num_tokens": 3985861602.0, | |
| "step": 37650 | |
| }, | |
| { | |
| "entropy": 1.31109375, | |
| "epoch": 0.9230693893540963, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 3.62518832578874e-06, | |
| "loss": 0.1647, | |
| "mean_token_accuracy": 0.9646557712554932, | |
| "num_tokens": 3991141130.0, | |
| "step": 37700 | |
| }, | |
| { | |
| "entropy": 1.31140625, | |
| "epoch": 0.9242936193134519, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 3.619247634582238e-06, | |
| "loss": 0.1798, | |
| "mean_token_accuracy": 0.961934734582901, | |
| "num_tokens": 3996774043.0, | |
| "step": 37750 | |
| }, | |
| { | |
| "entropy": 1.326875, | |
| "epoch": 0.9255178492728074, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 3.6133044059595083e-06, | |
| "loss": 0.1817, | |
| "mean_token_accuracy": 0.9612915456295014, | |
| "num_tokens": 4002462308.0, | |
| "step": 37800 | |
| }, | |
| { | |
| "entropy": 1.31359375, | |
| "epoch": 0.926742079232163, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 3.6073586642734027e-06, | |
| "loss": 0.1779, | |
| "mean_token_accuracy": 0.9622733199596405, | |
| "num_tokens": 4007870657.0, | |
| "step": 37850 | |
| }, | |
| { | |
| "entropy": 1.3059375, | |
| "epoch": 0.9279663091915186, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 3.601410433887068e-06, | |
| "loss": 0.1696, | |
| "mean_token_accuracy": 0.9639555370807648, | |
| "num_tokens": 4012925044.0, | |
| "step": 37900 | |
| }, | |
| { | |
| "entropy": 1.30625, | |
| "epoch": 0.9291905391508741, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 3.5954597391738487e-06, | |
| "loss": 0.1749, | |
| "mean_token_accuracy": 0.9627858221530914, | |
| "num_tokens": 4018089645.0, | |
| "step": 37950 | |
| }, | |
| { | |
| "entropy": 1.3059375, | |
| "epoch": 0.9304147691102297, | |
| "grad_norm": 3.0, | |
| "learning_rate": 3.589506604517189e-06, | |
| "loss": 0.1668, | |
| "mean_token_accuracy": 0.9654299330711364, | |
| "num_tokens": 4023139809.0, | |
| "step": 38000 | |
| }, | |
| { | |
| "entropy": 1.32140625, | |
| "epoch": 0.9316389990695852, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 3.583551054310529e-06, | |
| "loss": 0.1743, | |
| "mean_token_accuracy": 0.9638527107238769, | |
| "num_tokens": 4028437262.0, | |
| "step": 38050 | |
| }, | |
| { | |
| "entropy": 1.31890625, | |
| "epoch": 0.9328632290289408, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 3.5775931129572072e-06, | |
| "loss": 0.1658, | |
| "mean_token_accuracy": 0.9640737462043762, | |
| "num_tokens": 4033659635.0, | |
| "step": 38100 | |
| }, | |
| { | |
| "entropy": 1.31625, | |
| "epoch": 0.9340874589882964, | |
| "grad_norm": 4.28125, | |
| "learning_rate": 3.57163280487036e-06, | |
| "loss": 0.1742, | |
| "mean_token_accuracy": 0.9627125465869903, | |
| "num_tokens": 4039135210.0, | |
| "step": 38150 | |
| }, | |
| { | |
| "entropy": 1.31125, | |
| "epoch": 0.9353116889476519, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 3.5656701544728222e-06, | |
| "loss": 0.1739, | |
| "mean_token_accuracy": 0.9629321038722992, | |
| "num_tokens": 4044192912.0, | |
| "step": 38200 | |
| }, | |
| { | |
| "entropy": 1.315625, | |
| "epoch": 0.9365359189070075, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 3.559705186197026e-06, | |
| "loss": 0.1641, | |
| "mean_token_accuracy": 0.9655595874786377, | |
| "num_tokens": 4049649393.0, | |
| "step": 38250 | |
| }, | |
| { | |
| "entropy": 1.31484375, | |
| "epoch": 0.937760148866363, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 3.5537379244849017e-06, | |
| "loss": 0.1739, | |
| "mean_token_accuracy": 0.9634083175659179, | |
| "num_tokens": 4054901732.0, | |
| "step": 38300 | |
| }, | |
| { | |
| "entropy": 1.31140625, | |
| "epoch": 0.9389843788257186, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 3.5477683937877755e-06, | |
| "loss": 0.1694, | |
| "mean_token_accuracy": 0.9634031581878663, | |
| "num_tokens": 4060033796.0, | |
| "step": 38350 | |
| }, | |
| { | |
| "entropy": 1.30640625, | |
| "epoch": 0.9402086087850742, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 3.541796618566273e-06, | |
| "loss": 0.1634, | |
| "mean_token_accuracy": 0.9645454668998719, | |
| "num_tokens": 4065362004.0, | |
| "step": 38400 | |
| }, | |
| { | |
| "entropy": 1.281875, | |
| "epoch": 0.9414328387444297, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 3.535822623290217e-06, | |
| "loss": 0.1456, | |
| "mean_token_accuracy": 0.9695195186138154, | |
| "num_tokens": 4070167345.0, | |
| "step": 38450 | |
| }, | |
| { | |
| "entropy": 1.3134375, | |
| "epoch": 0.9426570687037853, | |
| "grad_norm": 0.004974365234375, | |
| "learning_rate": 3.5298464324385246e-06, | |
| "loss": 0.1636, | |
| "mean_token_accuracy": 0.9643441307544708, | |
| "num_tokens": 4075331852.0, | |
| "step": 38500 | |
| }, | |
| { | |
| "entropy": 1.29109375, | |
| "epoch": 0.9438812986631409, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 3.523868070499112e-06, | |
| "loss": 0.1522, | |
| "mean_token_accuracy": 0.9671092510223389, | |
| "num_tokens": 4080329045.0, | |
| "step": 38550 | |
| }, | |
| { | |
| "entropy": 1.2840625, | |
| "epoch": 0.9451055286224964, | |
| "grad_norm": 3.75, | |
| "learning_rate": 3.517887561968791e-06, | |
| "loss": 0.1616, | |
| "mean_token_accuracy": 0.9650249874591827, | |
| "num_tokens": 4085382254.0, | |
| "step": 38600 | |
| }, | |
| { | |
| "entropy": 1.29203125, | |
| "epoch": 0.946329758581852, | |
| "grad_norm": 3.5, | |
| "learning_rate": 3.5119049313531687e-06, | |
| "loss": 0.1698, | |
| "mean_token_accuracy": 0.9630708813667297, | |
| "num_tokens": 4090736615.0, | |
| "step": 38650 | |
| }, | |
| { | |
| "entropy": 1.29640625, | |
| "epoch": 0.9475539885412075, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 3.5059202031665473e-06, | |
| "loss": 0.1761, | |
| "mean_token_accuracy": 0.962629064321518, | |
| "num_tokens": 4096335023.0, | |
| "step": 38700 | |
| }, | |
| { | |
| "entropy": 1.28390625, | |
| "epoch": 0.9487782185005631, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 3.499933401931826e-06, | |
| "loss": 0.1711, | |
| "mean_token_accuracy": 0.9639296698570251, | |
| "num_tokens": 4101408840.0, | |
| "step": 38750 | |
| }, | |
| { | |
| "entropy": 1.26234375, | |
| "epoch": 0.9500024484599187, | |
| "grad_norm": 3.515625, | |
| "learning_rate": 3.493944552180395e-06, | |
| "loss": 0.1548, | |
| "mean_token_accuracy": 0.9661567640304566, | |
| "num_tokens": 4106422813.0, | |
| "step": 38800 | |
| }, | |
| { | |
| "entropy": 1.2709375, | |
| "epoch": 0.9512266784192743, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 3.487953678452042e-06, | |
| "loss": 0.1544, | |
| "mean_token_accuracy": 0.9669099247455597, | |
| "num_tokens": 4111614226.0, | |
| "step": 38850 | |
| }, | |
| { | |
| "entropy": 1.2715625, | |
| "epoch": 0.9524509083786299, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 3.481960805294847e-06, | |
| "loss": 0.1652, | |
| "mean_token_accuracy": 0.9649276435375214, | |
| "num_tokens": 4116902981.0, | |
| "step": 38900 | |
| }, | |
| { | |
| "entropy": 1.26828125, | |
| "epoch": 0.9536751383379855, | |
| "grad_norm": 3.390625, | |
| "learning_rate": 3.47596595726508e-06, | |
| "loss": 0.1682, | |
| "mean_token_accuracy": 0.9636393487453461, | |
| "num_tokens": 4122056561.0, | |
| "step": 38950 | |
| }, | |
| { | |
| "entropy": 1.270625, | |
| "epoch": 0.954899368297341, | |
| "grad_norm": 2.125, | |
| "learning_rate": 3.4699691589271076e-06, | |
| "loss": 0.1685, | |
| "mean_token_accuracy": 0.9632602989673614, | |
| "num_tokens": 4127685041.0, | |
| "step": 39000 | |
| }, | |
| { | |
| "entropy": 1.2453125, | |
| "epoch": 0.9561235982566966, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 3.463970434853285e-06, | |
| "loss": 0.142, | |
| "mean_token_accuracy": 0.9697425818443298, | |
| "num_tokens": 4132578966.0, | |
| "step": 39050 | |
| }, | |
| { | |
| "entropy": 1.258125, | |
| "epoch": 0.9573478282160521, | |
| "grad_norm": 2.375, | |
| "learning_rate": 3.45796980962386e-06, | |
| "loss": 0.1678, | |
| "mean_token_accuracy": 0.9646705484390259, | |
| "num_tokens": 4138012784.0, | |
| "step": 39100 | |
| }, | |
| { | |
| "entropy": 1.26859375, | |
| "epoch": 0.9585720581754077, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 3.451967307826869e-06, | |
| "loss": 0.1757, | |
| "mean_token_accuracy": 0.9628133857250214, | |
| "num_tokens": 4143616072.0, | |
| "step": 39150 | |
| }, | |
| { | |
| "entropy": 1.259375, | |
| "epoch": 0.9597962881347633, | |
| "grad_norm": 4.1875, | |
| "learning_rate": 3.445962954058039e-06, | |
| "loss": 0.1752, | |
| "mean_token_accuracy": 0.962674834728241, | |
| "num_tokens": 4148944121.0, | |
| "step": 39200 | |
| }, | |
| { | |
| "entropy": 1.25921875, | |
| "epoch": 0.9610205180941188, | |
| "grad_norm": 3.375, | |
| "learning_rate": 3.439956772920685e-06, | |
| "loss": 0.1648, | |
| "mean_token_accuracy": 0.9645766019821167, | |
| "num_tokens": 4153880493.0, | |
| "step": 39250 | |
| }, | |
| { | |
| "entropy": 1.2525, | |
| "epoch": 0.9622447480534744, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 3.4339487890256097e-06, | |
| "loss": 0.161, | |
| "mean_token_accuracy": 0.965018298625946, | |
| "num_tokens": 4158921325.0, | |
| "step": 39300 | |
| }, | |
| { | |
| "entropy": 1.2459375, | |
| "epoch": 0.9634689780128299, | |
| "grad_norm": 3.5, | |
| "learning_rate": 3.4279390269910033e-06, | |
| "loss": 0.1658, | |
| "mean_token_accuracy": 0.9649594247341156, | |
| "num_tokens": 4163950443.0, | |
| "step": 39350 | |
| }, | |
| { | |
| "entropy": 1.2590625, | |
| "epoch": 0.9646932079721855, | |
| "grad_norm": 2.875, | |
| "learning_rate": 3.421927511442341e-06, | |
| "loss": 0.172, | |
| "mean_token_accuracy": 0.9640387868881226, | |
| "num_tokens": 4169489034.0, | |
| "step": 39400 | |
| }, | |
| { | |
| "entropy": 1.261875, | |
| "epoch": 0.9659174379315411, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 3.4159142670122845e-06, | |
| "loss": 0.1719, | |
| "mean_token_accuracy": 0.9637044394016265, | |
| "num_tokens": 4174842337.0, | |
| "step": 39450 | |
| }, | |
| { | |
| "entropy": 1.26265625, | |
| "epoch": 0.9671416678908966, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 3.4098993183405793e-06, | |
| "loss": 0.1725, | |
| "mean_token_accuracy": 0.9634046721458435, | |
| "num_tokens": 4180354181.0, | |
| "step": 39500 | |
| }, | |
| { | |
| "entropy": 1.26046875, | |
| "epoch": 0.9683658978502522, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 3.403882690073954e-06, | |
| "loss": 0.1653, | |
| "mean_token_accuracy": 0.9639586913585663, | |
| "num_tokens": 4185417059.0, | |
| "step": 39550 | |
| }, | |
| { | |
| "entropy": 1.27140625, | |
| "epoch": 0.9695901278096077, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 3.3978644068660175e-06, | |
| "loss": 0.1583, | |
| "mean_token_accuracy": 0.96663733959198, | |
| "num_tokens": 4190550088.0, | |
| "step": 39600 | |
| }, | |
| { | |
| "entropy": 1.28140625, | |
| "epoch": 0.9708143577689633, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 3.3918444933771637e-06, | |
| "loss": 0.1755, | |
| "mean_token_accuracy": 0.9624445605278015, | |
| "num_tokens": 4196306371.0, | |
| "step": 39650 | |
| }, | |
| { | |
| "entropy": 1.27078125, | |
| "epoch": 0.9720385877283189, | |
| "grad_norm": 1.875, | |
| "learning_rate": 3.385822974274465e-06, | |
| "loss": 0.1673, | |
| "mean_token_accuracy": 0.9644521117210388, | |
| "num_tokens": 4201403065.0, | |
| "step": 39700 | |
| }, | |
| { | |
| "entropy": 1.2859375, | |
| "epoch": 0.9732628176876744, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 3.3797998742315724e-06, | |
| "loss": 0.1646, | |
| "mean_token_accuracy": 0.9653528666496277, | |
| "num_tokens": 4206711792.0, | |
| "step": 39750 | |
| }, | |
| { | |
| "entropy": 1.2709375, | |
| "epoch": 0.97448704764703, | |
| "grad_norm": 3.625, | |
| "learning_rate": 3.3737752179286158e-06, | |
| "loss": 0.1694, | |
| "mean_token_accuracy": 0.964444397687912, | |
| "num_tokens": 4212047599.0, | |
| "step": 39800 | |
| }, | |
| { | |
| "entropy": 1.289375, | |
| "epoch": 0.9757112776063855, | |
| "grad_norm": 2.5, | |
| "learning_rate": 3.3677490300521e-06, | |
| "loss": 0.1697, | |
| "mean_token_accuracy": 0.963803733587265, | |
| "num_tokens": 4217477603.0, | |
| "step": 39850 | |
| }, | |
| { | |
| "entropy": 1.27140625, | |
| "epoch": 0.9769355075657411, | |
| "grad_norm": 2.25, | |
| "learning_rate": 3.361721335294809e-06, | |
| "loss": 0.1579, | |
| "mean_token_accuracy": 0.9657166159152984, | |
| "num_tokens": 4222290662.0, | |
| "step": 39900 | |
| }, | |
| { | |
| "entropy": 1.3034375, | |
| "epoch": 0.9781597375250967, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 3.355692158355699e-06, | |
| "loss": 0.1816, | |
| "mean_token_accuracy": 0.9609908378124237, | |
| "num_tokens": 4228024616.0, | |
| "step": 39950 | |
| }, | |
| { | |
| "entropy": 1.2765625, | |
| "epoch": 0.9793839674844523, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 3.349661523939799e-06, | |
| "loss": 0.1549, | |
| "mean_token_accuracy": 0.9669453859329223, | |
| "num_tokens": 4233080108.0, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.9793839674844523, | |
| "eval_entropy": 1.2830078125, | |
| "eval_loss": 0.18154892325401306, | |
| "eval_mean_token_accuracy": 0.9611844887336095, | |
| "eval_num_tokens": 4233080108.0, | |
| "eval_runtime": 601.7254, | |
| "eval_samples_per_second": 16.047, | |
| "eval_steps_per_second": 0.201, | |
| "step": 40000 | |
| }, | |
| { | |
| "entropy": 1.28, | |
| "epoch": 0.9806081974438079, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 3.3436294567581125e-06, | |
| "loss": 0.1685, | |
| "mean_token_accuracy": 0.9643000710010529, | |
| "num_tokens": 4238491459.0, | |
| "step": 40050 | |
| }, | |
| { | |
| "entropy": 1.29109375, | |
| "epoch": 0.9818324274031635, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 3.3375959815275103e-06, | |
| "loss": 0.1714, | |
| "mean_token_accuracy": 0.9640710878372193, | |
| "num_tokens": 4244109418.0, | |
| "step": 40100 | |
| }, | |
| { | |
| "entropy": 1.27515625, | |
| "epoch": 0.983056657362519, | |
| "grad_norm": 0.029052734375, | |
| "learning_rate": 3.3315611229706377e-06, | |
| "loss": 0.1519, | |
| "mean_token_accuracy": 0.9681409633159638, | |
| "num_tokens": 4249250373.0, | |
| "step": 40150 | |
| }, | |
| { | |
| "entropy": 1.2784375, | |
| "epoch": 0.9842808873218746, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 3.325524905815804e-06, | |
| "loss": 0.1654, | |
| "mean_token_accuracy": 0.9648780179023743, | |
| "num_tokens": 4254623197.0, | |
| "step": 40200 | |
| }, | |
| { | |
| "entropy": 1.2959375, | |
| "epoch": 0.9855051172812301, | |
| "grad_norm": 2.0, | |
| "learning_rate": 3.3194873547968867e-06, | |
| "loss": 0.1667, | |
| "mean_token_accuracy": 0.963757860660553, | |
| "num_tokens": 4260002335.0, | |
| "step": 40250 | |
| }, | |
| { | |
| "entropy": 1.28453125, | |
| "epoch": 0.9867293472405857, | |
| "grad_norm": 0.0078125, | |
| "learning_rate": 3.313448494653232e-06, | |
| "loss": 0.1738, | |
| "mean_token_accuracy": 0.9633991587162017, | |
| "num_tokens": 4265450665.0, | |
| "step": 40300 | |
| }, | |
| { | |
| "entropy": 1.26625, | |
| "epoch": 0.9879535771999413, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 3.3074083501295447e-06, | |
| "loss": 0.1441, | |
| "mean_token_accuracy": 0.9687767088413238, | |
| "num_tokens": 4270155512.0, | |
| "step": 40350 | |
| }, | |
| { | |
| "entropy": 1.2728125, | |
| "epoch": 0.9891778071592968, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 3.3013669459757956e-06, | |
| "loss": 0.1546, | |
| "mean_token_accuracy": 0.9668355488777161, | |
| "num_tokens": 4275174062.0, | |
| "step": 40400 | |
| }, | |
| { | |
| "entropy": 1.28171875, | |
| "epoch": 0.9904020371186524, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 3.2953243069471187e-06, | |
| "loss": 0.1692, | |
| "mean_token_accuracy": 0.9641734325885772, | |
| "num_tokens": 4280291982.0, | |
| "step": 40450 | |
| }, | |
| { | |
| "entropy": 1.29375, | |
| "epoch": 0.9916262670780079, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 3.2892804578037036e-06, | |
| "loss": 0.1754, | |
| "mean_token_accuracy": 0.9624480056762695, | |
| "num_tokens": 4285827143.0, | |
| "step": 40500 | |
| }, | |
| { | |
| "entropy": 1.29921875, | |
| "epoch": 0.9928504970373635, | |
| "grad_norm": 3.328125, | |
| "learning_rate": 3.2832354233107023e-06, | |
| "loss": 0.1717, | |
| "mean_token_accuracy": 0.9635557103157043, | |
| "num_tokens": 4291196556.0, | |
| "step": 40550 | |
| }, | |
| { | |
| "entropy": 1.29515625, | |
| "epoch": 0.9940747269967191, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 3.2771892282381226e-06, | |
| "loss": 0.1535, | |
| "mean_token_accuracy": 0.9667463576793671, | |
| "num_tokens": 4296297335.0, | |
| "step": 40600 | |
| }, | |
| { | |
| "entropy": 1.2765625, | |
| "epoch": 0.9952989569560746, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 3.2711418973607257e-06, | |
| "loss": 0.1584, | |
| "mean_token_accuracy": 0.9667293214797974, | |
| "num_tokens": 4301506384.0, | |
| "step": 40650 | |
| }, | |
| { | |
| "entropy": 1.27078125, | |
| "epoch": 0.9965231869154302, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 3.2650934554579314e-06, | |
| "loss": 0.1551, | |
| "mean_token_accuracy": 0.9660876715183258, | |
| "num_tokens": 4306603792.0, | |
| "step": 40700 | |
| }, | |
| { | |
| "entropy": 1.27515625, | |
| "epoch": 0.9977474168747857, | |
| "grad_norm": 4.15625, | |
| "learning_rate": 3.2590439273137074e-06, | |
| "loss": 0.1702, | |
| "mean_token_accuracy": 0.9637362861633301, | |
| "num_tokens": 4312148607.0, | |
| "step": 40750 | |
| }, | |
| { | |
| "entropy": 1.266875, | |
| "epoch": 0.9989716468341413, | |
| "grad_norm": 3.546875, | |
| "learning_rate": 3.2529933377164754e-06, | |
| "loss": 0.1498, | |
| "mean_token_accuracy": 0.9686801016330719, | |
| "num_tokens": 4317085828.0, | |
| "step": 40800 | |
| }, | |
| { | |
| "entropy": 1.2784375, | |
| "epoch": 1.0001958767934969, | |
| "grad_norm": 3.578125, | |
| "learning_rate": 3.2469417114590055e-06, | |
| "loss": 0.1627, | |
| "mean_token_accuracy": 0.9648519742488861, | |
| "num_tokens": 4322221289.0, | |
| "step": 40850 | |
| }, | |
| { | |
| "entropy": 1.28578125, | |
| "epoch": 1.0014201067528525, | |
| "grad_norm": 4.375, | |
| "learning_rate": 3.240889073338315e-06, | |
| "loss": 0.1602, | |
| "mean_token_accuracy": 0.9657353925704956, | |
| "num_tokens": 4327372960.0, | |
| "step": 40900 | |
| }, | |
| { | |
| "entropy": 1.268125, | |
| "epoch": 1.002644336712208, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 3.2348354481555692e-06, | |
| "loss": 0.1607, | |
| "mean_token_accuracy": 0.9653881311416626, | |
| "num_tokens": 4332436449.0, | |
| "step": 40950 | |
| }, | |
| { | |
| "entropy": 1.27359375, | |
| "epoch": 1.0038685666715637, | |
| "grad_norm": 0.0013580322265625, | |
| "learning_rate": 3.2287808607159753e-06, | |
| "loss": 0.153, | |
| "mean_token_accuracy": 0.9669638919830322, | |
| "num_tokens": 4337572886.0, | |
| "step": 41000 | |
| }, | |
| { | |
| "entropy": 1.26375, | |
| "epoch": 1.005092796630919, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 3.222725335828685e-06, | |
| "loss": 0.1474, | |
| "mean_token_accuracy": 0.9681554007530212, | |
| "num_tokens": 4342524064.0, | |
| "step": 41050 | |
| }, | |
| { | |
| "entropy": 1.290625, | |
| "epoch": 1.0063170265902748, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 3.216668898306692e-06, | |
| "loss": 0.1723, | |
| "mean_token_accuracy": 0.9632875370979309, | |
| "num_tokens": 4347805365.0, | |
| "step": 41100 | |
| }, | |
| { | |
| "entropy": 1.28421875, | |
| "epoch": 1.0075412565496302, | |
| "grad_norm": 3.546875, | |
| "learning_rate": 3.210611572966728e-06, | |
| "loss": 0.1571, | |
| "mean_token_accuracy": 0.9664819014072418, | |
| "num_tokens": 4352875723.0, | |
| "step": 41150 | |
| }, | |
| { | |
| "entropy": 1.29171875, | |
| "epoch": 1.008765486508986, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 3.2045533846291643e-06, | |
| "loss": 0.1755, | |
| "mean_token_accuracy": 0.9631037187576293, | |
| "num_tokens": 4358561815.0, | |
| "step": 41200 | |
| }, | |
| { | |
| "entropy": 1.276875, | |
| "epoch": 1.0099897164683413, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 3.1984943581179053e-06, | |
| "loss": 0.1553, | |
| "mean_token_accuracy": 0.9667964303493499, | |
| "num_tokens": 4363644242.0, | |
| "step": 41250 | |
| }, | |
| { | |
| "entropy": 1.30296875, | |
| "epoch": 1.011213946427697, | |
| "grad_norm": 3.96875, | |
| "learning_rate": 3.1924345182602943e-06, | |
| "loss": 0.1749, | |
| "mean_token_accuracy": 0.9630448269844055, | |
| "num_tokens": 4369318393.0, | |
| "step": 41300 | |
| }, | |
| { | |
| "entropy": 1.28875, | |
| "epoch": 1.0124381763870525, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 3.1863738898870033e-06, | |
| "loss": 0.1669, | |
| "mean_token_accuracy": 0.9647123277187347, | |
| "num_tokens": 4374659681.0, | |
| "step": 41350 | |
| }, | |
| { | |
| "entropy": 1.27265625, | |
| "epoch": 1.0136624063464081, | |
| "grad_norm": 3.125, | |
| "learning_rate": 3.180312497831938e-06, | |
| "loss": 0.1567, | |
| "mean_token_accuracy": 0.9661735820770264, | |
| "num_tokens": 4379733438.0, | |
| "step": 41400 | |
| }, | |
| { | |
| "entropy": 1.28484375, | |
| "epoch": 1.0148866363057636, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 3.174250366932133e-06, | |
| "loss": 0.1612, | |
| "mean_token_accuracy": 0.9659793210029602, | |
| "num_tokens": 4384885742.0, | |
| "step": 41450 | |
| }, | |
| { | |
| "entropy": 1.2934375, | |
| "epoch": 1.0161108662651193, | |
| "grad_norm": 3.65625, | |
| "learning_rate": 3.1681875220276487e-06, | |
| "loss": 0.1702, | |
| "mean_token_accuracy": 0.9628891766071319, | |
| "num_tokens": 4390251007.0, | |
| "step": 41500 | |
| }, | |
| { | |
| "entropy": 1.29703125, | |
| "epoch": 1.0173350962244747, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 3.1621239879614722e-06, | |
| "loss": 0.1752, | |
| "mean_token_accuracy": 0.9631851124763489, | |
| "num_tokens": 4395820970.0, | |
| "step": 41550 | |
| }, | |
| { | |
| "entropy": 1.289375, | |
| "epoch": 1.0185593261838304, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 3.1560597895794157e-06, | |
| "loss": 0.1651, | |
| "mean_token_accuracy": 0.9643260395526886, | |
| "num_tokens": 4401284321.0, | |
| "step": 41600 | |
| }, | |
| { | |
| "entropy": 1.3046875, | |
| "epoch": 1.0197835561431858, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 3.149994951730011e-06, | |
| "loss": 0.1879, | |
| "mean_token_accuracy": 0.9601117408275605, | |
| "num_tokens": 4406768060.0, | |
| "step": 41650 | |
| }, | |
| { | |
| "entropy": 1.29484375, | |
| "epoch": 1.0210077861025415, | |
| "grad_norm": 3.0, | |
| "learning_rate": 3.143929499264413e-06, | |
| "loss": 0.1665, | |
| "mean_token_accuracy": 0.9648369300365448, | |
| "num_tokens": 4412201333.0, | |
| "step": 41700 | |
| }, | |
| { | |
| "entropy": 1.28328125, | |
| "epoch": 1.0222320160618972, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 3.137863457036292e-06, | |
| "loss": 0.1533, | |
| "mean_token_accuracy": 0.9676184570789337, | |
| "num_tokens": 4417135073.0, | |
| "step": 41750 | |
| }, | |
| { | |
| "entropy": 1.3009375, | |
| "epoch": 1.0234562460212526, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 3.1317968499017366e-06, | |
| "loss": 0.1742, | |
| "mean_token_accuracy": 0.9627422571182251, | |
| "num_tokens": 4422234270.0, | |
| "step": 41800 | |
| }, | |
| { | |
| "entropy": 1.29265625, | |
| "epoch": 1.0246804759806083, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 3.1257297027191517e-06, | |
| "loss": 0.1579, | |
| "mean_token_accuracy": 0.9664195513725281, | |
| "num_tokens": 4427309878.0, | |
| "step": 41850 | |
| }, | |
| { | |
| "entropy": 1.275, | |
| "epoch": 1.0259047059399637, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 3.1196620403491515e-06, | |
| "loss": 0.1651, | |
| "mean_token_accuracy": 0.9644128286838531, | |
| "num_tokens": 4432672891.0, | |
| "step": 41900 | |
| }, | |
| { | |
| "entropy": 1.2815625, | |
| "epoch": 1.0271289358993194, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 3.113593887654463e-06, | |
| "loss": 0.1513, | |
| "mean_token_accuracy": 0.9673609352111816, | |
| "num_tokens": 4437526358.0, | |
| "step": 41950 | |
| }, | |
| { | |
| "entropy": 1.290625, | |
| "epoch": 1.0283531658586749, | |
| "grad_norm": 3.125, | |
| "learning_rate": 3.107525269499825e-06, | |
| "loss": 0.1706, | |
| "mean_token_accuracy": 0.9627550756931305, | |
| "num_tokens": 4442820350.0, | |
| "step": 42000 | |
| }, | |
| { | |
| "entropy": 1.29484375, | |
| "epoch": 1.0295773958180305, | |
| "grad_norm": 3.4375, | |
| "learning_rate": 3.1014562107518786e-06, | |
| "loss": 0.1684, | |
| "mean_token_accuracy": 0.9646277678012848, | |
| "num_tokens": 4448357734.0, | |
| "step": 42050 | |
| }, | |
| { | |
| "entropy": 1.300625, | |
| "epoch": 1.030801625777386, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 3.0953867362790734e-06, | |
| "loss": 0.1802, | |
| "mean_token_accuracy": 0.9611736404895782, | |
| "num_tokens": 4453928087.0, | |
| "step": 42100 | |
| }, | |
| { | |
| "entropy": 1.29171875, | |
| "epoch": 1.0320258557367417, | |
| "grad_norm": 3.375, | |
| "learning_rate": 3.089316870951562e-06, | |
| "loss": 0.162, | |
| "mean_token_accuracy": 0.9649739050865174, | |
| "num_tokens": 4458946227.0, | |
| "step": 42150 | |
| }, | |
| { | |
| "entropy": 1.289375, | |
| "epoch": 1.033250085696097, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 3.083246639641098e-06, | |
| "loss": 0.1723, | |
| "mean_token_accuracy": 0.9634380388259888, | |
| "num_tokens": 4464192504.0, | |
| "step": 42200 | |
| }, | |
| { | |
| "entropy": 1.3146875, | |
| "epoch": 1.0344743156554528, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 3.077176067220935e-06, | |
| "loss": 0.1793, | |
| "mean_token_accuracy": 0.9617934930324554, | |
| "num_tokens": 4469999689.0, | |
| "step": 42250 | |
| }, | |
| { | |
| "entropy": 1.3025, | |
| "epoch": 1.0356985456148082, | |
| "grad_norm": 2.125, | |
| "learning_rate": 3.0711051785657236e-06, | |
| "loss": 0.1649, | |
| "mean_token_accuracy": 0.964527097940445, | |
| "num_tokens": 4475221088.0, | |
| "step": 42300 | |
| }, | |
| { | |
| "entropy": 1.29015625, | |
| "epoch": 1.036922775574164, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 3.065033998551413e-06, | |
| "loss": 0.1741, | |
| "mean_token_accuracy": 0.9632121896743775, | |
| "num_tokens": 4480484467.0, | |
| "step": 42350 | |
| }, | |
| { | |
| "entropy": 1.29890625, | |
| "epoch": 1.0381470055335194, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 3.0589625520551414e-06, | |
| "loss": 0.168, | |
| "mean_token_accuracy": 0.9637061321735382, | |
| "num_tokens": 4486042679.0, | |
| "step": 42400 | |
| }, | |
| { | |
| "entropy": 1.31703125, | |
| "epoch": 1.039371235492875, | |
| "grad_norm": 2.75, | |
| "learning_rate": 3.0528908639551436e-06, | |
| "loss": 0.1726, | |
| "mean_token_accuracy": 0.9634595859050751, | |
| "num_tokens": 4491749175.0, | |
| "step": 42450 | |
| }, | |
| { | |
| "entropy": 1.280625, | |
| "epoch": 1.0405954654522305, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 3.0468189591306418e-06, | |
| "loss": 0.1637, | |
| "mean_token_accuracy": 0.9648339354991913, | |
| "num_tokens": 4497083391.0, | |
| "step": 42500 | |
| }, | |
| { | |
| "entropy": 1.275, | |
| "epoch": 1.0418196954115861, | |
| "grad_norm": 3.59375, | |
| "learning_rate": 3.040746862461747e-06, | |
| "loss": 0.1573, | |
| "mean_token_accuracy": 0.9660842347145081, | |
| "num_tokens": 4502213588.0, | |
| "step": 42550 | |
| }, | |
| { | |
| "entropy": 1.27265625, | |
| "epoch": 1.0430439253709416, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 3.0346745988293553e-06, | |
| "loss": 0.1638, | |
| "mean_token_accuracy": 0.9644993054866791, | |
| "num_tokens": 4507601887.0, | |
| "step": 42600 | |
| }, | |
| { | |
| "entropy": 1.2703125, | |
| "epoch": 1.0442681553302973, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 3.02860219311505e-06, | |
| "loss": 0.162, | |
| "mean_token_accuracy": 0.965209093093872, | |
| "num_tokens": 4512999351.0, | |
| "step": 42650 | |
| }, | |
| { | |
| "entropy": 1.2659375, | |
| "epoch": 1.0454923852896527, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 3.0225296702009917e-06, | |
| "loss": 0.1708, | |
| "mean_token_accuracy": 0.9636136376857758, | |
| "num_tokens": 4518295845.0, | |
| "step": 42700 | |
| }, | |
| { | |
| "entropy": 1.27453125, | |
| "epoch": 1.0467166152490084, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 3.016457054969827e-06, | |
| "loss": 0.165, | |
| "mean_token_accuracy": 0.9648648130893708, | |
| "num_tokens": 4523705084.0, | |
| "step": 42750 | |
| }, | |
| { | |
| "entropy": 1.27328125, | |
| "epoch": 1.0479408452083638, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 3.0103843723045753e-06, | |
| "loss": 0.1587, | |
| "mean_token_accuracy": 0.9660780084133148, | |
| "num_tokens": 4528928559.0, | |
| "step": 42800 | |
| }, | |
| { | |
| "entropy": 1.27140625, | |
| "epoch": 1.0491650751677195, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 3.004311647088536e-06, | |
| "loss": 0.1608, | |
| "mean_token_accuracy": 0.9661289596557617, | |
| "num_tokens": 4534161929.0, | |
| "step": 42850 | |
| }, | |
| { | |
| "entropy": 1.28, | |
| "epoch": 1.0503893051270752, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 2.9982389042051802e-06, | |
| "loss": 0.1596, | |
| "mean_token_accuracy": 0.9655217385292053, | |
| "num_tokens": 4539230226.0, | |
| "step": 42900 | |
| }, | |
| { | |
| "entropy": 1.27828125, | |
| "epoch": 1.0516135350864306, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 2.992166168538055e-06, | |
| "loss": 0.1654, | |
| "mean_token_accuracy": 0.9645612442493439, | |
| "num_tokens": 4544444757.0, | |
| "step": 42950 | |
| }, | |
| { | |
| "entropy": 1.28609375, | |
| "epoch": 1.0528377650457863, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 2.986093464970675e-06, | |
| "loss": 0.1809, | |
| "mean_token_accuracy": 0.961436516046524, | |
| "num_tokens": 4550024290.0, | |
| "step": 43000 | |
| }, | |
| { | |
| "entropy": 1.2921875, | |
| "epoch": 1.0540619950051417, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 2.9800208183864225e-06, | |
| "loss": 0.1737, | |
| "mean_token_accuracy": 0.9631437683105468, | |
| "num_tokens": 4555846037.0, | |
| "step": 43050 | |
| }, | |
| { | |
| "entropy": 1.29046875, | |
| "epoch": 1.0552862249644974, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 2.97394825366845e-06, | |
| "loss": 0.1824, | |
| "mean_token_accuracy": 0.9611044287681579, | |
| "num_tokens": 4561556919.0, | |
| "step": 43100 | |
| }, | |
| { | |
| "entropy": 1.2615625, | |
| "epoch": 1.0565104549238529, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 2.9678757956995704e-06, | |
| "loss": 0.1519, | |
| "mean_token_accuracy": 0.967376263141632, | |
| "num_tokens": 4566754673.0, | |
| "step": 43150 | |
| }, | |
| { | |
| "entropy": 1.24921875, | |
| "epoch": 1.0577346848832085, | |
| "grad_norm": 3.5, | |
| "learning_rate": 2.9618034693621624e-06, | |
| "loss": 0.1651, | |
| "mean_token_accuracy": 0.9647138011455536, | |
| "num_tokens": 4571961153.0, | |
| "step": 43200 | |
| }, | |
| { | |
| "entropy": 1.27078125, | |
| "epoch": 1.058958914842564, | |
| "grad_norm": 4.0625, | |
| "learning_rate": 2.955731299538065e-06, | |
| "loss": 0.1664, | |
| "mean_token_accuracy": 0.9643959999084473, | |
| "num_tokens": 4577276643.0, | |
| "step": 43250 | |
| }, | |
| { | |
| "entropy": 1.27125, | |
| "epoch": 1.0601831448019197, | |
| "grad_norm": 3.5, | |
| "learning_rate": 2.9496593111084725e-06, | |
| "loss": 0.1764, | |
| "mean_token_accuracy": 0.9621264743804931, | |
| "num_tokens": 4582787780.0, | |
| "step": 43300 | |
| }, | |
| { | |
| "entropy": 1.2503125, | |
| "epoch": 1.0614073747612751, | |
| "grad_norm": 0.06201171875, | |
| "learning_rate": 2.9435875289538397e-06, | |
| "loss": 0.1616, | |
| "mean_token_accuracy": 0.9652257537841797, | |
| "num_tokens": 4587978646.0, | |
| "step": 43350 | |
| }, | |
| { | |
| "entropy": 1.25390625, | |
| "epoch": 1.0626316047206308, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 2.937515977953776e-06, | |
| "loss": 0.1601, | |
| "mean_token_accuracy": 0.9656472432613373, | |
| "num_tokens": 4593105594.0, | |
| "step": 43400 | |
| }, | |
| { | |
| "entropy": 1.235, | |
| "epoch": 1.0638558346799862, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 2.93144468298694e-06, | |
| "loss": 0.1465, | |
| "mean_token_accuracy": 0.9684570038318634, | |
| "num_tokens": 4598082227.0, | |
| "step": 43450 | |
| }, | |
| { | |
| "entropy": 1.2615625, | |
| "epoch": 1.065080064639342, | |
| "grad_norm": 1.75, | |
| "learning_rate": 2.9253736689309453e-06, | |
| "loss": 0.1739, | |
| "mean_token_accuracy": 0.9627693855762481, | |
| "num_tokens": 4603820936.0, | |
| "step": 43500 | |
| }, | |
| { | |
| "entropy": 1.2409375, | |
| "epoch": 1.0663042945986974, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 2.919302960662252e-06, | |
| "loss": 0.1665, | |
| "mean_token_accuracy": 0.9645286548137665, | |
| "num_tokens": 4609111825.0, | |
| "step": 43550 | |
| }, | |
| { | |
| "entropy": 1.251875, | |
| "epoch": 1.067528524558053, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 2.9132325830560694e-06, | |
| "loss": 0.1708, | |
| "mean_token_accuracy": 0.9642206788063049, | |
| "num_tokens": 4614988638.0, | |
| "step": 43600 | |
| }, | |
| { | |
| "entropy": 1.23515625, | |
| "epoch": 1.0687527545174085, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 2.907162560986249e-06, | |
| "loss": 0.1665, | |
| "mean_token_accuracy": 0.9648200106620789, | |
| "num_tokens": 4620258466.0, | |
| "step": 43650 | |
| }, | |
| { | |
| "entropy": 1.23046875, | |
| "epoch": 1.0699769844767641, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 2.9010929193251877e-06, | |
| "loss": 0.1587, | |
| "mean_token_accuracy": 0.9666041648387909, | |
| "num_tokens": 4625541440.0, | |
| "step": 43700 | |
| }, | |
| { | |
| "entropy": 1.23578125, | |
| "epoch": 1.0712012144361196, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 2.8950236829437243e-06, | |
| "loss": 0.1595, | |
| "mean_token_accuracy": 0.9665923917293548, | |
| "num_tokens": 4630862596.0, | |
| "step": 43750 | |
| }, | |
| { | |
| "entropy": 1.24796875, | |
| "epoch": 1.0724254443954753, | |
| "grad_norm": 3.625, | |
| "learning_rate": 2.8889548767110325e-06, | |
| "loss": 0.1726, | |
| "mean_token_accuracy": 0.9622351431846619, | |
| "num_tokens": 4636080162.0, | |
| "step": 43800 | |
| }, | |
| { | |
| "entropy": 1.255, | |
| "epoch": 1.0736496743548307, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 2.882886525494528e-06, | |
| "loss": 0.1677, | |
| "mean_token_accuracy": 0.9641489648818969, | |
| "num_tokens": 4641603830.0, | |
| "step": 43850 | |
| }, | |
| { | |
| "entropy": 1.26390625, | |
| "epoch": 1.0748739043141864, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 2.8768186541597617e-06, | |
| "loss": 0.1803, | |
| "mean_token_accuracy": 0.9621511352062225, | |
| "num_tokens": 4647162733.0, | |
| "step": 43900 | |
| }, | |
| { | |
| "entropy": 1.26953125, | |
| "epoch": 1.0760981342735418, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 2.8707512875703146e-06, | |
| "loss": 0.1724, | |
| "mean_token_accuracy": 0.963198972940445, | |
| "num_tokens": 4652659894.0, | |
| "step": 43950 | |
| }, | |
| { | |
| "entropy": 1.261875, | |
| "epoch": 1.0773223642328975, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 2.8646844505877032e-06, | |
| "loss": 0.1702, | |
| "mean_token_accuracy": 0.963871557712555, | |
| "num_tokens": 4657833019.0, | |
| "step": 44000 | |
| }, | |
| { | |
| "entropy": 1.25171875, | |
| "epoch": 1.078546594192253, | |
| "grad_norm": 3.875, | |
| "learning_rate": 2.8586181680712726e-06, | |
| "loss": 0.1671, | |
| "mean_token_accuracy": 0.9647689509391785, | |
| "num_tokens": 4663099416.0, | |
| "step": 44050 | |
| }, | |
| { | |
| "entropy": 1.2353125, | |
| "epoch": 1.0797708241516086, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 2.852552464878096e-06, | |
| "loss": 0.1626, | |
| "mean_token_accuracy": 0.9649975061416626, | |
| "num_tokens": 4668463403.0, | |
| "step": 44100 | |
| }, | |
| { | |
| "entropy": 1.2309375, | |
| "epoch": 1.0809950541109643, | |
| "grad_norm": 3.515625, | |
| "learning_rate": 2.846487365862872e-06, | |
| "loss": 0.1622, | |
| "mean_token_accuracy": 0.966260347366333, | |
| "num_tokens": 4673588957.0, | |
| "step": 44150 | |
| }, | |
| { | |
| "entropy": 1.2703125, | |
| "epoch": 1.0822192840703198, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 2.840422895877824e-06, | |
| "loss": 0.1829, | |
| "mean_token_accuracy": 0.9611806380748749, | |
| "num_tokens": 4679435999.0, | |
| "step": 44200 | |
| }, | |
| { | |
| "entropy": 1.21671875, | |
| "epoch": 1.0834435140296754, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 2.8343590797725993e-06, | |
| "loss": 0.1595, | |
| "mean_token_accuracy": 0.9657203650474548, | |
| "num_tokens": 4684283427.0, | |
| "step": 44250 | |
| }, | |
| { | |
| "entropy": 1.23546875, | |
| "epoch": 1.0846677439890309, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 2.828295942394163e-06, | |
| "loss": 0.1545, | |
| "mean_token_accuracy": 0.9663613975048065, | |
| "num_tokens": 4689166634.0, | |
| "step": 44300 | |
| }, | |
| { | |
| "entropy": 1.2715625, | |
| "epoch": 1.0858919739483865, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 2.822233508586702e-06, | |
| "loss": 0.1721, | |
| "mean_token_accuracy": 0.9638037300109863, | |
| "num_tokens": 4694728156.0, | |
| "step": 44350 | |
| }, | |
| { | |
| "entropy": 1.246875, | |
| "epoch": 1.087116203907742, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 2.8161718031915194e-06, | |
| "loss": 0.1629, | |
| "mean_token_accuracy": 0.9652890110015869, | |
| "num_tokens": 4700054529.0, | |
| "step": 44400 | |
| }, | |
| { | |
| "entropy": 1.26203125, | |
| "epoch": 1.0883404338670977, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 2.8101108510469308e-06, | |
| "loss": 0.1667, | |
| "mean_token_accuracy": 0.9647334861755371, | |
| "num_tokens": 4705521940.0, | |
| "step": 44450 | |
| }, | |
| { | |
| "entropy": 1.26171875, | |
| "epoch": 1.0895646638264531, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 2.804050676988169e-06, | |
| "loss": 0.1764, | |
| "mean_token_accuracy": 0.9625956809520722, | |
| "num_tokens": 4711285057.0, | |
| "step": 44500 | |
| }, | |
| { | |
| "entropy": 1.26828125, | |
| "epoch": 1.0907888937858088, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 2.797991305847279e-06, | |
| "loss": 0.1695, | |
| "mean_token_accuracy": 0.9635378420352936, | |
| "num_tokens": 4716659220.0, | |
| "step": 44550 | |
| }, | |
| { | |
| "entropy": 1.25296875, | |
| "epoch": 1.0920131237451642, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 2.7919327624530105e-06, | |
| "loss": 0.1589, | |
| "mean_token_accuracy": 0.966244969367981, | |
| "num_tokens": 4721738500.0, | |
| "step": 44600 | |
| }, | |
| { | |
| "entropy": 1.25390625, | |
| "epoch": 1.09323735370452, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 2.7858750716307267e-06, | |
| "loss": 0.1629, | |
| "mean_token_accuracy": 0.9655514645576477, | |
| "num_tokens": 4727007974.0, | |
| "step": 44650 | |
| }, | |
| { | |
| "entropy": 1.261875, | |
| "epoch": 1.0944615836638754, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 2.7798182582022956e-06, | |
| "loss": 0.1666, | |
| "mean_token_accuracy": 0.9647921168804169, | |
| "num_tokens": 4732247570.0, | |
| "step": 44700 | |
| }, | |
| { | |
| "entropy": 1.275, | |
| "epoch": 1.095685813623231, | |
| "grad_norm": 3.0, | |
| "learning_rate": 2.7737623469859904e-06, | |
| "loss": 0.1753, | |
| "mean_token_accuracy": 0.9633481323719024, | |
| "num_tokens": 4737626660.0, | |
| "step": 44750 | |
| }, | |
| { | |
| "entropy": 1.27203125, | |
| "epoch": 1.0969100435825865, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 2.767707362796385e-06, | |
| "loss": 0.1707, | |
| "mean_token_accuracy": 0.9635563850402832, | |
| "num_tokens": 4743127298.0, | |
| "step": 44800 | |
| }, | |
| { | |
| "entropy": 1.2753125, | |
| "epoch": 1.0981342735419422, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 2.7616533304442583e-06, | |
| "loss": 0.1725, | |
| "mean_token_accuracy": 0.9624858343601227, | |
| "num_tokens": 4748930038.0, | |
| "step": 44850 | |
| }, | |
| { | |
| "entropy": 1.251875, | |
| "epoch": 1.0993585035012976, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 2.7556002747364882e-06, | |
| "loss": 0.1618, | |
| "mean_token_accuracy": 0.965050835609436, | |
| "num_tokens": 4754015548.0, | |
| "step": 44900 | |
| }, | |
| { | |
| "entropy": 1.24, | |
| "epoch": 1.1005827334606533, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 2.749548220475947e-06, | |
| "loss": 0.1556, | |
| "mean_token_accuracy": 0.9672428011894226, | |
| "num_tokens": 4759064667.0, | |
| "step": 44950 | |
| }, | |
| { | |
| "entropy": 1.24671875, | |
| "epoch": 1.1018069634200087, | |
| "grad_norm": 3.796875, | |
| "learning_rate": 2.7434971924614085e-06, | |
| "loss": 0.1581, | |
| "mean_token_accuracy": 0.9658971416950226, | |
| "num_tokens": 4764080171.0, | |
| "step": 45000 | |
| }, | |
| { | |
| "entropy": 1.27921875, | |
| "epoch": 1.1030311933793644, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 2.7374472154874396e-06, | |
| "loss": 0.1743, | |
| "mean_token_accuracy": 0.9628953158855438, | |
| "num_tokens": 4769590544.0, | |
| "step": 45050 | |
| }, | |
| { | |
| "entropy": 1.27578125, | |
| "epoch": 1.1042554233387198, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 2.731398314344298e-06, | |
| "loss": 0.172, | |
| "mean_token_accuracy": 0.9631561875343323, | |
| "num_tokens": 4774983478.0, | |
| "step": 45100 | |
| }, | |
| { | |
| "entropy": 1.26796875, | |
| "epoch": 1.1054796532980755, | |
| "grad_norm": 2.625, | |
| "learning_rate": 2.7253505138178363e-06, | |
| "loss": 0.1626, | |
| "mean_token_accuracy": 0.9651547718048096, | |
| "num_tokens": 4780291854.0, | |
| "step": 45150 | |
| }, | |
| { | |
| "entropy": 1.24359375, | |
| "epoch": 1.1067038832574312, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 2.719303838689397e-06, | |
| "loss": 0.1586, | |
| "mean_token_accuracy": 0.9661097753047944, | |
| "num_tokens": 4785746067.0, | |
| "step": 45200 | |
| }, | |
| { | |
| "entropy": 1.27703125, | |
| "epoch": 1.1079281132167866, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 2.7132583137357085e-06, | |
| "loss": 0.173, | |
| "mean_token_accuracy": 0.9634435415267945, | |
| "num_tokens": 4791411988.0, | |
| "step": 45250 | |
| }, | |
| { | |
| "entropy": 1.26703125, | |
| "epoch": 1.1091523431761423, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 2.70721396372879e-06, | |
| "loss": 0.1574, | |
| "mean_token_accuracy": 0.9663924646377563, | |
| "num_tokens": 4796839124.0, | |
| "step": 45300 | |
| }, | |
| { | |
| "entropy": 1.251875, | |
| "epoch": 1.1103765731354978, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 2.7011708134358433e-06, | |
| "loss": 0.1702, | |
| "mean_token_accuracy": 0.963711371421814, | |
| "num_tokens": 4802261281.0, | |
| "step": 45350 | |
| }, | |
| { | |
| "entropy": 1.26109375, | |
| "epoch": 1.1116008030948534, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 2.6951288876191554e-06, | |
| "loss": 0.163, | |
| "mean_token_accuracy": 0.9658736658096313, | |
| "num_tokens": 4807722190.0, | |
| "step": 45400 | |
| }, | |
| { | |
| "entropy": 1.2421875, | |
| "epoch": 1.1128250330542089, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 2.689088211035996e-06, | |
| "loss": 0.1582, | |
| "mean_token_accuracy": 0.9665179479122162, | |
| "num_tokens": 4812528854.0, | |
| "step": 45450 | |
| }, | |
| { | |
| "entropy": 1.26859375, | |
| "epoch": 1.1140492630135646, | |
| "grad_norm": 4.28125, | |
| "learning_rate": 2.6830488084385153e-06, | |
| "loss": 0.1633, | |
| "mean_token_accuracy": 0.9647966718673706, | |
| "num_tokens": 4817654045.0, | |
| "step": 45500 | |
| }, | |
| { | |
| "entropy": 1.276875, | |
| "epoch": 1.11527349297292, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 2.6770107045736457e-06, | |
| "loss": 0.1659, | |
| "mean_token_accuracy": 0.9641125738620758, | |
| "num_tokens": 4823118089.0, | |
| "step": 45550 | |
| }, | |
| { | |
| "entropy": 1.26890625, | |
| "epoch": 1.1164977229322757, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 2.670973924182993e-06, | |
| "loss": 0.1652, | |
| "mean_token_accuracy": 0.965114232301712, | |
| "num_tokens": 4828253691.0, | |
| "step": 45600 | |
| }, | |
| { | |
| "entropy": 1.25203125, | |
| "epoch": 1.1177219528916311, | |
| "grad_norm": 4.03125, | |
| "learning_rate": 2.664938492002745e-06, | |
| "loss": 0.1578, | |
| "mean_token_accuracy": 0.965950778722763, | |
| "num_tokens": 4833456111.0, | |
| "step": 45650 | |
| }, | |
| { | |
| "entropy": 1.27203125, | |
| "epoch": 1.1189461828509868, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 2.658904432763564e-06, | |
| "loss": 0.172, | |
| "mean_token_accuracy": 0.962825288772583, | |
| "num_tokens": 4838982999.0, | |
| "step": 45700 | |
| }, | |
| { | |
| "entropy": 1.2459375, | |
| "epoch": 1.1201704128103422, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 2.6528717711904823e-06, | |
| "loss": 0.1553, | |
| "mean_token_accuracy": 0.9660564112663269, | |
| "num_tokens": 4844057439.0, | |
| "step": 45750 | |
| }, | |
| { | |
| "entropy": 1.2546875, | |
| "epoch": 1.121394642769698, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 2.6468405320028107e-06, | |
| "loss": 0.1758, | |
| "mean_token_accuracy": 0.9631454050540924, | |
| "num_tokens": 4849526204.0, | |
| "step": 45800 | |
| }, | |
| { | |
| "entropy": 1.2471875, | |
| "epoch": 1.1226188727290534, | |
| "grad_norm": 2.375, | |
| "learning_rate": 2.6408107399140297e-06, | |
| "loss": 0.1525, | |
| "mean_token_accuracy": 0.9672383844852448, | |
| "num_tokens": 4854563999.0, | |
| "step": 45850 | |
| }, | |
| { | |
| "entropy": 1.2390625, | |
| "epoch": 1.123843102688409, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 2.6347824196316884e-06, | |
| "loss": 0.1571, | |
| "mean_token_accuracy": 0.9666775286197662, | |
| "num_tokens": 4859889553.0, | |
| "step": 45900 | |
| }, | |
| { | |
| "entropy": 1.251875, | |
| "epoch": 1.1250673326477645, | |
| "grad_norm": 6.1875, | |
| "learning_rate": 2.628755595857308e-06, | |
| "loss": 0.1659, | |
| "mean_token_accuracy": 0.964877005815506, | |
| "num_tokens": 4865439463.0, | |
| "step": 45950 | |
| }, | |
| { | |
| "entropy": 1.26578125, | |
| "epoch": 1.1262915626071202, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 2.622730293286276e-06, | |
| "loss": 0.1663, | |
| "mean_token_accuracy": 0.9647691214084625, | |
| "num_tokens": 4870527275.0, | |
| "step": 46000 | |
| }, | |
| { | |
| "entropy": 1.2690625, | |
| "epoch": 1.1275157925664756, | |
| "grad_norm": 3.84375, | |
| "learning_rate": 2.6167065366077473e-06, | |
| "loss": 0.164, | |
| "mean_token_accuracy": 0.9649512505531311, | |
| "num_tokens": 4875809735.0, | |
| "step": 46050 | |
| }, | |
| { | |
| "entropy": 1.2575, | |
| "epoch": 1.1287400225258313, | |
| "grad_norm": 3.546875, | |
| "learning_rate": 2.6106843505045403e-06, | |
| "loss": 0.1637, | |
| "mean_token_accuracy": 0.9659580600261688, | |
| "num_tokens": 4881072058.0, | |
| "step": 46100 | |
| }, | |
| { | |
| "entropy": 1.2534375, | |
| "epoch": 1.1299642524851867, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 2.6046637596530405e-06, | |
| "loss": 0.1738, | |
| "mean_token_accuracy": 0.9629634070396423, | |
| "num_tokens": 4886211504.0, | |
| "step": 46150 | |
| }, | |
| { | |
| "entropy": 1.255, | |
| "epoch": 1.1311884824445424, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 2.598644788723097e-06, | |
| "loss": 0.1635, | |
| "mean_token_accuracy": 0.964535938501358, | |
| "num_tokens": 4891417957.0, | |
| "step": 46200 | |
| }, | |
| { | |
| "entropy": 1.26296875, | |
| "epoch": 1.132412712403898, | |
| "grad_norm": 2.25, | |
| "learning_rate": 2.5926274623779176e-06, | |
| "loss": 0.1648, | |
| "mean_token_accuracy": 0.9648210310935974, | |
| "num_tokens": 4897027521.0, | |
| "step": 46250 | |
| }, | |
| { | |
| "entropy": 1.2715625, | |
| "epoch": 1.1336369423632535, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 2.5866118052739744e-06, | |
| "loss": 0.1701, | |
| "mean_token_accuracy": 0.9643675744533539, | |
| "num_tokens": 4902630666.0, | |
| "step": 46300 | |
| }, | |
| { | |
| "entropy": 1.2640625, | |
| "epoch": 1.134861172322609, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 2.5805978420608995e-06, | |
| "loss": 0.1588, | |
| "mean_token_accuracy": 0.9654871869087219, | |
| "num_tokens": 4907957609.0, | |
| "step": 46350 | |
| }, | |
| { | |
| "entropy": 1.25765625, | |
| "epoch": 1.1360854022819646, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 2.574585597381383e-06, | |
| "loss": 0.1657, | |
| "mean_token_accuracy": 0.964663782119751, | |
| "num_tokens": 4913108629.0, | |
| "step": 46400 | |
| }, | |
| { | |
| "entropy": 1.26984375, | |
| "epoch": 1.1373096322413203, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 2.5685750958710737e-06, | |
| "loss": 0.1654, | |
| "mean_token_accuracy": 0.9640021121501923, | |
| "num_tokens": 4918622288.0, | |
| "step": 46450 | |
| }, | |
| { | |
| "entropy": 1.2890625, | |
| "epoch": 1.1385338622006758, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 2.5625663621584777e-06, | |
| "loss": 0.1822, | |
| "mean_token_accuracy": 0.9616779792308807, | |
| "num_tokens": 4924224135.0, | |
| "step": 46500 | |
| }, | |
| { | |
| "entropy": 1.2665625, | |
| "epoch": 1.1397580921600314, | |
| "grad_norm": 3.171875, | |
| "learning_rate": 2.5565594208648566e-06, | |
| "loss": 0.1703, | |
| "mean_token_accuracy": 0.9643717563152313, | |
| "num_tokens": 4929573607.0, | |
| "step": 46550 | |
| }, | |
| { | |
| "entropy": 1.2684375, | |
| "epoch": 1.1409823221193869, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 2.5505542966041285e-06, | |
| "loss": 0.1726, | |
| "mean_token_accuracy": 0.9641470229625702, | |
| "num_tokens": 4935198269.0, | |
| "step": 46600 | |
| }, | |
| { | |
| "entropy": 1.2725, | |
| "epoch": 1.1422065520787426, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 2.5445510139827656e-06, | |
| "loss": 0.1731, | |
| "mean_token_accuracy": 0.9628414344787598, | |
| "num_tokens": 4940751379.0, | |
| "step": 46650 | |
| }, | |
| { | |
| "entropy": 1.2690625, | |
| "epoch": 1.143430782038098, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 2.5385495975996952e-06, | |
| "loss": 0.1769, | |
| "mean_token_accuracy": 0.9626391875743866, | |
| "num_tokens": 4946216596.0, | |
| "step": 46700 | |
| }, | |
| { | |
| "entropy": 1.27125, | |
| "epoch": 1.1446550119974537, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 2.532550072046194e-06, | |
| "loss": 0.179, | |
| "mean_token_accuracy": 0.9620010888576508, | |
| "num_tokens": 4951891973.0, | |
| "step": 46750 | |
| }, | |
| { | |
| "entropy": 1.28984375, | |
| "epoch": 1.1458792419568091, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 2.5265524619057936e-06, | |
| "loss": 0.1822, | |
| "mean_token_accuracy": 0.9611503231525421, | |
| "num_tokens": 4957928188.0, | |
| "step": 46800 | |
| }, | |
| { | |
| "entropy": 1.27203125, | |
| "epoch": 1.1471034719161648, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 2.520556791754179e-06, | |
| "loss": 0.1675, | |
| "mean_token_accuracy": 0.9632143163681031, | |
| "num_tokens": 4963189602.0, | |
| "step": 46850 | |
| }, | |
| { | |
| "entropy": 1.2546875, | |
| "epoch": 1.1483277018755202, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 2.5145630861590806e-06, | |
| "loss": 0.1677, | |
| "mean_token_accuracy": 0.9636298882961273, | |
| "num_tokens": 4968384917.0, | |
| "step": 46900 | |
| }, | |
| { | |
| "entropy": 1.234375, | |
| "epoch": 1.149551931834876, | |
| "grad_norm": 2.0, | |
| "learning_rate": 2.5085713696801825e-06, | |
| "loss": 0.1456, | |
| "mean_token_accuracy": 0.9684996688365937, | |
| "num_tokens": 4973304826.0, | |
| "step": 46950 | |
| }, | |
| { | |
| "entropy": 1.2384375, | |
| "epoch": 1.1507761617942314, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 2.5025816668690183e-06, | |
| "loss": 0.1615, | |
| "mean_token_accuracy": 0.9655906355381012, | |
| "num_tokens": 4978583670.0, | |
| "step": 47000 | |
| }, | |
| { | |
| "entropy": 1.23921875, | |
| "epoch": 1.152000391753587, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 2.496594002268869e-06, | |
| "loss": 0.1633, | |
| "mean_token_accuracy": 0.9643825757503509, | |
| "num_tokens": 4983769645.0, | |
| "step": 47050 | |
| }, | |
| { | |
| "entropy": 1.2440625, | |
| "epoch": 1.1532246217129425, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 2.490608400414664e-06, | |
| "loss": 0.1601, | |
| "mean_token_accuracy": 0.9659870672225952, | |
| "num_tokens": 4989133497.0, | |
| "step": 47100 | |
| }, | |
| { | |
| "entropy": 1.2484375, | |
| "epoch": 1.1544488516722982, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 2.484624885832883e-06, | |
| "loss": 0.1618, | |
| "mean_token_accuracy": 0.9654805910587311, | |
| "num_tokens": 4994369533.0, | |
| "step": 47150 | |
| }, | |
| { | |
| "entropy": 1.2490625, | |
| "epoch": 1.1556730816316536, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 2.478643483041449e-06, | |
| "loss": 0.1616, | |
| "mean_token_accuracy": 0.9649089682102203, | |
| "num_tokens": 4999527347.0, | |
| "step": 47200 | |
| }, | |
| { | |
| "entropy": 1.2553125, | |
| "epoch": 1.1568973115910093, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 2.472664216549633e-06, | |
| "loss": 0.1627, | |
| "mean_token_accuracy": 0.9657234275341033, | |
| "num_tokens": 5004961075.0, | |
| "step": 47250 | |
| }, | |
| { | |
| "entropy": 1.24203125, | |
| "epoch": 1.1581215415503647, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 2.466687110857955e-06, | |
| "loss": 0.1533, | |
| "mean_token_accuracy": 0.9676401782035827, | |
| "num_tokens": 5009801621.0, | |
| "step": 47300 | |
| }, | |
| { | |
| "entropy": 1.2534375, | |
| "epoch": 1.1593457715097204, | |
| "grad_norm": 1.875, | |
| "learning_rate": 2.4607121904580796e-06, | |
| "loss": 0.1689, | |
| "mean_token_accuracy": 0.96378169298172, | |
| "num_tokens": 5015019832.0, | |
| "step": 47350 | |
| }, | |
| { | |
| "entropy": 1.285625, | |
| "epoch": 1.1605700014690759, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 2.4547394798327127e-06, | |
| "loss": 0.1824, | |
| "mean_token_accuracy": 0.961477290391922, | |
| "num_tokens": 5020771556.0, | |
| "step": 47400 | |
| }, | |
| { | |
| "entropy": 1.2609375, | |
| "epoch": 1.1617942314284315, | |
| "grad_norm": 0.00445556640625, | |
| "learning_rate": 2.448769003455512e-06, | |
| "loss": 0.1606, | |
| "mean_token_accuracy": 0.9650316751003265, | |
| "num_tokens": 5026174408.0, | |
| "step": 47450 | |
| }, | |
| { | |
| "entropy": 1.24875, | |
| "epoch": 1.1630184613877872, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 2.442800785790977e-06, | |
| "loss": 0.1554, | |
| "mean_token_accuracy": 0.9664806413650513, | |
| "num_tokens": 5031142557.0, | |
| "step": 47500 | |
| }, | |
| { | |
| "entropy": 1.25828125, | |
| "epoch": 1.1642426913471426, | |
| "grad_norm": 2.75, | |
| "learning_rate": 2.436834851294351e-06, | |
| "loss": 0.1731, | |
| "mean_token_accuracy": 0.9635387444496155, | |
| "num_tokens": 5036598656.0, | |
| "step": 47550 | |
| }, | |
| { | |
| "entropy": 1.2440625, | |
| "epoch": 1.165466921306498, | |
| "grad_norm": 3.125, | |
| "learning_rate": 2.4308712244115256e-06, | |
| "loss": 0.1652, | |
| "mean_token_accuracy": 0.9645625805854797, | |
| "num_tokens": 5041932484.0, | |
| "step": 47600 | |
| }, | |
| { | |
| "entropy": 1.23, | |
| "epoch": 1.1666911512658538, | |
| "grad_norm": 4.53125, | |
| "learning_rate": 2.4249099295789315e-06, | |
| "loss": 0.1503, | |
| "mean_token_accuracy": 0.9676901125907897, | |
| "num_tokens": 5047049390.0, | |
| "step": 47650 | |
| }, | |
| { | |
| "entropy": 1.24640625, | |
| "epoch": 1.1679153812252094, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 2.4189509912234475e-06, | |
| "loss": 0.1754, | |
| "mean_token_accuracy": 0.9623109328746796, | |
| "num_tokens": 5052498083.0, | |
| "step": 47700 | |
| }, | |
| { | |
| "entropy": 1.2228125, | |
| "epoch": 1.1691396111845649, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 2.412994433762295e-06, | |
| "loss": 0.1438, | |
| "mean_token_accuracy": 0.9679240989685058, | |
| "num_tokens": 5057358329.0, | |
| "step": 47750 | |
| }, | |
| { | |
| "entropy": 1.236875, | |
| "epoch": 1.1703638411439206, | |
| "grad_norm": 3.390625, | |
| "learning_rate": 2.407040281602942e-06, | |
| "loss": 0.1549, | |
| "mean_token_accuracy": 0.9666338443756104, | |
| "num_tokens": 5062500243.0, | |
| "step": 47800 | |
| }, | |
| { | |
| "entropy": 1.2196875, | |
| "epoch": 1.171588071103276, | |
| "grad_norm": 0.005096435546875, | |
| "learning_rate": 2.4010885591429955e-06, | |
| "loss": 0.1541, | |
| "mean_token_accuracy": 0.9668021559715271, | |
| "num_tokens": 5067435842.0, | |
| "step": 47850 | |
| }, | |
| { | |
| "entropy": 1.25109375, | |
| "epoch": 1.1728123010626317, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 2.3951392907701115e-06, | |
| "loss": 0.1831, | |
| "mean_token_accuracy": 0.9610938668251038, | |
| "num_tokens": 5073063170.0, | |
| "step": 47900 | |
| }, | |
| { | |
| "entropy": 1.24375, | |
| "epoch": 1.1740365310219871, | |
| "grad_norm": 3.328125, | |
| "learning_rate": 2.389192500861888e-06, | |
| "loss": 0.1754, | |
| "mean_token_accuracy": 0.9621718871593475, | |
| "num_tokens": 5078828458.0, | |
| "step": 47950 | |
| }, | |
| { | |
| "entropy": 1.26078125, | |
| "epoch": 1.1752607609813428, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 2.3832482137857685e-06, | |
| "loss": 0.175, | |
| "mean_token_accuracy": 0.9630187213420868, | |
| "num_tokens": 5084161692.0, | |
| "step": 48000 | |
| }, | |
| { | |
| "entropy": 1.2571875, | |
| "epoch": 1.1764849909406982, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 2.377306453898938e-06, | |
| "loss": 0.1689, | |
| "mean_token_accuracy": 0.9643845617771148, | |
| "num_tokens": 5089346169.0, | |
| "step": 48050 | |
| }, | |
| { | |
| "entropy": 1.25125, | |
| "epoch": 1.177709220900054, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 2.3713672455482293e-06, | |
| "loss": 0.1609, | |
| "mean_token_accuracy": 0.9652318274974823, | |
| "num_tokens": 5094622581.0, | |
| "step": 48100 | |
| }, | |
| { | |
| "entropy": 1.24921875, | |
| "epoch": 1.1789334508594094, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 2.36543061307002e-06, | |
| "loss": 0.1611, | |
| "mean_token_accuracy": 0.9650622093677521, | |
| "num_tokens": 5099539248.0, | |
| "step": 48150 | |
| }, | |
| { | |
| "entropy": 1.2584375, | |
| "epoch": 1.180157680818765, | |
| "grad_norm": 4.5, | |
| "learning_rate": 2.35949658079013e-06, | |
| "loss": 0.1693, | |
| "mean_token_accuracy": 0.9631922256946563, | |
| "num_tokens": 5104589567.0, | |
| "step": 48200 | |
| }, | |
| { | |
| "entropy": 1.26328125, | |
| "epoch": 1.1813819107781205, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 2.3535651730237275e-06, | |
| "loss": 0.1613, | |
| "mean_token_accuracy": 0.9661449313163757, | |
| "num_tokens": 5109766096.0, | |
| "step": 48250 | |
| }, | |
| { | |
| "entropy": 1.25484375, | |
| "epoch": 1.1826061407374762, | |
| "grad_norm": 3.125, | |
| "learning_rate": 2.3476364140752266e-06, | |
| "loss": 0.1599, | |
| "mean_token_accuracy": 0.9653767657279968, | |
| "num_tokens": 5114683078.0, | |
| "step": 48300 | |
| }, | |
| { | |
| "entropy": 1.28109375, | |
| "epoch": 1.1838303706968316, | |
| "grad_norm": 3.46875, | |
| "learning_rate": 2.341710328238185e-06, | |
| "loss": 0.1725, | |
| "mean_token_accuracy": 0.9629187500476837, | |
| "num_tokens": 5120172628.0, | |
| "step": 48350 | |
| }, | |
| { | |
| "entropy": 1.2590625, | |
| "epoch": 1.1850546006561873, | |
| "grad_norm": 2.125, | |
| "learning_rate": 2.335786939795209e-06, | |
| "loss": 0.1574, | |
| "mean_token_accuracy": 0.966355732679367, | |
| "num_tokens": 5125111521.0, | |
| "step": 48400 | |
| }, | |
| { | |
| "entropy": 1.2721875, | |
| "epoch": 1.1862788306155427, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 2.3298662730178536e-06, | |
| "loss": 0.1635, | |
| "mean_token_accuracy": 0.9648284649848938, | |
| "num_tokens": 5130646209.0, | |
| "step": 48450 | |
| }, | |
| { | |
| "entropy": 1.2484375, | |
| "epoch": 1.1875030605748984, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 2.3239483521665165e-06, | |
| "loss": 0.1529, | |
| "mean_token_accuracy": 0.9668037176132203, | |
| "num_tokens": 5135665531.0, | |
| "step": 48500 | |
| }, | |
| { | |
| "entropy": 1.25546875, | |
| "epoch": 1.188727290534254, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 2.31803320149035e-06, | |
| "loss": 0.1674, | |
| "mean_token_accuracy": 0.9642703318595887, | |
| "num_tokens": 5140993137.0, | |
| "step": 48550 | |
| }, | |
| { | |
| "entropy": 1.2759375, | |
| "epoch": 1.1899515204936095, | |
| "grad_norm": 1.875, | |
| "learning_rate": 2.312120845227151e-06, | |
| "loss": 0.1682, | |
| "mean_token_accuracy": 0.9635923814773559, | |
| "num_tokens": 5146394110.0, | |
| "step": 48600 | |
| }, | |
| { | |
| "entropy": 1.269375, | |
| "epoch": 1.191175750452965, | |
| "grad_norm": 2.125, | |
| "learning_rate": 2.306211307603269e-06, | |
| "loss": 0.1603, | |
| "mean_token_accuracy": 0.9650293779373169, | |
| "num_tokens": 5151444447.0, | |
| "step": 48650 | |
| }, | |
| { | |
| "entropy": 1.2778125, | |
| "epoch": 1.1923999804123206, | |
| "grad_norm": 3.484375, | |
| "learning_rate": 2.3003046128335004e-06, | |
| "loss": 0.1725, | |
| "mean_token_accuracy": 0.962925443649292, | |
| "num_tokens": 5157164016.0, | |
| "step": 48700 | |
| }, | |
| { | |
| "entropy": 1.2559375, | |
| "epoch": 1.1936242103716763, | |
| "grad_norm": 0.00262451171875, | |
| "learning_rate": 2.2944007851209967e-06, | |
| "loss": 0.1555, | |
| "mean_token_accuracy": 0.9663327503204345, | |
| "num_tokens": 5162287319.0, | |
| "step": 48750 | |
| }, | |
| { | |
| "entropy": 1.25578125, | |
| "epoch": 1.1948484403310318, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 2.2884998486571587e-06, | |
| "loss": 0.1623, | |
| "mean_token_accuracy": 0.9643605947494507, | |
| "num_tokens": 5167697788.0, | |
| "step": 48800 | |
| }, | |
| { | |
| "entropy": 1.2621875, | |
| "epoch": 1.1960726702903874, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 2.2826018276215404e-06, | |
| "loss": 0.1641, | |
| "mean_token_accuracy": 0.9648311936855316, | |
| "num_tokens": 5172726413.0, | |
| "step": 48850 | |
| }, | |
| { | |
| "entropy": 1.256875, | |
| "epoch": 1.197296900249743, | |
| "grad_norm": 3.9375, | |
| "learning_rate": 2.276706746181751e-06, | |
| "loss": 0.1647, | |
| "mean_token_accuracy": 0.9653891062736512, | |
| "num_tokens": 5177807515.0, | |
| "step": 48900 | |
| }, | |
| { | |
| "entropy": 1.24484375, | |
| "epoch": 1.1985211302090986, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 2.2708146284933544e-06, | |
| "loss": 0.1491, | |
| "mean_token_accuracy": 0.9672402215003967, | |
| "num_tokens": 5182682002.0, | |
| "step": 48950 | |
| }, | |
| { | |
| "entropy": 1.2434375, | |
| "epoch": 1.199745360168454, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 2.2649254986997666e-06, | |
| "loss": 0.1625, | |
| "mean_token_accuracy": 0.9646528875827789, | |
| "num_tokens": 5187927187.0, | |
| "step": 49000 | |
| }, | |
| { | |
| "entropy": 1.26171875, | |
| "epoch": 1.2009695901278097, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 2.2590393809321657e-06, | |
| "loss": 0.1601, | |
| "mean_token_accuracy": 0.9654495012760163, | |
| "num_tokens": 5192885819.0, | |
| "step": 49050 | |
| }, | |
| { | |
| "entropy": 1.26296875, | |
| "epoch": 1.2021938200871651, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 2.2531562993093854e-06, | |
| "loss": 0.1631, | |
| "mean_token_accuracy": 0.9647388279438018, | |
| "num_tokens": 5198240652.0, | |
| "step": 49100 | |
| }, | |
| { | |
| "entropy": 1.23265625, | |
| "epoch": 1.2034180500465208, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 2.247276277937817e-06, | |
| "loss": 0.1537, | |
| "mean_token_accuracy": 0.966611897945404, | |
| "num_tokens": 5203287957.0, | |
| "step": 49150 | |
| }, | |
| { | |
| "entropy": 1.24109375, | |
| "epoch": 1.2046422800058763, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 2.241399340911315e-06, | |
| "loss": 0.1582, | |
| "mean_token_accuracy": 0.9648150885105133, | |
| "num_tokens": 5208259781.0, | |
| "step": 49200 | |
| }, | |
| { | |
| "entropy": 1.22828125, | |
| "epoch": 1.205866509965232, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 2.235525512311094e-06, | |
| "loss": 0.1659, | |
| "mean_token_accuracy": 0.9645445287227631, | |
| "num_tokens": 5213559098.0, | |
| "step": 49250 | |
| }, | |
| { | |
| "entropy": 1.23921875, | |
| "epoch": 1.2070907399245874, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 2.229654816205632e-06, | |
| "loss": 0.1694, | |
| "mean_token_accuracy": 0.9639151406288147, | |
| "num_tokens": 5218710994.0, | |
| "step": 49300 | |
| }, | |
| { | |
| "entropy": 1.2425, | |
| "epoch": 1.208314969883943, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 2.2237872766505715e-06, | |
| "loss": 0.1676, | |
| "mean_token_accuracy": 0.9631175470352172, | |
| "num_tokens": 5224096915.0, | |
| "step": 49350 | |
| }, | |
| { | |
| "entropy": 1.25015625, | |
| "epoch": 1.2095391998432985, | |
| "grad_norm": 3.71875, | |
| "learning_rate": 2.2179229176886196e-06, | |
| "loss": 0.1731, | |
| "mean_token_accuracy": 0.9628188860416412, | |
| "num_tokens": 5229833600.0, | |
| "step": 49400 | |
| }, | |
| { | |
| "entropy": 1.24265625, | |
| "epoch": 1.2107634298026542, | |
| "grad_norm": 4.125, | |
| "learning_rate": 2.212061763349454e-06, | |
| "loss": 0.1616, | |
| "mean_token_accuracy": 0.9654302883148194, | |
| "num_tokens": 5235131114.0, | |
| "step": 49450 | |
| }, | |
| { | |
| "entropy": 1.23765625, | |
| "epoch": 1.2119876597620096, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 2.206203837649615e-06, | |
| "loss": 0.1555, | |
| "mean_token_accuracy": 0.9665101909637451, | |
| "num_tokens": 5240317138.0, | |
| "step": 49500 | |
| }, | |
| { | |
| "entropy": 1.24921875, | |
| "epoch": 1.2132118897213653, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 2.2003491645924195e-06, | |
| "loss": 0.1715, | |
| "mean_token_accuracy": 0.9628171730041504, | |
| "num_tokens": 5245861371.0, | |
| "step": 49550 | |
| }, | |
| { | |
| "entropy": 1.24640625, | |
| "epoch": 1.2144361196807207, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 2.194497768167855e-06, | |
| "loss": 0.1703, | |
| "mean_token_accuracy": 0.9627651238441467, | |
| "num_tokens": 5251350220.0, | |
| "step": 49600 | |
| }, | |
| { | |
| "entropy": 1.2528125, | |
| "epoch": 1.2156603496400764, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 2.188649672352479e-06, | |
| "loss": 0.1707, | |
| "mean_token_accuracy": 0.964025752544403, | |
| "num_tokens": 5256995465.0, | |
| "step": 49650 | |
| }, | |
| { | |
| "entropy": 1.25234375, | |
| "epoch": 1.2168845795994319, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 2.1828049011093286e-06, | |
| "loss": 0.1702, | |
| "mean_token_accuracy": 0.9648704588413238, | |
| "num_tokens": 5262286472.0, | |
| "step": 49700 | |
| }, | |
| { | |
| "entropy": 1.241875, | |
| "epoch": 1.2181088095587875, | |
| "grad_norm": 3.578125, | |
| "learning_rate": 2.1769634783878182e-06, | |
| "loss": 0.1579, | |
| "mean_token_accuracy": 0.9658465564250946, | |
| "num_tokens": 5267436922.0, | |
| "step": 49750 | |
| }, | |
| { | |
| "entropy": 1.26, | |
| "epoch": 1.2193330395181432, | |
| "grad_norm": 3.453125, | |
| "learning_rate": 2.1711254281236373e-06, | |
| "loss": 0.1804, | |
| "mean_token_accuracy": 0.9622203695774079, | |
| "num_tokens": 5273103073.0, | |
| "step": 49800 | |
| }, | |
| { | |
| "entropy": 1.258125, | |
| "epoch": 1.2205572694774987, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 2.1652907742386613e-06, | |
| "loss": 0.178, | |
| "mean_token_accuracy": 0.9619389712810517, | |
| "num_tokens": 5278483949.0, | |
| "step": 49850 | |
| }, | |
| { | |
| "entropy": 1.24796875, | |
| "epoch": 1.221781499436854, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 2.159459540640847e-06, | |
| "loss": 0.161, | |
| "mean_token_accuracy": 0.9660306286811828, | |
| "num_tokens": 5283427597.0, | |
| "step": 49900 | |
| }, | |
| { | |
| "entropy": 1.27984375, | |
| "epoch": 1.2230057293962098, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 2.1536317512241348e-06, | |
| "loss": 0.1777, | |
| "mean_token_accuracy": 0.9623690032958985, | |
| "num_tokens": 5288987030.0, | |
| "step": 49950 | |
| }, | |
| { | |
| "entropy": 1.2584375, | |
| "epoch": 1.2242299593555654, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 2.147807429868352e-06, | |
| "loss": 0.1658, | |
| "mean_token_accuracy": 0.9644541823863984, | |
| "num_tokens": 5294529728.0, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 1.2242299593555654, | |
| "eval_entropy": 1.2479817708333334, | |
| "eval_loss": 0.17940963804721832, | |
| "eval_mean_token_accuracy": 0.9616454169154167, | |
| "eval_num_tokens": 5294529728.0, | |
| "eval_runtime": 604.376, | |
| "eval_samples_per_second": 15.977, | |
| "eval_steps_per_second": 0.2, | |
| "step": 50000 | |
| }, | |
| { | |
| "entropy": 1.2259375, | |
| "epoch": 1.225454189314921, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 2.141986600439119e-06, | |
| "loss": 0.153, | |
| "mean_token_accuracy": 0.9670542335510254, | |
| "num_tokens": 5299381949.0, | |
| "step": 50050 | |
| }, | |
| { | |
| "entropy": 1.26140625, | |
| "epoch": 1.2266784192742766, | |
| "grad_norm": 2.625, | |
| "learning_rate": 2.1361692867877455e-06, | |
| "loss": 0.1754, | |
| "mean_token_accuracy": 0.9621517550945282, | |
| "num_tokens": 5304936166.0, | |
| "step": 50100 | |
| }, | |
| { | |
| "entropy": 1.24140625, | |
| "epoch": 1.227902649233632, | |
| "grad_norm": 0.00927734375, | |
| "learning_rate": 2.1303555127511327e-06, | |
| "loss": 0.1545, | |
| "mean_token_accuracy": 0.96613614320755, | |
| "num_tokens": 5310169155.0, | |
| "step": 50150 | |
| }, | |
| { | |
| "entropy": 1.261875, | |
| "epoch": 1.2291268791929877, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 2.124545302151681e-06, | |
| "loss": 0.1693, | |
| "mean_token_accuracy": 0.9642032277584076, | |
| "num_tokens": 5315607723.0, | |
| "step": 50200 | |
| }, | |
| { | |
| "entropy": 1.26796875, | |
| "epoch": 1.2303511091523431, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 2.118738678797191e-06, | |
| "loss": 0.1677, | |
| "mean_token_accuracy": 0.9641611945629119, | |
| "num_tokens": 5321112342.0, | |
| "step": 50250 | |
| }, | |
| { | |
| "entropy": 1.25578125, | |
| "epoch": 1.2315753391116988, | |
| "grad_norm": 3.6875, | |
| "learning_rate": 2.112935666480758e-06, | |
| "loss": 0.1583, | |
| "mean_token_accuracy": 0.965636430978775, | |
| "num_tokens": 5326352547.0, | |
| "step": 50300 | |
| }, | |
| { | |
| "entropy": 1.26484375, | |
| "epoch": 1.2327995690710543, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 2.1071362889806863e-06, | |
| "loss": 0.1729, | |
| "mean_token_accuracy": 0.963402829170227, | |
| "num_tokens": 5331870603.0, | |
| "step": 50350 | |
| }, | |
| { | |
| "entropy": 1.27546875, | |
| "epoch": 1.23402379903041, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 2.101340570060385e-06, | |
| "loss": 0.1711, | |
| "mean_token_accuracy": 0.9636083686351776, | |
| "num_tokens": 5337306717.0, | |
| "step": 50400 | |
| }, | |
| { | |
| "entropy": 1.24609375, | |
| "epoch": 1.2352480289897654, | |
| "grad_norm": 4.125, | |
| "learning_rate": 2.09554853346827e-06, | |
| "loss": 0.1558, | |
| "mean_token_accuracy": 0.9663618934154511, | |
| "num_tokens": 5342628594.0, | |
| "step": 50450 | |
| }, | |
| { | |
| "entropy": 1.2796875, | |
| "epoch": 1.236472258949121, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 2.089760202937671e-06, | |
| "loss": 0.1711, | |
| "mean_token_accuracy": 0.9637987637519836, | |
| "num_tokens": 5348316678.0, | |
| "step": 50500 | |
| }, | |
| { | |
| "entropy": 1.2409375, | |
| "epoch": 1.2376964889084765, | |
| "grad_norm": 4.09375, | |
| "learning_rate": 2.0839756021867306e-06, | |
| "loss": 0.1499, | |
| "mean_token_accuracy": 0.967620609998703, | |
| "num_tokens": 5353095952.0, | |
| "step": 50550 | |
| }, | |
| { | |
| "entropy": 1.26671875, | |
| "epoch": 1.2389207188678322, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 2.07819475491831e-06, | |
| "loss": 0.1675, | |
| "mean_token_accuracy": 0.9643842697143554, | |
| "num_tokens": 5358561384.0, | |
| "step": 50600 | |
| }, | |
| { | |
| "entropy": 1.25734375, | |
| "epoch": 1.2401449488271876, | |
| "grad_norm": 3.546875, | |
| "learning_rate": 2.0724176848198856e-06, | |
| "loss": 0.1578, | |
| "mean_token_accuracy": 0.9659811770915985, | |
| "num_tokens": 5363968041.0, | |
| "step": 50650 | |
| }, | |
| { | |
| "entropy": 1.2559375, | |
| "epoch": 1.2413691787865433, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 2.0666444155634613e-06, | |
| "loss": 0.1678, | |
| "mean_token_accuracy": 0.9649008166790009, | |
| "num_tokens": 5369138043.0, | |
| "step": 50700 | |
| }, | |
| { | |
| "entropy": 1.2790625, | |
| "epoch": 1.2425934087458987, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 2.0608749708054666e-06, | |
| "loss": 0.1717, | |
| "mean_token_accuracy": 0.9624824106693268, | |
| "num_tokens": 5374681050.0, | |
| "step": 50750 | |
| }, | |
| { | |
| "entropy": 1.274375, | |
| "epoch": 1.2438176387052544, | |
| "grad_norm": 3.5, | |
| "learning_rate": 2.0551093741866555e-06, | |
| "loss": 0.1653, | |
| "mean_token_accuracy": 0.964318573474884, | |
| "num_tokens": 5379930328.0, | |
| "step": 50800 | |
| }, | |
| { | |
| "entropy": 1.2709375, | |
| "epoch": 1.24504186866461, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 2.0493476493320182e-06, | |
| "loss": 0.1639, | |
| "mean_token_accuracy": 0.9642879796028138, | |
| "num_tokens": 5385290824.0, | |
| "step": 50850 | |
| }, | |
| { | |
| "entropy": 1.27515625, | |
| "epoch": 1.2462660986239655, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 2.043589819850679e-06, | |
| "loss": 0.1784, | |
| "mean_token_accuracy": 0.9621766293048859, | |
| "num_tokens": 5390915687.0, | |
| "step": 50900 | |
| }, | |
| { | |
| "entropy": 1.26828125, | |
| "epoch": 1.247490328583321, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 2.037835909335799e-06, | |
| "loss": 0.1653, | |
| "mean_token_accuracy": 0.9644598591327668, | |
| "num_tokens": 5396364664.0, | |
| "step": 50950 | |
| }, | |
| { | |
| "entropy": 1.23453125, | |
| "epoch": 1.2487145585426767, | |
| "grad_norm": 3.4375, | |
| "learning_rate": 2.032085941364483e-06, | |
| "loss": 0.1475, | |
| "mean_token_accuracy": 0.9683002579212189, | |
| "num_tokens": 5401284379.0, | |
| "step": 51000 | |
| }, | |
| { | |
| "entropy": 1.264375, | |
| "epoch": 1.2499387885020323, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 2.026339939497681e-06, | |
| "loss": 0.1672, | |
| "mean_token_accuracy": 0.9641962945461273, | |
| "num_tokens": 5406818098.0, | |
| "step": 51050 | |
| }, | |
| { | |
| "entropy": 1.23828125, | |
| "epoch": 1.2511630184613878, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 2.020597927280089e-06, | |
| "loss": 0.1498, | |
| "mean_token_accuracy": 0.9685159015655518, | |
| "num_tokens": 5411689647.0, | |
| "step": 51100 | |
| }, | |
| { | |
| "entropy": 1.24640625, | |
| "epoch": 1.2523872484207432, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 2.014859928240058e-06, | |
| "loss": 0.1583, | |
| "mean_token_accuracy": 0.9665188646316528, | |
| "num_tokens": 5416677115.0, | |
| "step": 51150 | |
| }, | |
| { | |
| "entropy": 1.24796875, | |
| "epoch": 1.253611478380099, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 2.0091259658894926e-06, | |
| "loss": 0.1525, | |
| "mean_token_accuracy": 0.9675477313995361, | |
| "num_tokens": 5422071895.0, | |
| "step": 51200 | |
| }, | |
| { | |
| "entropy": 1.27703125, | |
| "epoch": 1.2548357083394546, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 2.00339606372376e-06, | |
| "loss": 0.1796, | |
| "mean_token_accuracy": 0.9615858125686646, | |
| "num_tokens": 5427896152.0, | |
| "step": 51250 | |
| }, | |
| { | |
| "entropy": 1.25203125, | |
| "epoch": 1.25605993829881, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 1.9976702452215846e-06, | |
| "loss": 0.1615, | |
| "mean_token_accuracy": 0.9655699288845062, | |
| "num_tokens": 5432956715.0, | |
| "step": 51300 | |
| }, | |
| { | |
| "entropy": 1.25671875, | |
| "epoch": 1.2572841682581657, | |
| "grad_norm": 2.5, | |
| "learning_rate": 1.9919485338449633e-06, | |
| "loss": 0.1669, | |
| "mean_token_accuracy": 0.963955899477005, | |
| "num_tokens": 5438521726.0, | |
| "step": 51350 | |
| }, | |
| { | |
| "entropy": 1.25890625, | |
| "epoch": 1.2585083982175211, | |
| "grad_norm": 3.671875, | |
| "learning_rate": 1.9862309530390627e-06, | |
| "loss": 0.1604, | |
| "mean_token_accuracy": 0.9649885761737823, | |
| "num_tokens": 5443663826.0, | |
| "step": 51400 | |
| }, | |
| { | |
| "entropy": 1.25375, | |
| "epoch": 1.2597326281768768, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 1.98051752623212e-06, | |
| "loss": 0.1607, | |
| "mean_token_accuracy": 0.9659333276748657, | |
| "num_tokens": 5448801306.0, | |
| "step": 51450 | |
| }, | |
| { | |
| "entropy": 1.26546875, | |
| "epoch": 1.2609568581362323, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.9748082768353554e-06, | |
| "loss": 0.1624, | |
| "mean_token_accuracy": 0.9649898850917816, | |
| "num_tokens": 5454048809.0, | |
| "step": 51500 | |
| }, | |
| { | |
| "entropy": 1.2559375, | |
| "epoch": 1.262181088095588, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 1.969103228242872e-06, | |
| "loss": 0.1671, | |
| "mean_token_accuracy": 0.9636943113803863, | |
| "num_tokens": 5459063221.0, | |
| "step": 51550 | |
| }, | |
| { | |
| "entropy": 1.26359375, | |
| "epoch": 1.2634053180549434, | |
| "grad_norm": 0.01025390625, | |
| "learning_rate": 1.9634024038315556e-06, | |
| "loss": 0.1555, | |
| "mean_token_accuracy": 0.9668670952320099, | |
| "num_tokens": 5464218533.0, | |
| "step": 51600 | |
| }, | |
| { | |
| "entropy": 1.26984375, | |
| "epoch": 1.264629548014299, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.9577058269609873e-06, | |
| "loss": 0.1677, | |
| "mean_token_accuracy": 0.9646493744850159, | |
| "num_tokens": 5469633751.0, | |
| "step": 51650 | |
| }, | |
| { | |
| "entropy": 1.26015625, | |
| "epoch": 1.2658537779736545, | |
| "grad_norm": 2.5, | |
| "learning_rate": 1.9520135209733434e-06, | |
| "loss": 0.1548, | |
| "mean_token_accuracy": 0.9670298910140991, | |
| "num_tokens": 5474658175.0, | |
| "step": 51700 | |
| }, | |
| { | |
| "entropy": 1.24671875, | |
| "epoch": 1.2670780079330102, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 1.9463255091932946e-06, | |
| "loss": 0.168, | |
| "mean_token_accuracy": 0.9642450773715973, | |
| "num_tokens": 5480009732.0, | |
| "step": 51750 | |
| }, | |
| { | |
| "entropy": 1.25875, | |
| "epoch": 1.2683022378923656, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 1.9406418149279224e-06, | |
| "loss": 0.1667, | |
| "mean_token_accuracy": 0.9646876096725464, | |
| "num_tokens": 5485352642.0, | |
| "step": 51800 | |
| }, | |
| { | |
| "entropy": 1.25078125, | |
| "epoch": 1.2695264678517213, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.9349624614666137e-06, | |
| "loss": 0.1599, | |
| "mean_token_accuracy": 0.9663380241394043, | |
| "num_tokens": 5490516069.0, | |
| "step": 51850 | |
| }, | |
| { | |
| "entropy": 1.2540625, | |
| "epoch": 1.270750697811077, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.9292874720809706e-06, | |
| "loss": 0.1691, | |
| "mean_token_accuracy": 0.9637067282199859, | |
| "num_tokens": 5495858878.0, | |
| "step": 51900 | |
| }, | |
| { | |
| "entropy": 1.2459375, | |
| "epoch": 1.2719749277704324, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.9236168700247085e-06, | |
| "loss": 0.1597, | |
| "mean_token_accuracy": 0.9652304399013519, | |
| "num_tokens": 5500992334.0, | |
| "step": 51950 | |
| }, | |
| { | |
| "entropy": 1.26390625, | |
| "epoch": 1.2731991577297879, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.9179506785335695e-06, | |
| "loss": 0.1784, | |
| "mean_token_accuracy": 0.9612833940982819, | |
| "num_tokens": 5506364973.0, | |
| "step": 52000 | |
| }, | |
| { | |
| "entropy": 1.2540625, | |
| "epoch": 1.2744233876891435, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 1.912288920825224e-06, | |
| "loss": 0.1668, | |
| "mean_token_accuracy": 0.9639379584789276, | |
| "num_tokens": 5511847363.0, | |
| "step": 52050 | |
| }, | |
| { | |
| "entropy": 1.26140625, | |
| "epoch": 1.2756476176484992, | |
| "grad_norm": 3.59375, | |
| "learning_rate": 1.9066316200991702e-06, | |
| "loss": 0.1739, | |
| "mean_token_accuracy": 0.9622644722461701, | |
| "num_tokens": 5517402202.0, | |
| "step": 52100 | |
| }, | |
| { | |
| "entropy": 1.23109375, | |
| "epoch": 1.2768718476078547, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 1.9009787995366464e-06, | |
| "loss": 0.1571, | |
| "mean_token_accuracy": 0.9665352630615235, | |
| "num_tokens": 5522479618.0, | |
| "step": 52150 | |
| }, | |
| { | |
| "entropy": 1.2396875, | |
| "epoch": 1.27809607756721, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.8953304823005346e-06, | |
| "loss": 0.159, | |
| "mean_token_accuracy": 0.965977475643158, | |
| "num_tokens": 5527761846.0, | |
| "step": 52200 | |
| }, | |
| { | |
| "entropy": 1.24140625, | |
| "epoch": 1.2793203075265658, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 1.889686691535259e-06, | |
| "loss": 0.1713, | |
| "mean_token_accuracy": 0.9641374492645264, | |
| "num_tokens": 5533078395.0, | |
| "step": 52250 | |
| }, | |
| { | |
| "entropy": 1.22796875, | |
| "epoch": 1.2805445374859215, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.8840474503667003e-06, | |
| "loss": 0.1613, | |
| "mean_token_accuracy": 0.96567800283432, | |
| "num_tokens": 5538079639.0, | |
| "step": 52300 | |
| }, | |
| { | |
| "entropy": 1.233125, | |
| "epoch": 1.281768767445277, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 1.8784127819020977e-06, | |
| "loss": 0.1696, | |
| "mean_token_accuracy": 0.9639940130710601, | |
| "num_tokens": 5543060468.0, | |
| "step": 52350 | |
| }, | |
| { | |
| "entropy": 1.23828125, | |
| "epoch": 1.2829929974046324, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.8727827092299486e-06, | |
| "loss": 0.1713, | |
| "mean_token_accuracy": 0.9634285986423492, | |
| "num_tokens": 5548455628.0, | |
| "step": 52400 | |
| }, | |
| { | |
| "entropy": 1.2671875, | |
| "epoch": 1.284217227363988, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.8671572554199227e-06, | |
| "loss": 0.1745, | |
| "mean_token_accuracy": 0.9630351853370667, | |
| "num_tokens": 5554243712.0, | |
| "step": 52450 | |
| }, | |
| { | |
| "entropy": 1.256875, | |
| "epoch": 1.2854414573233437, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 1.8615364435227627e-06, | |
| "loss": 0.1713, | |
| "mean_token_accuracy": 0.9632880544662475, | |
| "num_tokens": 5559645728.0, | |
| "step": 52500 | |
| }, | |
| { | |
| "entropy": 1.25578125, | |
| "epoch": 1.2866656872826991, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 1.8559202965701921e-06, | |
| "loss": 0.1729, | |
| "mean_token_accuracy": 0.9628579890727997, | |
| "num_tokens": 5565441017.0, | |
| "step": 52550 | |
| }, | |
| { | |
| "entropy": 1.241875, | |
| "epoch": 1.2878899172420548, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 1.850308837574815e-06, | |
| "loss": 0.1567, | |
| "mean_token_accuracy": 0.9662058663368225, | |
| "num_tokens": 5570548727.0, | |
| "step": 52600 | |
| }, | |
| { | |
| "entropy": 1.2465625, | |
| "epoch": 1.2891141472014103, | |
| "grad_norm": 3.671875, | |
| "learning_rate": 1.8447020895300304e-06, | |
| "loss": 0.1627, | |
| "mean_token_accuracy": 0.9654901123046875, | |
| "num_tokens": 5575812384.0, | |
| "step": 52650 | |
| }, | |
| { | |
| "entropy": 1.26609375, | |
| "epoch": 1.290338377160766, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 1.8391000754099329e-06, | |
| "loss": 0.1704, | |
| "mean_token_accuracy": 0.9641706418991088, | |
| "num_tokens": 5581119333.0, | |
| "step": 52700 | |
| }, | |
| { | |
| "entropy": 1.25296875, | |
| "epoch": 1.2915626071201214, | |
| "grad_norm": 3.484375, | |
| "learning_rate": 1.8335028181692183e-06, | |
| "loss": 0.1591, | |
| "mean_token_accuracy": 0.9657709896564484, | |
| "num_tokens": 5586146551.0, | |
| "step": 52750 | |
| }, | |
| { | |
| "entropy": 1.26609375, | |
| "epoch": 1.292786837079477, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.8279103407430918e-06, | |
| "loss": 0.1682, | |
| "mean_token_accuracy": 0.9645370328426361, | |
| "num_tokens": 5591535827.0, | |
| "step": 52800 | |
| }, | |
| { | |
| "entropy": 1.24609375, | |
| "epoch": 1.2940110670388325, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 1.822322666047173e-06, | |
| "loss": 0.156, | |
| "mean_token_accuracy": 0.966865359544754, | |
| "num_tokens": 5596513224.0, | |
| "step": 52850 | |
| }, | |
| { | |
| "entropy": 1.23125, | |
| "epoch": 1.2952352969981882, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 1.8167398169774003e-06, | |
| "loss": 0.1562, | |
| "mean_token_accuracy": 0.9663991129398346, | |
| "num_tokens": 5601409756.0, | |
| "step": 52900 | |
| }, | |
| { | |
| "entropy": 1.24203125, | |
| "epoch": 1.2964595269575436, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.8111618164099405e-06, | |
| "loss": 0.1586, | |
| "mean_token_accuracy": 0.965841782093048, | |
| "num_tokens": 5606579901.0, | |
| "step": 52950 | |
| }, | |
| { | |
| "entropy": 1.25640625, | |
| "epoch": 1.2976837569168993, | |
| "grad_norm": 4.40625, | |
| "learning_rate": 1.805588687201094e-06, | |
| "loss": 0.1551, | |
| "mean_token_accuracy": 0.9661786913871765, | |
| "num_tokens": 5611890254.0, | |
| "step": 53000 | |
| }, | |
| { | |
| "entropy": 1.27453125, | |
| "epoch": 1.2989079868762547, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 1.8000204521871968e-06, | |
| "loss": 0.1736, | |
| "mean_token_accuracy": 0.9631719040870667, | |
| "num_tokens": 5617317192.0, | |
| "step": 53050 | |
| }, | |
| { | |
| "entropy": 1.254375, | |
| "epoch": 1.3001322168356104, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.7944571341845338e-06, | |
| "loss": 0.1735, | |
| "mean_token_accuracy": 0.9628773295879364, | |
| "num_tokens": 5622759860.0, | |
| "step": 53100 | |
| }, | |
| { | |
| "entropy": 1.251875, | |
| "epoch": 1.301356446794966, | |
| "grad_norm": 3.328125, | |
| "learning_rate": 1.788898755989241e-06, | |
| "loss": 0.1544, | |
| "mean_token_accuracy": 0.966829891204834, | |
| "num_tokens": 5628009830.0, | |
| "step": 53150 | |
| }, | |
| { | |
| "entropy": 1.2346875, | |
| "epoch": 1.3025806767543215, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.7833453403772148e-06, | |
| "loss": 0.1496, | |
| "mean_token_accuracy": 0.9679068636894226, | |
| "num_tokens": 5633028331.0, | |
| "step": 53200 | |
| }, | |
| { | |
| "entropy": 1.23625, | |
| "epoch": 1.303804906713677, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 1.7777969101040137e-06, | |
| "loss": 0.1598, | |
| "mean_token_accuracy": 0.9658224785327911, | |
| "num_tokens": 5638192081.0, | |
| "step": 53250 | |
| }, | |
| { | |
| "entropy": 1.25921875, | |
| "epoch": 1.3050291366730327, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.7722534879047704e-06, | |
| "loss": 0.1679, | |
| "mean_token_accuracy": 0.9648814105987549, | |
| "num_tokens": 5643678649.0, | |
| "step": 53300 | |
| }, | |
| { | |
| "entropy": 1.23703125, | |
| "epoch": 1.3062533666323883, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.7667150964940981e-06, | |
| "loss": 0.1542, | |
| "mean_token_accuracy": 0.9665197932720184, | |
| "num_tokens": 5648865610.0, | |
| "step": 53350 | |
| }, | |
| { | |
| "entropy": 1.2546875, | |
| "epoch": 1.3074775965917438, | |
| "grad_norm": 3.46875, | |
| "learning_rate": 1.7611817585659915e-06, | |
| "loss": 0.1695, | |
| "mean_token_accuracy": 0.96389883518219, | |
| "num_tokens": 5654452208.0, | |
| "step": 53400 | |
| }, | |
| { | |
| "entropy": 1.23046875, | |
| "epoch": 1.3087018265510992, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 1.7556534967937428e-06, | |
| "loss": 0.1477, | |
| "mean_token_accuracy": 0.967578010559082, | |
| "num_tokens": 5659553855.0, | |
| "step": 53450 | |
| }, | |
| { | |
| "entropy": 1.2696875, | |
| "epoch": 1.309926056510455, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 1.750130333829843e-06, | |
| "loss": 0.174, | |
| "mean_token_accuracy": 0.9626197755336762, | |
| "num_tokens": 5665208689.0, | |
| "step": 53500 | |
| }, | |
| { | |
| "entropy": 1.230625, | |
| "epoch": 1.3111502864698106, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 1.744612292305887e-06, | |
| "loss": 0.1488, | |
| "mean_token_accuracy": 0.9678456223011017, | |
| "num_tokens": 5670219320.0, | |
| "step": 53550 | |
| }, | |
| { | |
| "entropy": 1.26109375, | |
| "epoch": 1.312374516429166, | |
| "grad_norm": 3.46875, | |
| "learning_rate": 1.73909939483249e-06, | |
| "loss": 0.176, | |
| "mean_token_accuracy": 0.9616470074653626, | |
| "num_tokens": 5676005681.0, | |
| "step": 53600 | |
| }, | |
| { | |
| "entropy": 1.23359375, | |
| "epoch": 1.3135987463885217, | |
| "grad_norm": 3.46875, | |
| "learning_rate": 1.7335916639991833e-06, | |
| "loss": 0.1579, | |
| "mean_token_accuracy": 0.9656192350387574, | |
| "num_tokens": 5680838804.0, | |
| "step": 53650 | |
| }, | |
| { | |
| "entropy": 1.24828125, | |
| "epoch": 1.3148229763478771, | |
| "grad_norm": 3.25, | |
| "learning_rate": 1.7280891223743347e-06, | |
| "loss": 0.1663, | |
| "mean_token_accuracy": 0.9647430288791656, | |
| "num_tokens": 5686118856.0, | |
| "step": 53700 | |
| }, | |
| { | |
| "entropy": 1.25203125, | |
| "epoch": 1.3160472063072328, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.7225917925050384e-06, | |
| "loss": 0.1808, | |
| "mean_token_accuracy": 0.9617255198955535, | |
| "num_tokens": 5691606584.0, | |
| "step": 53750 | |
| }, | |
| { | |
| "entropy": 1.23875, | |
| "epoch": 1.3172714362665883, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 1.7170996969170434e-06, | |
| "loss": 0.1643, | |
| "mean_token_accuracy": 0.9644413828849793, | |
| "num_tokens": 5697025528.0, | |
| "step": 53800 | |
| }, | |
| { | |
| "entropy": 1.23609375, | |
| "epoch": 1.318495666225944, | |
| "grad_norm": 3.0, | |
| "learning_rate": 1.7116128581146443e-06, | |
| "loss": 0.1579, | |
| "mean_token_accuracy": 0.9660075342655182, | |
| "num_tokens": 5702129646.0, | |
| "step": 53850 | |
| }, | |
| { | |
| "entropy": 1.239375, | |
| "epoch": 1.3197198961852994, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.7061312985805986e-06, | |
| "loss": 0.1659, | |
| "mean_token_accuracy": 0.9642334473133087, | |
| "num_tokens": 5707290385.0, | |
| "step": 53900 | |
| }, | |
| { | |
| "entropy": 1.23515625, | |
| "epoch": 1.320944126144655, | |
| "grad_norm": 3.421875, | |
| "learning_rate": 1.7006550407760285e-06, | |
| "loss": 0.1636, | |
| "mean_token_accuracy": 0.9647632312774658, | |
| "num_tokens": 5712555849.0, | |
| "step": 53950 | |
| }, | |
| { | |
| "entropy": 1.2396875, | |
| "epoch": 1.3221683561040105, | |
| "grad_norm": 3.71875, | |
| "learning_rate": 1.695184107140337e-06, | |
| "loss": 0.1682, | |
| "mean_token_accuracy": 0.9639084780216217, | |
| "num_tokens": 5717928890.0, | |
| "step": 54000 | |
| }, | |
| { | |
| "entropy": 1.2246875, | |
| "epoch": 1.3233925860633662, | |
| "grad_norm": 3.921875, | |
| "learning_rate": 1.6897185200911068e-06, | |
| "loss": 0.1468, | |
| "mean_token_accuracy": 0.9690938425064087, | |
| "num_tokens": 5722987021.0, | |
| "step": 54050 | |
| }, | |
| { | |
| "entropy": 1.2565625, | |
| "epoch": 1.3246168160227216, | |
| "grad_norm": 2.875, | |
| "learning_rate": 1.6842583020240137e-06, | |
| "loss": 0.166, | |
| "mean_token_accuracy": 0.9647270548343658, | |
| "num_tokens": 5728523665.0, | |
| "step": 54100 | |
| }, | |
| { | |
| "entropy": 1.2253125, | |
| "epoch": 1.3258410459820773, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 1.6788034753127332e-06, | |
| "loss": 0.1509, | |
| "mean_token_accuracy": 0.9676713216304779, | |
| "num_tokens": 5733724051.0, | |
| "step": 54150 | |
| }, | |
| { | |
| "entropy": 1.2478125, | |
| "epoch": 1.327065275941433, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.6733540623088485e-06, | |
| "loss": 0.1703, | |
| "mean_token_accuracy": 0.9635128057003022, | |
| "num_tokens": 5739544907.0, | |
| "step": 54200 | |
| }, | |
| { | |
| "entropy": 1.245, | |
| "epoch": 1.3282895059007884, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.6679100853417647e-06, | |
| "loss": 0.1592, | |
| "mean_token_accuracy": 0.9656123912334442, | |
| "num_tokens": 5744896935.0, | |
| "step": 54250 | |
| }, | |
| { | |
| "entropy": 1.25453125, | |
| "epoch": 1.3295137358601439, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 1.6624715667186047e-06, | |
| "loss": 0.1756, | |
| "mean_token_accuracy": 0.962364639043808, | |
| "num_tokens": 5750164763.0, | |
| "step": 54300 | |
| }, | |
| { | |
| "entropy": 1.23609375, | |
| "epoch": 1.3307379658194995, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 1.6570385287241335e-06, | |
| "loss": 0.1577, | |
| "mean_token_accuracy": 0.9660208249092102, | |
| "num_tokens": 5755265140.0, | |
| "step": 54350 | |
| }, | |
| { | |
| "entropy": 1.25390625, | |
| "epoch": 1.3319621957788552, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 1.6516109936206498e-06, | |
| "loss": 0.1756, | |
| "mean_token_accuracy": 0.9626241695880889, | |
| "num_tokens": 5760623089.0, | |
| "step": 54400 | |
| }, | |
| { | |
| "entropy": 1.246875, | |
| "epoch": 1.3331864257382107, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.646188983647912e-06, | |
| "loss": 0.1734, | |
| "mean_token_accuracy": 0.9631841456890107, | |
| "num_tokens": 5766177496.0, | |
| "step": 54450 | |
| }, | |
| { | |
| "entropy": 1.26140625, | |
| "epoch": 1.3344106556975661, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 1.6407725210230344e-06, | |
| "loss": 0.1766, | |
| "mean_token_accuracy": 0.9622941052913666, | |
| "num_tokens": 5771692920.0, | |
| "step": 54500 | |
| }, | |
| { | |
| "entropy": 1.2415625, | |
| "epoch": 1.3356348856569218, | |
| "grad_norm": 4.0, | |
| "learning_rate": 1.6353616279404013e-06, | |
| "loss": 0.1569, | |
| "mean_token_accuracy": 0.9662493073940277, | |
| "num_tokens": 5777098724.0, | |
| "step": 54550 | |
| }, | |
| { | |
| "entropy": 1.23234375, | |
| "epoch": 1.3368591156162775, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.6299563265715747e-06, | |
| "loss": 0.148, | |
| "mean_token_accuracy": 0.9682403624057769, | |
| "num_tokens": 5782119917.0, | |
| "step": 54600 | |
| }, | |
| { | |
| "entropy": 1.25578125, | |
| "epoch": 1.338083345575633, | |
| "grad_norm": 3.46875, | |
| "learning_rate": 1.624556639065207e-06, | |
| "loss": 0.1594, | |
| "mean_token_accuracy": 0.9662695753574372, | |
| "num_tokens": 5787291101.0, | |
| "step": 54650 | |
| }, | |
| { | |
| "entropy": 1.24171875, | |
| "epoch": 1.3393075755349884, | |
| "grad_norm": 3.609375, | |
| "learning_rate": 1.6191625875469446e-06, | |
| "loss": 0.157, | |
| "mean_token_accuracy": 0.9663849449157715, | |
| "num_tokens": 5792520283.0, | |
| "step": 54700 | |
| }, | |
| { | |
| "entropy": 1.25046875, | |
| "epoch": 1.340531805494344, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 1.6137741941193398e-06, | |
| "loss": 0.1495, | |
| "mean_token_accuracy": 0.9671278047561646, | |
| "num_tokens": 5797431576.0, | |
| "step": 54750 | |
| }, | |
| { | |
| "entropy": 1.26546875, | |
| "epoch": 1.3417560354536997, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 1.6083914808617645e-06, | |
| "loss": 0.1765, | |
| "mean_token_accuracy": 0.9622493016719819, | |
| "num_tokens": 5803286714.0, | |
| "step": 54800 | |
| }, | |
| { | |
| "entropy": 1.224375, | |
| "epoch": 1.3429802654130552, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 1.6030144698303079e-06, | |
| "loss": 0.1544, | |
| "mean_token_accuracy": 0.9669049537181854, | |
| "num_tokens": 5807862828.0, | |
| "step": 54850 | |
| }, | |
| { | |
| "entropy": 1.26, | |
| "epoch": 1.3442044953724108, | |
| "grad_norm": 3.0, | |
| "learning_rate": 1.5976431830577022e-06, | |
| "loss": 0.1636, | |
| "mean_token_accuracy": 0.964913833141327, | |
| "num_tokens": 5813034358.0, | |
| "step": 54900 | |
| }, | |
| { | |
| "entropy": 1.25109375, | |
| "epoch": 1.3454287253317663, | |
| "grad_norm": 3.515625, | |
| "learning_rate": 1.5922776425532186e-06, | |
| "loss": 0.1659, | |
| "mean_token_accuracy": 0.9639725112915039, | |
| "num_tokens": 5818413943.0, | |
| "step": 54950 | |
| }, | |
| { | |
| "entropy": 1.2321875, | |
| "epoch": 1.346652955291122, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 1.5869178703025869e-06, | |
| "loss": 0.1489, | |
| "mean_token_accuracy": 0.9674529373645783, | |
| "num_tokens": 5823085402.0, | |
| "step": 55000 | |
| }, | |
| { | |
| "entropy": 1.23859375, | |
| "epoch": 1.3478771852504774, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.5815638882678944e-06, | |
| "loss": 0.1608, | |
| "mean_token_accuracy": 0.9654952967166901, | |
| "num_tokens": 5828359072.0, | |
| "step": 55050 | |
| }, | |
| { | |
| "entropy": 1.2465625, | |
| "epoch": 1.349101415209833, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 1.5762157183875092e-06, | |
| "loss": 0.1618, | |
| "mean_token_accuracy": 0.965077908039093, | |
| "num_tokens": 5833897215.0, | |
| "step": 55100 | |
| }, | |
| { | |
| "entropy": 1.24125, | |
| "epoch": 1.3503256451691885, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.5708733825759804e-06, | |
| "loss": 0.1597, | |
| "mean_token_accuracy": 0.9658141255378723, | |
| "num_tokens": 5839005187.0, | |
| "step": 55150 | |
| }, | |
| { | |
| "entropy": 1.2690625, | |
| "epoch": 1.3515498751285442, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 1.5655369027239507e-06, | |
| "loss": 0.1728, | |
| "mean_token_accuracy": 0.9630602359771728, | |
| "num_tokens": 5844499544.0, | |
| "step": 55200 | |
| }, | |
| { | |
| "entropy": 1.2484375, | |
| "epoch": 1.3527741050878996, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 1.5602063006980713e-06, | |
| "loss": 0.1606, | |
| "mean_token_accuracy": 0.9662463283538818, | |
| "num_tokens": 5849831304.0, | |
| "step": 55250 | |
| }, | |
| { | |
| "entropy": 1.243125, | |
| "epoch": 1.3539983350472553, | |
| "grad_norm": 4.125, | |
| "learning_rate": 1.5548815983409054e-06, | |
| "loss": 0.1584, | |
| "mean_token_accuracy": 0.9648811197280884, | |
| "num_tokens": 5854831384.0, | |
| "step": 55300 | |
| }, | |
| { | |
| "entropy": 1.2475, | |
| "epoch": 1.3552225650066108, | |
| "grad_norm": 0.0169677734375, | |
| "learning_rate": 1.5495628174708422e-06, | |
| "loss": 0.1583, | |
| "mean_token_accuracy": 0.9666490364074707, | |
| "num_tokens": 5860380821.0, | |
| "step": 55350 | |
| }, | |
| { | |
| "entropy": 1.2428125, | |
| "epoch": 1.3564467949659664, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 1.5442499798820062e-06, | |
| "loss": 0.1636, | |
| "mean_token_accuracy": 0.9649770343303681, | |
| "num_tokens": 5865590076.0, | |
| "step": 55400 | |
| }, | |
| { | |
| "entropy": 1.25265625, | |
| "epoch": 1.357671024925322, | |
| "grad_norm": 5.0625, | |
| "learning_rate": 1.5389431073441742e-06, | |
| "loss": 0.1625, | |
| "mean_token_accuracy": 0.9651528835296631, | |
| "num_tokens": 5870893580.0, | |
| "step": 55450 | |
| }, | |
| { | |
| "entropy": 1.2590625, | |
| "epoch": 1.3588952548846776, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 1.5336422216026717e-06, | |
| "loss": 0.1708, | |
| "mean_token_accuracy": 0.9625674414634705, | |
| "num_tokens": 5876137820.0, | |
| "step": 55500 | |
| }, | |
| { | |
| "entropy": 1.24515625, | |
| "epoch": 1.360119484844033, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 1.5283473443783021e-06, | |
| "loss": 0.1575, | |
| "mean_token_accuracy": 0.9658649146556855, | |
| "num_tokens": 5881136105.0, | |
| "step": 55550 | |
| }, | |
| { | |
| "entropy": 1.2434375, | |
| "epoch": 1.3613437148033887, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 1.5230584973672404e-06, | |
| "loss": 0.1716, | |
| "mean_token_accuracy": 0.9642657494544983, | |
| "num_tokens": 5886333380.0, | |
| "step": 55600 | |
| }, | |
| { | |
| "entropy": 1.2628125, | |
| "epoch": 1.3625679447627443, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 1.5177757022409606e-06, | |
| "loss": 0.1788, | |
| "mean_token_accuracy": 0.9618762648105621, | |
| "num_tokens": 5892147042.0, | |
| "step": 55650 | |
| }, | |
| { | |
| "entropy": 1.25875, | |
| "epoch": 1.3637921747220998, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 1.5124989806461293e-06, | |
| "loss": 0.1678, | |
| "mean_token_accuracy": 0.9644319689273835, | |
| "num_tokens": 5897583102.0, | |
| "step": 55700 | |
| }, | |
| { | |
| "entropy": 1.2546875, | |
| "epoch": 1.3650164046814552, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 1.5072283542045348e-06, | |
| "loss": 0.1558, | |
| "mean_token_accuracy": 0.9658961379528046, | |
| "num_tokens": 5902701860.0, | |
| "step": 55750 | |
| }, | |
| { | |
| "entropy": 1.2584375, | |
| "epoch": 1.366240634640811, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 1.5019638445129849e-06, | |
| "loss": 0.1656, | |
| "mean_token_accuracy": 0.9642118716239929, | |
| "num_tokens": 5908066266.0, | |
| "step": 55800 | |
| }, | |
| { | |
| "entropy": 1.24375, | |
| "epoch": 1.3674648646001666, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.496705473143224e-06, | |
| "loss": 0.1467, | |
| "mean_token_accuracy": 0.9683407878875733, | |
| "num_tokens": 5913106858.0, | |
| "step": 55850 | |
| }, | |
| { | |
| "entropy": 1.24109375, | |
| "epoch": 1.368689094559522, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 1.4914532616418477e-06, | |
| "loss": 0.1619, | |
| "mean_token_accuracy": 0.9651940071582794, | |
| "num_tokens": 5918299911.0, | |
| "step": 55900 | |
| }, | |
| { | |
| "entropy": 1.24296875, | |
| "epoch": 1.3699133245188777, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 1.486207231530207e-06, | |
| "loss": 0.1533, | |
| "mean_token_accuracy": 0.966886637210846, | |
| "num_tokens": 5923373367.0, | |
| "step": 55950 | |
| }, | |
| { | |
| "entropy": 1.25984375, | |
| "epoch": 1.3711375544782332, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 1.4809674043043262e-06, | |
| "loss": 0.1714, | |
| "mean_token_accuracy": 0.9631552195549011, | |
| "num_tokens": 5928830248.0, | |
| "step": 56000 | |
| }, | |
| { | |
| "entropy": 1.24640625, | |
| "epoch": 1.3723617844375888, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 1.4757338014348108e-06, | |
| "loss": 0.17, | |
| "mean_token_accuracy": 0.9638724672794342, | |
| "num_tokens": 5934360325.0, | |
| "step": 56050 | |
| }, | |
| { | |
| "entropy": 1.2428125, | |
| "epoch": 1.3735860143969443, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.4705064443667672e-06, | |
| "loss": 0.1672, | |
| "mean_token_accuracy": 0.9640205073356628, | |
| "num_tokens": 5939749032.0, | |
| "step": 56100 | |
| }, | |
| { | |
| "entropy": 1.2396875, | |
| "epoch": 1.3748102443563, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 1.4652853545196994e-06, | |
| "loss": 0.1698, | |
| "mean_token_accuracy": 0.9635356509685516, | |
| "num_tokens": 5944946908.0, | |
| "step": 56150 | |
| }, | |
| { | |
| "entropy": 1.2471875, | |
| "epoch": 1.3760344743156554, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.4600705532874409e-06, | |
| "loss": 0.1612, | |
| "mean_token_accuracy": 0.9657069194316864, | |
| "num_tokens": 5950153678.0, | |
| "step": 56200 | |
| }, | |
| { | |
| "entropy": 1.2515625, | |
| "epoch": 1.377258704275011, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.45486206203805e-06, | |
| "loss": 0.1694, | |
| "mean_token_accuracy": 0.9643088591098785, | |
| "num_tokens": 5955488321.0, | |
| "step": 56250 | |
| }, | |
| { | |
| "entropy": 1.24984375, | |
| "epoch": 1.3784829342343665, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.4496599021137346e-06, | |
| "loss": 0.1802, | |
| "mean_token_accuracy": 0.9621450281143189, | |
| "num_tokens": 5961263793.0, | |
| "step": 56300 | |
| }, | |
| { | |
| "entropy": 1.22625, | |
| "epoch": 1.3797071641937222, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.4444640948307554e-06, | |
| "loss": 0.1567, | |
| "mean_token_accuracy": 0.9664753973484039, | |
| "num_tokens": 5966590895.0, | |
| "step": 56350 | |
| }, | |
| { | |
| "entropy": 1.2453125, | |
| "epoch": 1.3809313941530776, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.4392746614793446e-06, | |
| "loss": 0.162, | |
| "mean_token_accuracy": 0.9654717576503754, | |
| "num_tokens": 5972160004.0, | |
| "step": 56400 | |
| }, | |
| { | |
| "entropy": 1.24125, | |
| "epoch": 1.3821556241124333, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 1.4340916233236167e-06, | |
| "loss": 0.1685, | |
| "mean_token_accuracy": 0.9643662881851196, | |
| "num_tokens": 5977855909.0, | |
| "step": 56450 | |
| }, | |
| { | |
| "entropy": 1.2490625, | |
| "epoch": 1.383379854071789, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 1.4289150016014792e-06, | |
| "loss": 0.1663, | |
| "mean_token_accuracy": 0.9650551450252532, | |
| "num_tokens": 5983284719.0, | |
| "step": 56500 | |
| }, | |
| { | |
| "entropy": 1.245, | |
| "epoch": 1.3846040840311444, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.4237448175245523e-06, | |
| "loss": 0.1565, | |
| "mean_token_accuracy": 0.9658044958114624, | |
| "num_tokens": 5988559128.0, | |
| "step": 56550 | |
| }, | |
| { | |
| "entropy": 1.24140625, | |
| "epoch": 1.3858283139904999, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.4185810922780736e-06, | |
| "loss": 0.1665, | |
| "mean_token_accuracy": 0.9643181717395782, | |
| "num_tokens": 5993939256.0, | |
| "step": 56600 | |
| }, | |
| { | |
| "entropy": 1.245, | |
| "epoch": 1.3870525439498556, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 1.413423847020816e-06, | |
| "loss": 0.1721, | |
| "mean_token_accuracy": 0.963967101573944, | |
| "num_tokens": 5999401709.0, | |
| "step": 56650 | |
| }, | |
| { | |
| "entropy": 1.23953125, | |
| "epoch": 1.3882767739092112, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 1.4082731028849995e-06, | |
| "loss": 0.1636, | |
| "mean_token_accuracy": 0.9649562358856201, | |
| "num_tokens": 6004763257.0, | |
| "step": 56700 | |
| }, | |
| { | |
| "entropy": 1.263125, | |
| "epoch": 1.3895010038685667, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 1.4031288809762096e-06, | |
| "loss": 0.1734, | |
| "mean_token_accuracy": 0.9629300630092621, | |
| "num_tokens": 6010451639.0, | |
| "step": 56750 | |
| }, | |
| { | |
| "entropy": 1.23171875, | |
| "epoch": 1.3907252338279221, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 1.397991202373298e-06, | |
| "loss": 0.16, | |
| "mean_token_accuracy": 0.9664403641223908, | |
| "num_tokens": 6015769794.0, | |
| "step": 56800 | |
| }, | |
| { | |
| "entropy": 1.24171875, | |
| "epoch": 1.3919494637872778, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.3928600881283135e-06, | |
| "loss": 0.1741, | |
| "mean_token_accuracy": 0.9627274203300477, | |
| "num_tokens": 6020957098.0, | |
| "step": 56850 | |
| }, | |
| { | |
| "entropy": 1.2315625, | |
| "epoch": 1.3931736937466335, | |
| "grad_norm": 0.00994873046875, | |
| "learning_rate": 1.3877355592664005e-06, | |
| "loss": 0.1509, | |
| "mean_token_accuracy": 0.9681152474880218, | |
| "num_tokens": 6026298682.0, | |
| "step": 56900 | |
| }, | |
| { | |
| "entropy": 1.24703125, | |
| "epoch": 1.394397923705989, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.3826176367857244e-06, | |
| "loss": 0.1599, | |
| "mean_token_accuracy": 0.9659165751934051, | |
| "num_tokens": 6031577635.0, | |
| "step": 56950 | |
| }, | |
| { | |
| "entropy": 1.23828125, | |
| "epoch": 1.3956221536653444, | |
| "grad_norm": 3.734375, | |
| "learning_rate": 1.3775063416573772e-06, | |
| "loss": 0.1602, | |
| "mean_token_accuracy": 0.9653304886817932, | |
| "num_tokens": 6036759854.0, | |
| "step": 57000 | |
| }, | |
| { | |
| "entropy": 1.23265625, | |
| "epoch": 1.3968463836247, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 1.3724016948252932e-06, | |
| "loss": 0.1561, | |
| "mean_token_accuracy": 0.9671315121650695, | |
| "num_tokens": 6042005844.0, | |
| "step": 57050 | |
| }, | |
| { | |
| "entropy": 1.245, | |
| "epoch": 1.3980706135840557, | |
| "grad_norm": 4.125, | |
| "learning_rate": 1.3673037172061715e-06, | |
| "loss": 0.1645, | |
| "mean_token_accuracy": 0.9652763676643371, | |
| "num_tokens": 6047109956.0, | |
| "step": 57100 | |
| }, | |
| { | |
| "entropy": 1.23859375, | |
| "epoch": 1.3992948435434112, | |
| "grad_norm": 3.53125, | |
| "learning_rate": 1.362212429689374e-06, | |
| "loss": 0.1638, | |
| "mean_token_accuracy": 0.9652803325653077, | |
| "num_tokens": 6052155256.0, | |
| "step": 57150 | |
| }, | |
| { | |
| "entropy": 1.270625, | |
| "epoch": 1.4005190735027668, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.3571278531368583e-06, | |
| "loss": 0.1746, | |
| "mean_token_accuracy": 0.9618336653709412, | |
| "num_tokens": 6057754576.0, | |
| "step": 57200 | |
| }, | |
| { | |
| "entropy": 1.25484375, | |
| "epoch": 1.4017433034621223, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.3520500083830786e-06, | |
| "loss": 0.1611, | |
| "mean_token_accuracy": 0.9656724345684051, | |
| "num_tokens": 6063117197.0, | |
| "step": 57250 | |
| }, | |
| { | |
| "entropy": 1.25125, | |
| "epoch": 1.402967533421478, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 1.346978916234905e-06, | |
| "loss": 0.1737, | |
| "mean_token_accuracy": 0.9628279542922974, | |
| "num_tokens": 6068604024.0, | |
| "step": 57300 | |
| }, | |
| { | |
| "entropy": 1.25015625, | |
| "epoch": 1.4041917633808334, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 1.3419145974715394e-06, | |
| "loss": 0.1561, | |
| "mean_token_accuracy": 0.9659430325031281, | |
| "num_tokens": 6073902078.0, | |
| "step": 57350 | |
| }, | |
| { | |
| "entropy": 1.26703125, | |
| "epoch": 1.405415993340189, | |
| "grad_norm": 3.859375, | |
| "learning_rate": 1.3368570728444298e-06, | |
| "loss": 0.1718, | |
| "mean_token_accuracy": 0.9625124716758728, | |
| "num_tokens": 6079405655.0, | |
| "step": 57400 | |
| }, | |
| { | |
| "entropy": 1.2446875, | |
| "epoch": 1.4066402232995445, | |
| "grad_norm": 3.828125, | |
| "learning_rate": 1.331806363077184e-06, | |
| "loss": 0.1662, | |
| "mean_token_accuracy": 0.9648419404029847, | |
| "num_tokens": 6084626144.0, | |
| "step": 57450 | |
| }, | |
| { | |
| "entropy": 1.23234375, | |
| "epoch": 1.4078644532589002, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 1.3267624888654835e-06, | |
| "loss": 0.1479, | |
| "mean_token_accuracy": 0.9676874935626983, | |
| "num_tokens": 6089664069.0, | |
| "step": 57500 | |
| }, | |
| { | |
| "entropy": 1.255625, | |
| "epoch": 1.4090886832182556, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 1.3217254708770053e-06, | |
| "loss": 0.1648, | |
| "mean_token_accuracy": 0.964464715719223, | |
| "num_tokens": 6095025878.0, | |
| "step": 57550 | |
| }, | |
| { | |
| "entropy": 1.25140625, | |
| "epoch": 1.4103129131776113, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 1.3166953297513275e-06, | |
| "loss": 0.1638, | |
| "mean_token_accuracy": 0.9649744808673859, | |
| "num_tokens": 6100414900.0, | |
| "step": 57600 | |
| }, | |
| { | |
| "entropy": 1.24765625, | |
| "epoch": 1.4115371431369668, | |
| "grad_norm": 0.0166015625, | |
| "learning_rate": 1.311672086099852e-06, | |
| "loss": 0.1621, | |
| "mean_token_accuracy": 0.9656559634208679, | |
| "num_tokens": 6105532948.0, | |
| "step": 57650 | |
| }, | |
| { | |
| "entropy": 1.2375, | |
| "epoch": 1.4127613730963224, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 1.3066557605057167e-06, | |
| "loss": 0.1633, | |
| "mean_token_accuracy": 0.9653026688098908, | |
| "num_tokens": 6110851956.0, | |
| "step": 57700 | |
| }, | |
| { | |
| "entropy": 1.26578125, | |
| "epoch": 1.413985603055678, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.3016463735237164e-06, | |
| "loss": 0.1721, | |
| "mean_token_accuracy": 0.9625765991210937, | |
| "num_tokens": 6116317682.0, | |
| "step": 57750 | |
| }, | |
| { | |
| "entropy": 1.2565625, | |
| "epoch": 1.4152098330150336, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.2966439456802059e-06, | |
| "loss": 0.1742, | |
| "mean_token_accuracy": 0.9632444334030151, | |
| "num_tokens": 6122164130.0, | |
| "step": 57800 | |
| }, | |
| { | |
| "entropy": 1.24140625, | |
| "epoch": 1.416434062974389, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.2916484974730335e-06, | |
| "loss": 0.1672, | |
| "mean_token_accuracy": 0.9641308975219727, | |
| "num_tokens": 6127574306.0, | |
| "step": 57850 | |
| }, | |
| { | |
| "entropy": 1.246875, | |
| "epoch": 1.4176582929337447, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 1.2866600493714425e-06, | |
| "loss": 0.1725, | |
| "mean_token_accuracy": 0.9628300058841706, | |
| "num_tokens": 6133295960.0, | |
| "step": 57900 | |
| }, | |
| { | |
| "entropy": 1.25625, | |
| "epoch": 1.4188825228931004, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 1.281678621815994e-06, | |
| "loss": 0.1727, | |
| "mean_token_accuracy": 0.9640992879867554, | |
| "num_tokens": 6138729294.0, | |
| "step": 57950 | |
| }, | |
| { | |
| "entropy": 1.21765625, | |
| "epoch": 1.4201067528524558, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 1.276704235218481e-06, | |
| "loss": 0.1483, | |
| "mean_token_accuracy": 0.9675537276268006, | |
| "num_tokens": 6143658701.0, | |
| "step": 58000 | |
| }, | |
| { | |
| "entropy": 1.24703125, | |
| "epoch": 1.4213309828118112, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 1.2717369099618487e-06, | |
| "loss": 0.168, | |
| "mean_token_accuracy": 0.9638211143016815, | |
| "num_tokens": 6148836685.0, | |
| "step": 58050 | |
| }, | |
| { | |
| "entropy": 1.2190625, | |
| "epoch": 1.422555212771167, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 1.2667766664001044e-06, | |
| "loss": 0.1527, | |
| "mean_token_accuracy": 0.9670968425273895, | |
| "num_tokens": 6153703845.0, | |
| "step": 58100 | |
| }, | |
| { | |
| "entropy": 1.23734375, | |
| "epoch": 1.4237794427305226, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.2618235248582383e-06, | |
| "loss": 0.1583, | |
| "mean_token_accuracy": 0.9668286955356598, | |
| "num_tokens": 6158817391.0, | |
| "step": 58150 | |
| }, | |
| { | |
| "entropy": 1.23171875, | |
| "epoch": 1.425003672689878, | |
| "grad_norm": 3.28125, | |
| "learning_rate": 1.2568775056321422e-06, | |
| "loss": 0.1593, | |
| "mean_token_accuracy": 0.9661485147476196, | |
| "num_tokens": 6163833832.0, | |
| "step": 58200 | |
| }, | |
| { | |
| "entropy": 1.24328125, | |
| "epoch": 1.4262279026492337, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.25193862898852e-06, | |
| "loss": 0.1737, | |
| "mean_token_accuracy": 0.9620695877075195, | |
| "num_tokens": 6169273441.0, | |
| "step": 58250 | |
| }, | |
| { | |
| "entropy": 1.2259375, | |
| "epoch": 1.4274521326085892, | |
| "grad_norm": 4.15625, | |
| "learning_rate": 1.2470069151648105e-06, | |
| "loss": 0.1605, | |
| "mean_token_accuracy": 0.964862027168274, | |
| "num_tokens": 6174358443.0, | |
| "step": 58300 | |
| }, | |
| { | |
| "entropy": 1.23609375, | |
| "epoch": 1.4286763625679448, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 1.2420823843691005e-06, | |
| "loss": 0.1665, | |
| "mean_token_accuracy": 0.9651170766353607, | |
| "num_tokens": 6179906475.0, | |
| "step": 58350 | |
| }, | |
| { | |
| "entropy": 1.2340625, | |
| "epoch": 1.4299005925273003, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 1.2371650567800477e-06, | |
| "loss": 0.1489, | |
| "mean_token_accuracy": 0.967512333393097, | |
| "num_tokens": 6184768923.0, | |
| "step": 58400 | |
| }, | |
| { | |
| "entropy": 1.250625, | |
| "epoch": 1.431124822486656, | |
| "grad_norm": 3.484375, | |
| "learning_rate": 1.2322549525467878e-06, | |
| "loss": 0.1697, | |
| "mean_token_accuracy": 0.9635206353664398, | |
| "num_tokens": 6190151181.0, | |
| "step": 58450 | |
| }, | |
| { | |
| "entropy": 1.23453125, | |
| "epoch": 1.4323490524460114, | |
| "grad_norm": 3.65625, | |
| "learning_rate": 1.2273520917888645e-06, | |
| "loss": 0.1624, | |
| "mean_token_accuracy": 0.9650914788246154, | |
| "num_tokens": 6195374468.0, | |
| "step": 58500 | |
| }, | |
| { | |
| "entropy": 1.24296875, | |
| "epoch": 1.433573282405367, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.2224564945961372e-06, | |
| "loss": 0.1738, | |
| "mean_token_accuracy": 0.9630816507339478, | |
| "num_tokens": 6200703908.0, | |
| "step": 58550 | |
| }, | |
| { | |
| "entropy": 1.21984375, | |
| "epoch": 1.4347975123647225, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 1.2175681810287018e-06, | |
| "loss": 0.142, | |
| "mean_token_accuracy": 0.96914306640625, | |
| "num_tokens": 6205730956.0, | |
| "step": 58600 | |
| }, | |
| { | |
| "entropy": 1.24125, | |
| "epoch": 1.4360217423240782, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 1.2126871711168126e-06, | |
| "loss": 0.1744, | |
| "mean_token_accuracy": 0.9625077545642853, | |
| "num_tokens": 6211224150.0, | |
| "step": 58650 | |
| }, | |
| { | |
| "entropy": 1.23828125, | |
| "epoch": 1.4372459722834336, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 1.2078134848607935e-06, | |
| "loss": 0.1578, | |
| "mean_token_accuracy": 0.9665833008289337, | |
| "num_tokens": 6216480413.0, | |
| "step": 58700 | |
| }, | |
| { | |
| "entropy": 1.22734375, | |
| "epoch": 1.4384702022427893, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 1.2029471422309593e-06, | |
| "loss": 0.1592, | |
| "mean_token_accuracy": 0.9655974650382996, | |
| "num_tokens": 6221594113.0, | |
| "step": 58750 | |
| }, | |
| { | |
| "entropy": 1.2396875, | |
| "epoch": 1.4396944322021448, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 1.1980881631675338e-06, | |
| "loss": 0.1642, | |
| "mean_token_accuracy": 0.9646211445331574, | |
| "num_tokens": 6226912535.0, | |
| "step": 58800 | |
| }, | |
| { | |
| "entropy": 1.2421875, | |
| "epoch": 1.4409186621615004, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 1.1932365675805704e-06, | |
| "loss": 0.1704, | |
| "mean_token_accuracy": 0.9632949602603912, | |
| "num_tokens": 6232510565.0, | |
| "step": 58850 | |
| }, | |
| { | |
| "entropy": 1.2271875, | |
| "epoch": 1.442142892120856, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 1.1883923753498652e-06, | |
| "loss": 0.1629, | |
| "mean_token_accuracy": 0.9651079893112182, | |
| "num_tokens": 6237750599.0, | |
| "step": 58900 | |
| }, | |
| { | |
| "entropy": 1.235, | |
| "epoch": 1.4433671220802116, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 1.1835556063248796e-06, | |
| "loss": 0.157, | |
| "mean_token_accuracy": 0.9665428209304809, | |
| "num_tokens": 6243089430.0, | |
| "step": 58950 | |
| }, | |
| { | |
| "entropy": 1.22171875, | |
| "epoch": 1.4445913520395672, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.1787262803246568e-06, | |
| "loss": 0.159, | |
| "mean_token_accuracy": 0.9651802563667298, | |
| "num_tokens": 6248152093.0, | |
| "step": 59000 | |
| }, | |
| { | |
| "entropy": 1.2453125, | |
| "epoch": 1.4458155819989227, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.1739044171377455e-06, | |
| "loss": 0.1685, | |
| "mean_token_accuracy": 0.963554357290268, | |
| "num_tokens": 6253653648.0, | |
| "step": 59050 | |
| }, | |
| { | |
| "entropy": 1.24859375, | |
| "epoch": 1.4470398119582781, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.1690900365221082e-06, | |
| "loss": 0.1675, | |
| "mean_token_accuracy": 0.9636942827701569, | |
| "num_tokens": 6259395328.0, | |
| "step": 59100 | |
| }, | |
| { | |
| "entropy": 1.23515625, | |
| "epoch": 1.4482640419176338, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.164283158205053e-06, | |
| "loss": 0.163, | |
| "mean_token_accuracy": 0.9648255848884583, | |
| "num_tokens": 6264597318.0, | |
| "step": 59150 | |
| }, | |
| { | |
| "entropy": 1.22296875, | |
| "epoch": 1.4494882718769895, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 1.1594838018831444e-06, | |
| "loss": 0.1506, | |
| "mean_token_accuracy": 0.9675889956951141, | |
| "num_tokens": 6269482590.0, | |
| "step": 59200 | |
| }, | |
| { | |
| "entropy": 1.26875, | |
| "epoch": 1.450712501836345, | |
| "grad_norm": 3.71875, | |
| "learning_rate": 1.1546919872221238e-06, | |
| "loss": 0.1858, | |
| "mean_token_accuracy": 0.9605572533607483, | |
| "num_tokens": 6275753206.0, | |
| "step": 59250 | |
| }, | |
| { | |
| "entropy": 1.235625, | |
| "epoch": 1.4519367317957004, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 1.1499077338568329e-06, | |
| "loss": 0.1589, | |
| "mean_token_accuracy": 0.9655532228946686, | |
| "num_tokens": 6281061992.0, | |
| "step": 59300 | |
| }, | |
| { | |
| "entropy": 1.2371875, | |
| "epoch": 1.453160961755056, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.1451310613911282e-06, | |
| "loss": 0.1668, | |
| "mean_token_accuracy": 0.9643084633350373, | |
| "num_tokens": 6286356933.0, | |
| "step": 59350 | |
| }, | |
| { | |
| "entropy": 1.22546875, | |
| "epoch": 1.4543851917144117, | |
| "grad_norm": 0.0068359375, | |
| "learning_rate": 1.1403619893978035e-06, | |
| "loss": 0.1536, | |
| "mean_token_accuracy": 0.9669885611534119, | |
| "num_tokens": 6291298254.0, | |
| "step": 59400 | |
| }, | |
| { | |
| "entropy": 1.22484375, | |
| "epoch": 1.4556094216737672, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 1.1356005374185075e-06, | |
| "loss": 0.1541, | |
| "mean_token_accuracy": 0.9667747104167939, | |
| "num_tokens": 6296386141.0, | |
| "step": 59450 | |
| }, | |
| { | |
| "entropy": 1.233125, | |
| "epoch": 1.4568336516331228, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 1.1308467249636693e-06, | |
| "loss": 0.1546, | |
| "mean_token_accuracy": 0.9666030180454254, | |
| "num_tokens": 6301578433.0, | |
| "step": 59500 | |
| }, | |
| { | |
| "entropy": 1.228125, | |
| "epoch": 1.4580578815924783, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.1261005715124106e-06, | |
| "loss": 0.1653, | |
| "mean_token_accuracy": 0.9642830669879914, | |
| "num_tokens": 6306834089.0, | |
| "step": 59550 | |
| }, | |
| { | |
| "entropy": 1.24140625, | |
| "epoch": 1.459282111551834, | |
| "grad_norm": 3.28125, | |
| "learning_rate": 1.1213620965124711e-06, | |
| "loss": 0.1713, | |
| "mean_token_accuracy": 0.9641312193870545, | |
| "num_tokens": 6312270957.0, | |
| "step": 59600 | |
| }, | |
| { | |
| "entropy": 1.238125, | |
| "epoch": 1.4605063415111894, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.1166313193801264e-06, | |
| "loss": 0.1717, | |
| "mean_token_accuracy": 0.9619838237762451, | |
| "num_tokens": 6317571444.0, | |
| "step": 59650 | |
| }, | |
| { | |
| "entropy": 1.235, | |
| "epoch": 1.461730571470545, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 1.1119082595001127e-06, | |
| "loss": 0.1617, | |
| "mean_token_accuracy": 0.9648803687095642, | |
| "num_tokens": 6322810865.0, | |
| "step": 59700 | |
| }, | |
| { | |
| "entropy": 1.24390625, | |
| "epoch": 1.4629548014299005, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 1.1071929362255407e-06, | |
| "loss": 0.1768, | |
| "mean_token_accuracy": 0.9624212658405304, | |
| "num_tokens": 6328065527.0, | |
| "step": 59750 | |
| }, | |
| { | |
| "entropy": 1.2346875, | |
| "epoch": 1.4641790313892562, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 1.102485368877821e-06, | |
| "loss": 0.1547, | |
| "mean_token_accuracy": 0.96669025182724, | |
| "num_tokens": 6332934140.0, | |
| "step": 59800 | |
| }, | |
| { | |
| "entropy": 1.24796875, | |
| "epoch": 1.4654032613486117, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 1.0977855767465834e-06, | |
| "loss": 0.1683, | |
| "mean_token_accuracy": 0.9648297607898713, | |
| "num_tokens": 6338286149.0, | |
| "step": 59850 | |
| }, | |
| { | |
| "entropy": 1.23640625, | |
| "epoch": 1.4666274913079673, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 1.0930935790895982e-06, | |
| "loss": 0.1481, | |
| "mean_token_accuracy": 0.9682129454612732, | |
| "num_tokens": 6343347728.0, | |
| "step": 59900 | |
| }, | |
| { | |
| "entropy": 1.2359375, | |
| "epoch": 1.4678517212673228, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.0884093951326982e-06, | |
| "loss": 0.1662, | |
| "mean_token_accuracy": 0.9638714647293091, | |
| "num_tokens": 6348595585.0, | |
| "step": 59950 | |
| }, | |
| { | |
| "entropy": 1.23671875, | |
| "epoch": 1.4690759512266784, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.083733044069698e-06, | |
| "loss": 0.1533, | |
| "mean_token_accuracy": 0.9660887753963471, | |
| "num_tokens": 6353539392.0, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 1.4690759512266784, | |
| "eval_entropy": 1.2380208333333333, | |
| "eval_loss": 0.17763087153434753, | |
| "eval_mean_token_accuracy": 0.9620065187414487, | |
| "eval_num_tokens": 6353539392.0, | |
| "eval_runtime": 603.0528, | |
| "eval_samples_per_second": 16.012, | |
| "eval_steps_per_second": 0.201, | |
| "step": 60000 | |
| }, | |
| { | |
| "entropy": 1.229375, | |
| "epoch": 1.4703001811860341, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 1.0790645450623166e-06, | |
| "loss": 0.1552, | |
| "mean_token_accuracy": 0.9666960227489472, | |
| "num_tokens": 6358769999.0, | |
| "step": 60050 | |
| }, | |
| { | |
| "entropy": 1.23546875, | |
| "epoch": 1.4715244111453896, | |
| "grad_norm": 3.0, | |
| "learning_rate": 1.0744039172400965e-06, | |
| "loss": 0.1538, | |
| "mean_token_accuracy": 0.9672531485557556, | |
| "num_tokens": 6363778830.0, | |
| "step": 60100 | |
| }, | |
| { | |
| "entropy": 1.21921875, | |
| "epoch": 1.472748641104745, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.0697511797003325e-06, | |
| "loss": 0.1562, | |
| "mean_token_accuracy": 0.9664645326137543, | |
| "num_tokens": 6368813861.0, | |
| "step": 60150 | |
| }, | |
| { | |
| "entropy": 1.2353125, | |
| "epoch": 1.4739728710641007, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.0651063515079833e-06, | |
| "loss": 0.1537, | |
| "mean_token_accuracy": 0.9665102756023407, | |
| "num_tokens": 6374106711.0, | |
| "step": 60200 | |
| }, | |
| { | |
| "entropy": 1.22546875, | |
| "epoch": 1.4751971010234564, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.0604694516956e-06, | |
| "loss": 0.151, | |
| "mean_token_accuracy": 0.9675907123088837, | |
| "num_tokens": 6379244247.0, | |
| "step": 60250 | |
| }, | |
| { | |
| "entropy": 1.22953125, | |
| "epoch": 1.4764213309828118, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 1.055840499263247e-06, | |
| "loss": 0.1624, | |
| "mean_token_accuracy": 0.964186635017395, | |
| "num_tokens": 6384481392.0, | |
| "step": 60300 | |
| }, | |
| { | |
| "entropy": 1.23578125, | |
| "epoch": 1.4776455609421673, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 1.0512195131784247e-06, | |
| "loss": 0.1575, | |
| "mean_token_accuracy": 0.965451090335846, | |
| "num_tokens": 6389460183.0, | |
| "step": 60350 | |
| }, | |
| { | |
| "entropy": 1.2571875, | |
| "epoch": 1.478869790901523, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.0466065123759882e-06, | |
| "loss": 0.1706, | |
| "mean_token_accuracy": 0.9634547913074494, | |
| "num_tokens": 6395040346.0, | |
| "step": 60400 | |
| }, | |
| { | |
| "entropy": 1.2428125, | |
| "epoch": 1.4800940208608786, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 1.0420015157580736e-06, | |
| "loss": 0.1614, | |
| "mean_token_accuracy": 0.9662553632259369, | |
| "num_tokens": 6400379406.0, | |
| "step": 60450 | |
| }, | |
| { | |
| "entropy": 1.2384375, | |
| "epoch": 1.481318250820234, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.0374045421940215e-06, | |
| "loss": 0.1574, | |
| "mean_token_accuracy": 0.9662669360637665, | |
| "num_tokens": 6405924043.0, | |
| "step": 60500 | |
| }, | |
| { | |
| "entropy": 1.24515625, | |
| "epoch": 1.4825424807795895, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 1.0328156105202916e-06, | |
| "loss": 0.1666, | |
| "mean_token_accuracy": 0.9644035375118256, | |
| "num_tokens": 6411487076.0, | |
| "step": 60550 | |
| }, | |
| { | |
| "entropy": 1.22125, | |
| "epoch": 1.4837667107389452, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 1.0282347395403978e-06, | |
| "loss": 0.1501, | |
| "mean_token_accuracy": 0.9667956507205964, | |
| "num_tokens": 6416699077.0, | |
| "step": 60600 | |
| }, | |
| { | |
| "entropy": 1.22921875, | |
| "epoch": 1.4849909406983008, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 1.0236619480248205e-06, | |
| "loss": 0.1649, | |
| "mean_token_accuracy": 0.9641565144062042, | |
| "num_tokens": 6421663477.0, | |
| "step": 60650 | |
| }, | |
| { | |
| "entropy": 1.223125, | |
| "epoch": 1.4862151706576563, | |
| "grad_norm": 0.012939453125, | |
| "learning_rate": 1.0190972547109352e-06, | |
| "loss": 0.1465, | |
| "mean_token_accuracy": 0.9683601307868958, | |
| "num_tokens": 6426657308.0, | |
| "step": 60700 | |
| }, | |
| { | |
| "entropy": 1.23296875, | |
| "epoch": 1.487439400617012, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 1.0145406783029337e-06, | |
| "loss": 0.1654, | |
| "mean_token_accuracy": 0.9649899744987488, | |
| "num_tokens": 6432023783.0, | |
| "step": 60750 | |
| }, | |
| { | |
| "entropy": 1.2375, | |
| "epoch": 1.4886636305763674, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 1.0099922374717499e-06, | |
| "loss": 0.162, | |
| "mean_token_accuracy": 0.9657110357284546, | |
| "num_tokens": 6437497556.0, | |
| "step": 60800 | |
| }, | |
| { | |
| "entropy": 1.2409375, | |
| "epoch": 1.489887860535723, | |
| "grad_norm": 3.96875, | |
| "learning_rate": 1.0054519508549797e-06, | |
| "loss": 0.177, | |
| "mean_token_accuracy": 0.9619574582576752, | |
| "num_tokens": 6443350702.0, | |
| "step": 60850 | |
| }, | |
| { | |
| "entropy": 1.23125, | |
| "epoch": 1.4911120904950785, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 1.0009198370568066e-06, | |
| "loss": 0.1627, | |
| "mean_token_accuracy": 0.964564654827118, | |
| "num_tokens": 6448658491.0, | |
| "step": 60900 | |
| }, | |
| { | |
| "entropy": 1.2153125, | |
| "epoch": 1.4923363204544342, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 9.96395914647927e-07, | |
| "loss": 0.1507, | |
| "mean_token_accuracy": 0.9671316814422607, | |
| "num_tokens": 6453556941.0, | |
| "step": 60950 | |
| }, | |
| { | |
| "entropy": 1.21390625, | |
| "epoch": 1.4935605504137897, | |
| "grad_norm": 2.5, | |
| "learning_rate": 9.91880202165471e-07, | |
| "loss": 0.1637, | |
| "mean_token_accuracy": 0.964778846502304, | |
| "num_tokens": 6458635664.0, | |
| "step": 61000 | |
| }, | |
| { | |
| "entropy": 1.23234375, | |
| "epoch": 1.4947847803731453, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 9.873727181129275e-07, | |
| "loss": 0.17, | |
| "mean_token_accuracy": 0.9645189070701599, | |
| "num_tokens": 6464088495.0, | |
| "step": 61050 | |
| }, | |
| { | |
| "entropy": 1.23078125, | |
| "epoch": 1.4960090103325008, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 9.828734809600687e-07, | |
| "loss": 0.1594, | |
| "mean_token_accuracy": 0.9656787288188934, | |
| "num_tokens": 6469234194.0, | |
| "step": 61100 | |
| }, | |
| { | |
| "entropy": 1.22921875, | |
| "epoch": 1.4972332402918564, | |
| "grad_norm": 2.25, | |
| "learning_rate": 9.783825091428782e-07, | |
| "loss": 0.1618, | |
| "mean_token_accuracy": 0.965996481180191, | |
| "num_tokens": 6474528140.0, | |
| "step": 61150 | |
| }, | |
| { | |
| "entropy": 1.2309375, | |
| "epoch": 1.498457470251212, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 9.738998210634644e-07, | |
| "loss": 0.1728, | |
| "mean_token_accuracy": 0.9626871156692505, | |
| "num_tokens": 6480082901.0, | |
| "step": 61200 | |
| }, | |
| { | |
| "entropy": 1.23, | |
| "epoch": 1.4996817002105676, | |
| "grad_norm": 3.796875, | |
| "learning_rate": 9.694254350900005e-07, | |
| "loss": 0.1585, | |
| "mean_token_accuracy": 0.9654109764099121, | |
| "num_tokens": 6485373470.0, | |
| "step": 61250 | |
| }, | |
| { | |
| "entropy": 1.24640625, | |
| "epoch": 1.5009059301699232, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 9.649593695566355e-07, | |
| "loss": 0.1673, | |
| "mean_token_accuracy": 0.9639886951446534, | |
| "num_tokens": 6490817618.0, | |
| "step": 61300 | |
| }, | |
| { | |
| "entropy": 1.2215625, | |
| "epoch": 1.5021301601292787, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 9.605016427634272e-07, | |
| "loss": 0.1513, | |
| "mean_token_accuracy": 0.9674781799316406, | |
| "num_tokens": 6495843357.0, | |
| "step": 61350 | |
| }, | |
| { | |
| "entropy": 1.2359375, | |
| "epoch": 1.5033543900886341, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 9.560522729762628e-07, | |
| "loss": 0.1621, | |
| "mean_token_accuracy": 0.96533607006073, | |
| "num_tokens": 6500949587.0, | |
| "step": 61400 | |
| }, | |
| { | |
| "entropy": 1.2359375, | |
| "epoch": 1.5045786200479898, | |
| "grad_norm": 2.875, | |
| "learning_rate": 9.516112784267896e-07, | |
| "loss": 0.1714, | |
| "mean_token_accuracy": 0.9637338280677795, | |
| "num_tokens": 6506340396.0, | |
| "step": 61450 | |
| }, | |
| { | |
| "entropy": 1.2215625, | |
| "epoch": 1.5058028500073455, | |
| "grad_norm": 4.65625, | |
| "learning_rate": 9.471786773123337e-07, | |
| "loss": 0.1591, | |
| "mean_token_accuracy": 0.9650114715099335, | |
| "num_tokens": 6511689926.0, | |
| "step": 61500 | |
| }, | |
| { | |
| "entropy": 1.243125, | |
| "epoch": 1.507027079966701, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 9.427544877958278e-07, | |
| "loss": 0.1678, | |
| "mean_token_accuracy": 0.9641639375686646, | |
| "num_tokens": 6517204008.0, | |
| "step": 61550 | |
| }, | |
| { | |
| "entropy": 1.21296875, | |
| "epoch": 1.5082513099260564, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 9.383387280057409e-07, | |
| "loss": 0.1615, | |
| "mean_token_accuracy": 0.9646773946285248, | |
| "num_tokens": 6522483140.0, | |
| "step": 61600 | |
| }, | |
| { | |
| "entropy": 1.23234375, | |
| "epoch": 1.509475539885412, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 9.339314160359977e-07, | |
| "loss": 0.1588, | |
| "mean_token_accuracy": 0.9658515179157257, | |
| "num_tokens": 6527644811.0, | |
| "step": 61650 | |
| }, | |
| { | |
| "entropy": 1.233125, | |
| "epoch": 1.5106997698447677, | |
| "grad_norm": 3.125, | |
| "learning_rate": 9.295325699459082e-07, | |
| "loss": 0.1629, | |
| "mean_token_accuracy": 0.9652837121486664, | |
| "num_tokens": 6532774529.0, | |
| "step": 61700 | |
| }, | |
| { | |
| "entropy": 1.22390625, | |
| "epoch": 1.5119239998041232, | |
| "grad_norm": 3.90625, | |
| "learning_rate": 9.251422077600911e-07, | |
| "loss": 0.1658, | |
| "mean_token_accuracy": 0.9642423093318939, | |
| "num_tokens": 6538188895.0, | |
| "step": 61750 | |
| }, | |
| { | |
| "entropy": 1.223125, | |
| "epoch": 1.5131482297634786, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 9.207603474684063e-07, | |
| "loss": 0.1576, | |
| "mean_token_accuracy": 0.9674744582176209, | |
| "num_tokens": 6543389288.0, | |
| "step": 61800 | |
| }, | |
| { | |
| "entropy": 1.22765625, | |
| "epoch": 1.5143724597228343, | |
| "grad_norm": 2.375, | |
| "learning_rate": 9.163870070258698e-07, | |
| "loss": 0.1563, | |
| "mean_token_accuracy": 0.9665237212181091, | |
| "num_tokens": 6548548612.0, | |
| "step": 61850 | |
| }, | |
| { | |
| "entropy": 1.22171875, | |
| "epoch": 1.51559668968219, | |
| "grad_norm": 3.375, | |
| "learning_rate": 9.120222043525931e-07, | |
| "loss": 0.1515, | |
| "mean_token_accuracy": 0.9670096004009247, | |
| "num_tokens": 6553657775.0, | |
| "step": 61900 | |
| }, | |
| { | |
| "entropy": 1.23703125, | |
| "epoch": 1.5168209196415454, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 9.076659573337e-07, | |
| "loss": 0.1619, | |
| "mean_token_accuracy": 0.9654546058177949, | |
| "num_tokens": 6559027325.0, | |
| "step": 61950 | |
| }, | |
| { | |
| "entropy": 1.22515625, | |
| "epoch": 1.5180451496009009, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 9.033182838192564e-07, | |
| "loss": 0.1595, | |
| "mean_token_accuracy": 0.9660532510280609, | |
| "num_tokens": 6564515287.0, | |
| "step": 62000 | |
| }, | |
| { | |
| "entropy": 1.236875, | |
| "epoch": 1.5192693795602565, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 8.98979201624201e-07, | |
| "loss": 0.161, | |
| "mean_token_accuracy": 0.9655505573749542, | |
| "num_tokens": 6569987402.0, | |
| "step": 62050 | |
| }, | |
| { | |
| "entropy": 1.249375, | |
| "epoch": 1.5204936095196122, | |
| "grad_norm": 3.125, | |
| "learning_rate": 8.946487285282659e-07, | |
| "loss": 0.1724, | |
| "mean_token_accuracy": 0.9626421999931335, | |
| "num_tokens": 6575526706.0, | |
| "step": 62100 | |
| }, | |
| { | |
| "entropy": 1.225, | |
| "epoch": 1.5217178394789679, | |
| "grad_norm": 3.65625, | |
| "learning_rate": 8.903268822759075e-07, | |
| "loss": 0.1615, | |
| "mean_token_accuracy": 0.966062741279602, | |
| "num_tokens": 6580795009.0, | |
| "step": 62150 | |
| }, | |
| { | |
| "entropy": 1.23375, | |
| "epoch": 1.5229420694383233, | |
| "grad_norm": 4.5, | |
| "learning_rate": 8.860136805762319e-07, | |
| "loss": 0.1617, | |
| "mean_token_accuracy": 0.9658437705039978, | |
| "num_tokens": 6586016211.0, | |
| "step": 62200 | |
| }, | |
| { | |
| "entropy": 1.2359375, | |
| "epoch": 1.5241662993976788, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 8.817091411029271e-07, | |
| "loss": 0.1593, | |
| "mean_token_accuracy": 0.966154944896698, | |
| "num_tokens": 6591160444.0, | |
| "step": 62250 | |
| }, | |
| { | |
| "entropy": 1.2134375, | |
| "epoch": 1.5253905293570345, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 8.774132814941828e-07, | |
| "loss": 0.1579, | |
| "mean_token_accuracy": 0.9668165516853332, | |
| "num_tokens": 6596300228.0, | |
| "step": 62300 | |
| }, | |
| { | |
| "entropy": 1.21953125, | |
| "epoch": 1.5266147593163901, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 8.731261193526248e-07, | |
| "loss": 0.1586, | |
| "mean_token_accuracy": 0.9657115602493286, | |
| "num_tokens": 6601689242.0, | |
| "step": 62350 | |
| }, | |
| { | |
| "entropy": 1.2521875, | |
| "epoch": 1.5278389892757456, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 8.688476722452379e-07, | |
| "loss": 0.1732, | |
| "mean_token_accuracy": 0.9633473336696625, | |
| "num_tokens": 6607301069.0, | |
| "step": 62400 | |
| }, | |
| { | |
| "entropy": 1.23234375, | |
| "epoch": 1.529063219235101, | |
| "grad_norm": 2.0, | |
| "learning_rate": 8.645779577033011e-07, | |
| "loss": 0.1655, | |
| "mean_token_accuracy": 0.9651632213592529, | |
| "num_tokens": 6612690182.0, | |
| "step": 62450 | |
| }, | |
| { | |
| "entropy": 1.22234375, | |
| "epoch": 1.5302874491944567, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 8.603169932223042e-07, | |
| "loss": 0.1644, | |
| "mean_token_accuracy": 0.9645105350017548, | |
| "num_tokens": 6618066965.0, | |
| "step": 62500 | |
| }, | |
| { | |
| "entropy": 1.2134375, | |
| "epoch": 1.5315116791538124, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 8.560647962618894e-07, | |
| "loss": 0.1473, | |
| "mean_token_accuracy": 0.9680246078968048, | |
| "num_tokens": 6623009283.0, | |
| "step": 62550 | |
| }, | |
| { | |
| "entropy": 1.2359375, | |
| "epoch": 1.5327359091131678, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 8.518213842457696e-07, | |
| "loss": 0.1684, | |
| "mean_token_accuracy": 0.9639063477516174, | |
| "num_tokens": 6628694150.0, | |
| "step": 62600 | |
| }, | |
| { | |
| "entropy": 1.2365625, | |
| "epoch": 1.5339601390725233, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 8.475867745616605e-07, | |
| "loss": 0.1699, | |
| "mean_token_accuracy": 0.9639629209041596, | |
| "num_tokens": 6634163539.0, | |
| "step": 62650 | |
| }, | |
| { | |
| "entropy": 1.23515625, | |
| "epoch": 1.535184369031879, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 8.433609845612123e-07, | |
| "loss": 0.1681, | |
| "mean_token_accuracy": 0.9637242484092713, | |
| "num_tokens": 6639673078.0, | |
| "step": 62700 | |
| }, | |
| { | |
| "entropy": 1.21796875, | |
| "epoch": 1.5364085989912346, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 8.39144031559933e-07, | |
| "loss": 0.1653, | |
| "mean_token_accuracy": 0.9641383695602417, | |
| "num_tokens": 6645021375.0, | |
| "step": 62750 | |
| }, | |
| { | |
| "entropy": 1.218125, | |
| "epoch": 1.53763282895059, | |
| "grad_norm": 3.65625, | |
| "learning_rate": 8.349359328371241e-07, | |
| "loss": 0.1557, | |
| "mean_token_accuracy": 0.9672486507892608, | |
| "num_tokens": 6650282385.0, | |
| "step": 62800 | |
| }, | |
| { | |
| "entropy": 1.24453125, | |
| "epoch": 1.5388570589099455, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 8.307367056357993e-07, | |
| "loss": 0.1744, | |
| "mean_token_accuracy": 0.9627921509742737, | |
| "num_tokens": 6655617849.0, | |
| "step": 62850 | |
| }, | |
| { | |
| "entropy": 1.2384375, | |
| "epoch": 1.5400812888693012, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 8.265463671626277e-07, | |
| "loss": 0.1643, | |
| "mean_token_accuracy": 0.9646277320384979, | |
| "num_tokens": 6660898400.0, | |
| "step": 62900 | |
| }, | |
| { | |
| "entropy": 1.2315625, | |
| "epoch": 1.5413055188286569, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 8.223649345878521e-07, | |
| "loss": 0.1595, | |
| "mean_token_accuracy": 0.9663047862052917, | |
| "num_tokens": 6666546321.0, | |
| "step": 62950 | |
| }, | |
| { | |
| "entropy": 1.21890625, | |
| "epoch": 1.5425297487880123, | |
| "grad_norm": 0.01312255859375, | |
| "learning_rate": 8.181924250452234e-07, | |
| "loss": 0.1479, | |
| "mean_token_accuracy": 0.9685621929168701, | |
| "num_tokens": 6671900409.0, | |
| "step": 63000 | |
| }, | |
| { | |
| "entropy": 1.22140625, | |
| "epoch": 1.5437539787473677, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 8.140288556319295e-07, | |
| "loss": 0.1564, | |
| "mean_token_accuracy": 0.9663173937797547, | |
| "num_tokens": 6676916235.0, | |
| "step": 63050 | |
| }, | |
| { | |
| "entropy": 1.2315625, | |
| "epoch": 1.5449782087067234, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 8.098742434085274e-07, | |
| "loss": 0.1619, | |
| "mean_token_accuracy": 0.9653417527675628, | |
| "num_tokens": 6681811077.0, | |
| "step": 63100 | |
| }, | |
| { | |
| "entropy": 1.2278125, | |
| "epoch": 1.546202438666079, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 8.057286053988688e-07, | |
| "loss": 0.155, | |
| "mean_token_accuracy": 0.9668863129615783, | |
| "num_tokens": 6687079259.0, | |
| "step": 63150 | |
| }, | |
| { | |
| "entropy": 1.23734375, | |
| "epoch": 1.5474266686254348, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 8.015919585900328e-07, | |
| "loss": 0.1698, | |
| "mean_token_accuracy": 0.9634287714958191, | |
| "num_tokens": 6692413841.0, | |
| "step": 63200 | |
| }, | |
| { | |
| "entropy": 1.20328125, | |
| "epoch": 1.5486508985847902, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 7.974643199322591e-07, | |
| "loss": 0.1459, | |
| "mean_token_accuracy": 0.9686257600784302, | |
| "num_tokens": 6697530112.0, | |
| "step": 63250 | |
| }, | |
| { | |
| "entropy": 1.238125, | |
| "epoch": 1.5498751285441457, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 7.933457063388733e-07, | |
| "loss": 0.171, | |
| "mean_token_accuracy": 0.9629907369613647, | |
| "num_tokens": 6702988908.0, | |
| "step": 63300 | |
| }, | |
| { | |
| "entropy": 1.215625, | |
| "epoch": 1.5510993585035013, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 7.892361346862206e-07, | |
| "loss": 0.1588, | |
| "mean_token_accuracy": 0.9652132534980774, | |
| "num_tokens": 6708127766.0, | |
| "step": 63350 | |
| }, | |
| { | |
| "entropy": 1.216875, | |
| "epoch": 1.552323588462857, | |
| "grad_norm": 3.546875, | |
| "learning_rate": 7.851356218135953e-07, | |
| "loss": 0.1565, | |
| "mean_token_accuracy": 0.9663667130470276, | |
| "num_tokens": 6713202542.0, | |
| "step": 63400 | |
| }, | |
| { | |
| "entropy": 1.2165625, | |
| "epoch": 1.5535478184222125, | |
| "grad_norm": 3.546875, | |
| "learning_rate": 7.810441845231768e-07, | |
| "loss": 0.1562, | |
| "mean_token_accuracy": 0.9665763390064239, | |
| "num_tokens": 6718170250.0, | |
| "step": 63450 | |
| }, | |
| { | |
| "entropy": 1.239375, | |
| "epoch": 1.554772048381568, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 7.769618395799495e-07, | |
| "loss": 0.1701, | |
| "mean_token_accuracy": 0.9642471766471863, | |
| "num_tokens": 6723417011.0, | |
| "step": 63500 | |
| }, | |
| { | |
| "entropy": 1.20984375, | |
| "epoch": 1.5559962783409236, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 7.728886037116482e-07, | |
| "loss": 0.1445, | |
| "mean_token_accuracy": 0.9684971439838409, | |
| "num_tokens": 6728453094.0, | |
| "step": 63550 | |
| }, | |
| { | |
| "entropy": 1.22625, | |
| "epoch": 1.5572205083002792, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 7.688244936086779e-07, | |
| "loss": 0.1591, | |
| "mean_token_accuracy": 0.9653982555866242, | |
| "num_tokens": 6733460582.0, | |
| "step": 63600 | |
| }, | |
| { | |
| "entropy": 1.23765625, | |
| "epoch": 1.5584447382596347, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 7.64769525924052e-07, | |
| "loss": 0.1631, | |
| "mean_token_accuracy": 0.9650383579730988, | |
| "num_tokens": 6739025377.0, | |
| "step": 63650 | |
| }, | |
| { | |
| "entropy": 1.241875, | |
| "epoch": 1.5596689682189901, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 7.607237172733212e-07, | |
| "loss": 0.1629, | |
| "mean_token_accuracy": 0.9644639611244201, | |
| "num_tokens": 6744632607.0, | |
| "step": 63700 | |
| }, | |
| { | |
| "entropy": 1.21015625, | |
| "epoch": 1.5608931981783458, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 7.566870842345078e-07, | |
| "loss": 0.1438, | |
| "mean_token_accuracy": 0.9694548106193542, | |
| "num_tokens": 6749711105.0, | |
| "step": 63750 | |
| }, | |
| { | |
| "entropy": 1.22625, | |
| "epoch": 1.5621174281377015, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 7.526596433480352e-07, | |
| "loss": 0.162, | |
| "mean_token_accuracy": 0.9650256216526032, | |
| "num_tokens": 6755001114.0, | |
| "step": 63800 | |
| }, | |
| { | |
| "entropy": 1.2203125, | |
| "epoch": 1.563341658097057, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 7.486414111166603e-07, | |
| "loss": 0.1585, | |
| "mean_token_accuracy": 0.9653235769271851, | |
| "num_tokens": 6760148593.0, | |
| "step": 63850 | |
| }, | |
| { | |
| "entropy": 1.22015625, | |
| "epoch": 1.5645658880564124, | |
| "grad_norm": 4.15625, | |
| "learning_rate": 7.446324040054098e-07, | |
| "loss": 0.1545, | |
| "mean_token_accuracy": 0.9676208901405334, | |
| "num_tokens": 6765196202.0, | |
| "step": 63900 | |
| }, | |
| { | |
| "entropy": 1.2396875, | |
| "epoch": 1.565790118015768, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 7.406326384415069e-07, | |
| "loss": 0.1645, | |
| "mean_token_accuracy": 0.964854439496994, | |
| "num_tokens": 6770864758.0, | |
| "step": 63950 | |
| }, | |
| { | |
| "entropy": 1.23265625, | |
| "epoch": 1.5670143479751237, | |
| "grad_norm": 4.53125, | |
| "learning_rate": 7.366421308143074e-07, | |
| "loss": 0.1678, | |
| "mean_token_accuracy": 0.9636308062076568, | |
| "num_tokens": 6776309871.0, | |
| "step": 64000 | |
| }, | |
| { | |
| "entropy": 1.22203125, | |
| "epoch": 1.5682385779344792, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 7.326608974752318e-07, | |
| "loss": 0.1537, | |
| "mean_token_accuracy": 0.9670063924789428, | |
| "num_tokens": 6781591477.0, | |
| "step": 64050 | |
| }, | |
| { | |
| "entropy": 1.23421875, | |
| "epoch": 1.5694628078938346, | |
| "grad_norm": 3.671875, | |
| "learning_rate": 7.286889547377019e-07, | |
| "loss": 0.1576, | |
| "mean_token_accuracy": 0.9661747896671296, | |
| "num_tokens": 6787008758.0, | |
| "step": 64100 | |
| }, | |
| { | |
| "entropy": 1.2321875, | |
| "epoch": 1.5706870378531903, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 7.247263188770635e-07, | |
| "loss": 0.1658, | |
| "mean_token_accuracy": 0.9641131579875946, | |
| "num_tokens": 6792453198.0, | |
| "step": 64150 | |
| }, | |
| { | |
| "entropy": 1.23484375, | |
| "epoch": 1.571911267812546, | |
| "grad_norm": 3.59375, | |
| "learning_rate": 7.207730061305342e-07, | |
| "loss": 0.1715, | |
| "mean_token_accuracy": 0.9631493031978607, | |
| "num_tokens": 6798199941.0, | |
| "step": 64200 | |
| }, | |
| { | |
| "entropy": 1.2353125, | |
| "epoch": 1.5731354977719014, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 7.168290326971248e-07, | |
| "loss": 0.1629, | |
| "mean_token_accuracy": 0.9649174082279205, | |
| "num_tokens": 6803443062.0, | |
| "step": 64250 | |
| }, | |
| { | |
| "entropy": 1.220625, | |
| "epoch": 1.5743597277312569, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 7.128944147375779e-07, | |
| "loss": 0.1518, | |
| "mean_token_accuracy": 0.967359025478363, | |
| "num_tokens": 6808707076.0, | |
| "step": 64300 | |
| }, | |
| { | |
| "entropy": 1.2209375, | |
| "epoch": 1.5755839576906125, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 7.08969168374304e-07, | |
| "loss": 0.1596, | |
| "mean_token_accuracy": 0.9663796508312226, | |
| "num_tokens": 6813958298.0, | |
| "step": 64350 | |
| }, | |
| { | |
| "entropy": 1.2228125, | |
| "epoch": 1.5768081876499682, | |
| "grad_norm": 3.828125, | |
| "learning_rate": 7.050533096913104e-07, | |
| "loss": 0.162, | |
| "mean_token_accuracy": 0.9654451417922973, | |
| "num_tokens": 6819296053.0, | |
| "step": 64400 | |
| }, | |
| { | |
| "entropy": 1.228125, | |
| "epoch": 1.578032417609324, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 7.011468547341376e-07, | |
| "loss": 0.1488, | |
| "mean_token_accuracy": 0.9677229869365692, | |
| "num_tokens": 6824596472.0, | |
| "step": 64450 | |
| }, | |
| { | |
| "entropy": 1.23953125, | |
| "epoch": 1.5792566475686793, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 6.972498195097937e-07, | |
| "loss": 0.1723, | |
| "mean_token_accuracy": 0.962650990486145, | |
| "num_tokens": 6830407037.0, | |
| "step": 64500 | |
| }, | |
| { | |
| "entropy": 1.230625, | |
| "epoch": 1.5804808775280348, | |
| "grad_norm": 2.375, | |
| "learning_rate": 6.933622199866912e-07, | |
| "loss": 0.1624, | |
| "mean_token_accuracy": 0.9654111993312836, | |
| "num_tokens": 6835900402.0, | |
| "step": 64550 | |
| }, | |
| { | |
| "entropy": 1.234375, | |
| "epoch": 1.5817051074873905, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 6.894840720945754e-07, | |
| "loss": 0.1665, | |
| "mean_token_accuracy": 0.9645081627368927, | |
| "num_tokens": 6841235827.0, | |
| "step": 64600 | |
| }, | |
| { | |
| "entropy": 1.21890625, | |
| "epoch": 1.5829293374467461, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 6.856153917244647e-07, | |
| "loss": 0.1611, | |
| "mean_token_accuracy": 0.9654888653755188, | |
| "num_tokens": 6846579737.0, | |
| "step": 64650 | |
| }, | |
| { | |
| "entropy": 1.2153125, | |
| "epoch": 1.5841535674061016, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 6.81756194728583e-07, | |
| "loss": 0.1546, | |
| "mean_token_accuracy": 0.9667556810379029, | |
| "num_tokens": 6851881949.0, | |
| "step": 64700 | |
| }, | |
| { | |
| "entropy": 1.22421875, | |
| "epoch": 1.585377797365457, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 6.779064969202973e-07, | |
| "loss": 0.1583, | |
| "mean_token_accuracy": 0.966250067949295, | |
| "num_tokens": 6857094183.0, | |
| "step": 64750 | |
| }, | |
| { | |
| "entropy": 1.2265625, | |
| "epoch": 1.5866020273248127, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 6.740663140740467e-07, | |
| "loss": 0.163, | |
| "mean_token_accuracy": 0.9652321350574493, | |
| "num_tokens": 6862381095.0, | |
| "step": 64800 | |
| }, | |
| { | |
| "entropy": 1.2184375, | |
| "epoch": 1.5878262572841684, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 6.70235661925287e-07, | |
| "loss": 0.1594, | |
| "mean_token_accuracy": 0.965182945728302, | |
| "num_tokens": 6867345829.0, | |
| "step": 64850 | |
| }, | |
| { | |
| "entropy": 1.22640625, | |
| "epoch": 1.5890504872435238, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 6.664145561704173e-07, | |
| "loss": 0.1548, | |
| "mean_token_accuracy": 0.9668359410762787, | |
| "num_tokens": 6872899925.0, | |
| "step": 64900 | |
| }, | |
| { | |
| "entropy": 1.23359375, | |
| "epoch": 1.5902747172028793, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 6.626030124667204e-07, | |
| "loss": 0.1695, | |
| "mean_token_accuracy": 0.9634568047523498, | |
| "num_tokens": 6878428253.0, | |
| "step": 64950 | |
| }, | |
| { | |
| "entropy": 1.22609375, | |
| "epoch": 1.591498947162235, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 6.588010464323006e-07, | |
| "loss": 0.1689, | |
| "mean_token_accuracy": 0.9639648401737213, | |
| "num_tokens": 6883915733.0, | |
| "step": 65000 | |
| }, | |
| { | |
| "entropy": 1.22859375, | |
| "epoch": 1.5927231771215906, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 6.550086736460136e-07, | |
| "loss": 0.1719, | |
| "mean_token_accuracy": 0.9634046721458435, | |
| "num_tokens": 6889133852.0, | |
| "step": 65050 | |
| }, | |
| { | |
| "entropy": 1.23578125, | |
| "epoch": 1.593947407080946, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 6.512259096474075e-07, | |
| "loss": 0.1729, | |
| "mean_token_accuracy": 0.9630839240550995, | |
| "num_tokens": 6894861703.0, | |
| "step": 65100 | |
| }, | |
| { | |
| "entropy": 1.21921875, | |
| "epoch": 1.5951716370403015, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 6.474527699366567e-07, | |
| "loss": 0.1599, | |
| "mean_token_accuracy": 0.965704824924469, | |
| "num_tokens": 6899940861.0, | |
| "step": 65150 | |
| }, | |
| { | |
| "entropy": 1.21625, | |
| "epoch": 1.5963958669996572, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 6.436892699745009e-07, | |
| "loss": 0.1572, | |
| "mean_token_accuracy": 0.9666438150405884, | |
| "num_tokens": 6905083361.0, | |
| "step": 65200 | |
| }, | |
| { | |
| "entropy": 1.2153125, | |
| "epoch": 1.5976200969590129, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 6.399354251821792e-07, | |
| "loss": 0.1554, | |
| "mean_token_accuracy": 0.9674275135993957, | |
| "num_tokens": 6910092703.0, | |
| "step": 65250 | |
| }, | |
| { | |
| "entropy": 1.22984375, | |
| "epoch": 1.5988443269183683, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 6.361912509413676e-07, | |
| "loss": 0.1645, | |
| "mean_token_accuracy": 0.9646131348609924, | |
| "num_tokens": 6915320978.0, | |
| "step": 65300 | |
| }, | |
| { | |
| "entropy": 1.22984375, | |
| "epoch": 1.6000685568777238, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 6.32456762594116e-07, | |
| "loss": 0.1594, | |
| "mean_token_accuracy": 0.9651407063007355, | |
| "num_tokens": 6920827957.0, | |
| "step": 65350 | |
| }, | |
| { | |
| "entropy": 1.21140625, | |
| "epoch": 1.6012927868370794, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 6.287319754427873e-07, | |
| "loss": 0.1533, | |
| "mean_token_accuracy": 0.9665750122070312, | |
| "num_tokens": 6926133415.0, | |
| "step": 65400 | |
| }, | |
| { | |
| "entropy": 1.22109375, | |
| "epoch": 1.602517016796435, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 6.250169047499916e-07, | |
| "loss": 0.1563, | |
| "mean_token_accuracy": 0.9660931730270386, | |
| "num_tokens": 6931165132.0, | |
| "step": 65450 | |
| }, | |
| { | |
| "entropy": 1.2040625, | |
| "epoch": 1.6037412467557908, | |
| "grad_norm": 3.890625, | |
| "learning_rate": 6.213115657385244e-07, | |
| "loss": 0.1473, | |
| "mean_token_accuracy": 0.9677533149719239, | |
| "num_tokens": 6936236474.0, | |
| "step": 65500 | |
| }, | |
| { | |
| "entropy": 1.22515625, | |
| "epoch": 1.6049654767151462, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 6.176159735913079e-07, | |
| "loss": 0.1698, | |
| "mean_token_accuracy": 0.9640149748325348, | |
| "num_tokens": 6941667389.0, | |
| "step": 65550 | |
| }, | |
| { | |
| "entropy": 1.210625, | |
| "epoch": 1.6061897066745017, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 6.139301434513204e-07, | |
| "loss": 0.1495, | |
| "mean_token_accuracy": 0.9672247707843781, | |
| "num_tokens": 6947023413.0, | |
| "step": 65600 | |
| }, | |
| { | |
| "entropy": 1.21921875, | |
| "epoch": 1.6074139366338573, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 6.102540904215455e-07, | |
| "loss": 0.1579, | |
| "mean_token_accuracy": 0.9656173276901245, | |
| "num_tokens": 6952441096.0, | |
| "step": 65650 | |
| }, | |
| { | |
| "entropy": 1.223125, | |
| "epoch": 1.608638166593213, | |
| "grad_norm": 3.71875, | |
| "learning_rate": 6.065878295649004e-07, | |
| "loss": 0.166, | |
| "mean_token_accuracy": 0.9646958529949188, | |
| "num_tokens": 6957942190.0, | |
| "step": 65700 | |
| }, | |
| { | |
| "entropy": 1.2084375, | |
| "epoch": 1.6098623965525685, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 6.0293137590418e-07, | |
| "loss": 0.15, | |
| "mean_token_accuracy": 0.9669717216491699, | |
| "num_tokens": 6963300846.0, | |
| "step": 65750 | |
| }, | |
| { | |
| "entropy": 1.22921875, | |
| "epoch": 1.611086626511924, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 5.992847444219915e-07, | |
| "loss": 0.1614, | |
| "mean_token_accuracy": 0.9650086772441864, | |
| "num_tokens": 6968779335.0, | |
| "step": 65800 | |
| }, | |
| { | |
| "entropy": 1.22625, | |
| "epoch": 1.6123108564712796, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 5.956479500606977e-07, | |
| "loss": 0.171, | |
| "mean_token_accuracy": 0.9639155077934265, | |
| "num_tokens": 6974202109.0, | |
| "step": 65850 | |
| }, | |
| { | |
| "entropy": 1.21328125, | |
| "epoch": 1.6135350864306353, | |
| "grad_norm": 3.375, | |
| "learning_rate": 5.920210077223508e-07, | |
| "loss": 0.1488, | |
| "mean_token_accuracy": 0.9683645820617676, | |
| "num_tokens": 6979171497.0, | |
| "step": 65900 | |
| }, | |
| { | |
| "entropy": 1.21875, | |
| "epoch": 1.6147593163899907, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 5.884039322686345e-07, | |
| "loss": 0.1593, | |
| "mean_token_accuracy": 0.9662387585639953, | |
| "num_tokens": 6984410380.0, | |
| "step": 65950 | |
| }, | |
| { | |
| "entropy": 1.198125, | |
| "epoch": 1.6159835463493462, | |
| "grad_norm": 2.0, | |
| "learning_rate": 5.847967385208012e-07, | |
| "loss": 0.1521, | |
| "mean_token_accuracy": 0.966891850233078, | |
| "num_tokens": 6989408812.0, | |
| "step": 66000 | |
| }, | |
| { | |
| "entropy": 1.20296875, | |
| "epoch": 1.6172077763087018, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 5.81199441259614e-07, | |
| "loss": 0.1509, | |
| "mean_token_accuracy": 0.9681426680088043, | |
| "num_tokens": 6994432516.0, | |
| "step": 66050 | |
| }, | |
| { | |
| "entropy": 1.225625, | |
| "epoch": 1.6184320062680575, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 5.776120552252833e-07, | |
| "loss": 0.1638, | |
| "mean_token_accuracy": 0.965145457983017, | |
| "num_tokens": 6999763932.0, | |
| "step": 66100 | |
| }, | |
| { | |
| "entropy": 1.22421875, | |
| "epoch": 1.619656236227413, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 5.740345951174062e-07, | |
| "loss": 0.1654, | |
| "mean_token_accuracy": 0.9642065274715423, | |
| "num_tokens": 7005089905.0, | |
| "step": 66150 | |
| }, | |
| { | |
| "entropy": 1.238125, | |
| "epoch": 1.6208804661867684, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 5.704670755949111e-07, | |
| "loss": 0.1742, | |
| "mean_token_accuracy": 0.962605128288269, | |
| "num_tokens": 7010758688.0, | |
| "step": 66200 | |
| }, | |
| { | |
| "entropy": 1.2284375, | |
| "epoch": 1.622104696146124, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 5.669095112759893e-07, | |
| "loss": 0.1699, | |
| "mean_token_accuracy": 0.9639213311672211, | |
| "num_tokens": 7015757555.0, | |
| "step": 66250 | |
| }, | |
| { | |
| "entropy": 1.215, | |
| "epoch": 1.6233289261054797, | |
| "grad_norm": 3.609375, | |
| "learning_rate": 5.633619167380439e-07, | |
| "loss": 0.1542, | |
| "mean_token_accuracy": 0.9669547820091248, | |
| "num_tokens": 7020934918.0, | |
| "step": 66300 | |
| }, | |
| { | |
| "entropy": 1.20421875, | |
| "epoch": 1.6245531560648352, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 5.598243065176243e-07, | |
| "loss": 0.1491, | |
| "mean_token_accuracy": 0.9682400977611542, | |
| "num_tokens": 7026062287.0, | |
| "step": 66350 | |
| }, | |
| { | |
| "entropy": 1.224375, | |
| "epoch": 1.6257773860241906, | |
| "grad_norm": 3.328125, | |
| "learning_rate": 5.56296695110368e-07, | |
| "loss": 0.1563, | |
| "mean_token_accuracy": 0.965864109992981, | |
| "num_tokens": 7031243491.0, | |
| "step": 66400 | |
| }, | |
| { | |
| "entropy": 1.21640625, | |
| "epoch": 1.6270016159835463, | |
| "grad_norm": 1.875, | |
| "learning_rate": 5.527790969709421e-07, | |
| "loss": 0.1591, | |
| "mean_token_accuracy": 0.9661051654815673, | |
| "num_tokens": 7036518719.0, | |
| "step": 66450 | |
| }, | |
| { | |
| "entropy": 1.21765625, | |
| "epoch": 1.628225845942902, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 5.492715265129842e-07, | |
| "loss": 0.1526, | |
| "mean_token_accuracy": 0.967378306388855, | |
| "num_tokens": 7041605356.0, | |
| "step": 66500 | |
| }, | |
| { | |
| "entropy": 1.22578125, | |
| "epoch": 1.6294500759022574, | |
| "grad_norm": 3.25, | |
| "learning_rate": 5.457739981090422e-07, | |
| "loss": 0.1608, | |
| "mean_token_accuracy": 0.965805538892746, | |
| "num_tokens": 7047131119.0, | |
| "step": 66550 | |
| }, | |
| { | |
| "entropy": 1.22296875, | |
| "epoch": 1.6306743058616129, | |
| "grad_norm": 3.9375, | |
| "learning_rate": 5.422865260905141e-07, | |
| "loss": 0.162, | |
| "mean_token_accuracy": 0.9653668451309204, | |
| "num_tokens": 7052461810.0, | |
| "step": 66600 | |
| }, | |
| { | |
| "entropy": 1.2321875, | |
| "epoch": 1.6318985358209686, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 5.388091247475948e-07, | |
| "loss": 0.1674, | |
| "mean_token_accuracy": 0.9641144728660583, | |
| "num_tokens": 7057861665.0, | |
| "step": 66650 | |
| }, | |
| { | |
| "entropy": 1.22, | |
| "epoch": 1.6331227657803242, | |
| "grad_norm": 4.875, | |
| "learning_rate": 5.35341808329211e-07, | |
| "loss": 0.1612, | |
| "mean_token_accuracy": 0.9650032806396485, | |
| "num_tokens": 7063074323.0, | |
| "step": 66700 | |
| }, | |
| { | |
| "entropy": 1.2290625, | |
| "epoch": 1.63434699573968, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 5.31884591042966e-07, | |
| "loss": 0.1642, | |
| "mean_token_accuracy": 0.9645574033260346, | |
| "num_tokens": 7068850662.0, | |
| "step": 66750 | |
| }, | |
| { | |
| "entropy": 1.21140625, | |
| "epoch": 1.6355712256990353, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 5.284374870550806e-07, | |
| "loss": 0.1513, | |
| "mean_token_accuracy": 0.9664854764938354, | |
| "num_tokens": 7073845156.0, | |
| "step": 66800 | |
| }, | |
| { | |
| "entropy": 1.2134375, | |
| "epoch": 1.6367954556583908, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 5.250005104903391e-07, | |
| "loss": 0.1526, | |
| "mean_token_accuracy": 0.9672818171977997, | |
| "num_tokens": 7078890553.0, | |
| "step": 66850 | |
| }, | |
| { | |
| "entropy": 1.21890625, | |
| "epoch": 1.6380196856177465, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 5.215736754320221e-07, | |
| "loss": 0.1559, | |
| "mean_token_accuracy": 0.9661696362495422, | |
| "num_tokens": 7084113116.0, | |
| "step": 66900 | |
| }, | |
| { | |
| "entropy": 1.2209375, | |
| "epoch": 1.6392439155771021, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 5.181569959218593e-07, | |
| "loss": 0.1537, | |
| "mean_token_accuracy": 0.9654488229751587, | |
| "num_tokens": 7089341607.0, | |
| "step": 66950 | |
| }, | |
| { | |
| "entropy": 1.21953125, | |
| "epoch": 1.6404681455364576, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 5.147504859599658e-07, | |
| "loss": 0.1627, | |
| "mean_token_accuracy": 0.9644181895256042, | |
| "num_tokens": 7094625061.0, | |
| "step": 67000 | |
| }, | |
| { | |
| "entropy": 1.226875, | |
| "epoch": 1.641692375495813, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 5.113541595047853e-07, | |
| "loss": 0.1638, | |
| "mean_token_accuracy": 0.9646450591087341, | |
| "num_tokens": 7100017216.0, | |
| "step": 67050 | |
| }, | |
| { | |
| "entropy": 1.22828125, | |
| "epoch": 1.6429166054551687, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 5.079680304730336e-07, | |
| "loss": 0.1632, | |
| "mean_token_accuracy": 0.9642895436286927, | |
| "num_tokens": 7105647390.0, | |
| "step": 67100 | |
| }, | |
| { | |
| "entropy": 1.2190625, | |
| "epoch": 1.6441408354145244, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 5.045921127396446e-07, | |
| "loss": 0.1568, | |
| "mean_token_accuracy": 0.9664517366886138, | |
| "num_tokens": 7111038795.0, | |
| "step": 67150 | |
| }, | |
| { | |
| "entropy": 1.20453125, | |
| "epoch": 1.6453650653738798, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 5.012264201377073e-07, | |
| "loss": 0.1546, | |
| "mean_token_accuracy": 0.9667070829868316, | |
| "num_tokens": 7116213641.0, | |
| "step": 67200 | |
| }, | |
| { | |
| "entropy": 1.22828125, | |
| "epoch": 1.6465892953332353, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 4.978709664584132e-07, | |
| "loss": 0.1502, | |
| "mean_token_accuracy": 0.9669265413284301, | |
| "num_tokens": 7121369080.0, | |
| "step": 67250 | |
| }, | |
| { | |
| "entropy": 1.2240625, | |
| "epoch": 1.647813525292591, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 4.945257654510013e-07, | |
| "loss": 0.1614, | |
| "mean_token_accuracy": 0.966176050901413, | |
| "num_tokens": 7126738052.0, | |
| "step": 67300 | |
| }, | |
| { | |
| "entropy": 1.21375, | |
| "epoch": 1.6490377552519466, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 4.911908308226965e-07, | |
| "loss": 0.1425, | |
| "mean_token_accuracy": 0.969027806520462, | |
| "num_tokens": 7131902692.0, | |
| "step": 67350 | |
| }, | |
| { | |
| "entropy": 1.20609375, | |
| "epoch": 1.650261985211302, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 4.878661762386575e-07, | |
| "loss": 0.1494, | |
| "mean_token_accuracy": 0.966635344028473, | |
| "num_tokens": 7136808281.0, | |
| "step": 67400 | |
| }, | |
| { | |
| "entropy": 1.2134375, | |
| "epoch": 1.6514862151706575, | |
| "grad_norm": 3.921875, | |
| "learning_rate": 4.845518153219194e-07, | |
| "loss": 0.1536, | |
| "mean_token_accuracy": 0.9664989912509918, | |
| "num_tokens": 7141996551.0, | |
| "step": 67450 | |
| }, | |
| { | |
| "entropy": 1.2096875, | |
| "epoch": 1.6527104451300132, | |
| "grad_norm": 2.875, | |
| "learning_rate": 4.812477616533406e-07, | |
| "loss": 0.1517, | |
| "mean_token_accuracy": 0.9665413784980774, | |
| "num_tokens": 7146993092.0, | |
| "step": 67500 | |
| }, | |
| { | |
| "entropy": 1.209375, | |
| "epoch": 1.6539346750893689, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 4.779540287715394e-07, | |
| "loss": 0.1583, | |
| "mean_token_accuracy": 0.965690256357193, | |
| "num_tokens": 7152324580.0, | |
| "step": 67550 | |
| }, | |
| { | |
| "entropy": 1.2259375, | |
| "epoch": 1.6551589050487243, | |
| "grad_norm": 3.828125, | |
| "learning_rate": 4.7467063017285005e-07, | |
| "loss": 0.1632, | |
| "mean_token_accuracy": 0.9648753714561462, | |
| "num_tokens": 7157642715.0, | |
| "step": 67600 | |
| }, | |
| { | |
| "entropy": 1.21328125, | |
| "epoch": 1.6563831350080798, | |
| "grad_norm": 3.4375, | |
| "learning_rate": 4.713975793112569e-07, | |
| "loss": 0.1542, | |
| "mean_token_accuracy": 0.9669430148601532, | |
| "num_tokens": 7162998030.0, | |
| "step": 67650 | |
| }, | |
| { | |
| "entropy": 1.185625, | |
| "epoch": 1.6576073649674354, | |
| "grad_norm": 3.953125, | |
| "learning_rate": 4.681348895983448e-07, | |
| "loss": 0.1379, | |
| "mean_token_accuracy": 0.9700025701522828, | |
| "num_tokens": 7167607013.0, | |
| "step": 67700 | |
| }, | |
| { | |
| "entropy": 1.2225, | |
| "epoch": 1.658831594926791, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 4.648825744032449e-07, | |
| "loss": 0.1614, | |
| "mean_token_accuracy": 0.9637822723388672, | |
| "num_tokens": 7172916071.0, | |
| "step": 67750 | |
| }, | |
| { | |
| "entropy": 1.22109375, | |
| "epoch": 1.6600558248861468, | |
| "grad_norm": 0.003997802734375, | |
| "learning_rate": 4.6164064705257424e-07, | |
| "loss": 0.1604, | |
| "mean_token_accuracy": 0.9653963768482208, | |
| "num_tokens": 7178344100.0, | |
| "step": 67800 | |
| }, | |
| { | |
| "entropy": 1.21921875, | |
| "epoch": 1.6612800548455022, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 4.584091208303891e-07, | |
| "loss": 0.1583, | |
| "mean_token_accuracy": 0.9654520618915557, | |
| "num_tokens": 7183547126.0, | |
| "step": 67850 | |
| }, | |
| { | |
| "entropy": 1.2121875, | |
| "epoch": 1.6625042848048577, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 4.5518800897812174e-07, | |
| "loss": 0.1521, | |
| "mean_token_accuracy": 0.9661059749126434, | |
| "num_tokens": 7188532212.0, | |
| "step": 67900 | |
| }, | |
| { | |
| "entropy": 1.2209375, | |
| "epoch": 1.6637285147642134, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 4.519773246945349e-07, | |
| "loss": 0.1576, | |
| "mean_token_accuracy": 0.9657674777507782, | |
| "num_tokens": 7193693940.0, | |
| "step": 67950 | |
| }, | |
| { | |
| "entropy": 1.23375, | |
| "epoch": 1.664952744723569, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 4.487770811356612e-07, | |
| "loss": 0.1664, | |
| "mean_token_accuracy": 0.9635096192359924, | |
| "num_tokens": 7199191726.0, | |
| "step": 68000 | |
| }, | |
| { | |
| "entropy": 1.21625, | |
| "epoch": 1.6661769746829245, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 4.455872914147521e-07, | |
| "loss": 0.1614, | |
| "mean_token_accuracy": 0.965412712097168, | |
| "num_tokens": 7204740271.0, | |
| "step": 68050 | |
| }, | |
| { | |
| "entropy": 1.2178125, | |
| "epoch": 1.66740120464228, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 4.424079686022223e-07, | |
| "loss": 0.1647, | |
| "mean_token_accuracy": 0.9641766202449799, | |
| "num_tokens": 7210407120.0, | |
| "step": 68100 | |
| }, | |
| { | |
| "entropy": 1.22875, | |
| "epoch": 1.6686254346016356, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 4.39239125725601e-07, | |
| "loss": 0.162, | |
| "mean_token_accuracy": 0.9659585297107697, | |
| "num_tokens": 7215783474.0, | |
| "step": 68150 | |
| }, | |
| { | |
| "entropy": 1.226875, | |
| "epoch": 1.6698496645609913, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 4.360807757694718e-07, | |
| "loss": 0.1626, | |
| "mean_token_accuracy": 0.9646227335929871, | |
| "num_tokens": 7220993281.0, | |
| "step": 68200 | |
| }, | |
| { | |
| "entropy": 1.19703125, | |
| "epoch": 1.6710738945203467, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 4.329329316754236e-07, | |
| "loss": 0.1441, | |
| "mean_token_accuracy": 0.9685395467281341, | |
| "num_tokens": 7225810836.0, | |
| "step": 68250 | |
| }, | |
| { | |
| "entropy": 1.21875, | |
| "epoch": 1.6722981244797022, | |
| "grad_norm": 2.984375, | |
| "learning_rate": 4.2979560634199754e-07, | |
| "loss": 0.1688, | |
| "mean_token_accuracy": 0.9636458623409271, | |
| "num_tokens": 7231649459.0, | |
| "step": 68300 | |
| }, | |
| { | |
| "entropy": 1.19296875, | |
| "epoch": 1.6735223544390578, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 4.266688126246311e-07, | |
| "loss": 0.1424, | |
| "mean_token_accuracy": 0.9688647317886353, | |
| "num_tokens": 7236848069.0, | |
| "step": 68350 | |
| }, | |
| { | |
| "entropy": 1.2278125, | |
| "epoch": 1.6747465843984135, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 4.235525633356111e-07, | |
| "loss": 0.1676, | |
| "mean_token_accuracy": 0.963608900308609, | |
| "num_tokens": 7242384952.0, | |
| "step": 68400 | |
| }, | |
| { | |
| "entropy": 1.238125, | |
| "epoch": 1.675970814357769, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 4.204468712440144e-07, | |
| "loss": 0.1653, | |
| "mean_token_accuracy": 0.9638743424415588, | |
| "num_tokens": 7247699380.0, | |
| "step": 68450 | |
| }, | |
| { | |
| "entropy": 1.21671875, | |
| "epoch": 1.6771950443171244, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 4.1735174907566234e-07, | |
| "loss": 0.1507, | |
| "mean_token_accuracy": 0.9674655389785767, | |
| "num_tokens": 7252973599.0, | |
| "step": 68500 | |
| }, | |
| { | |
| "entropy": 1.2109375, | |
| "epoch": 1.67841927427648, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 4.142672095130603e-07, | |
| "loss": 0.1488, | |
| "mean_token_accuracy": 0.9676065123081208, | |
| "num_tokens": 7257981736.0, | |
| "step": 68550 | |
| }, | |
| { | |
| "entropy": 1.2084375, | |
| "epoch": 1.6796435042358357, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 4.111932651953554e-07, | |
| "loss": 0.1537, | |
| "mean_token_accuracy": 0.9668715631961823, | |
| "num_tokens": 7263067623.0, | |
| "step": 68600 | |
| }, | |
| { | |
| "entropy": 1.2253125, | |
| "epoch": 1.6808677341951912, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 4.0812992871827737e-07, | |
| "loss": 0.1514, | |
| "mean_token_accuracy": 0.967187968492508, | |
| "num_tokens": 7268515412.0, | |
| "step": 68650 | |
| }, | |
| { | |
| "entropy": 1.2240625, | |
| "epoch": 1.6820919641545466, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 4.0507721263409016e-07, | |
| "loss": 0.155, | |
| "mean_token_accuracy": 0.9657605230808258, | |
| "num_tokens": 7273767424.0, | |
| "step": 68700 | |
| }, | |
| { | |
| "entropy": 1.21890625, | |
| "epoch": 1.6833161941139023, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 4.0203512945153874e-07, | |
| "loss": 0.1501, | |
| "mean_token_accuracy": 0.9671496486663819, | |
| "num_tokens": 7279187672.0, | |
| "step": 68750 | |
| }, | |
| { | |
| "entropy": 1.20953125, | |
| "epoch": 1.684540424073258, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 3.990036916358014e-07, | |
| "loss": 0.1466, | |
| "mean_token_accuracy": 0.9685079550743103, | |
| "num_tokens": 7284104561.0, | |
| "step": 68800 | |
| }, | |
| { | |
| "entropy": 1.21328125, | |
| "epoch": 1.6857646540326134, | |
| "grad_norm": 4.0625, | |
| "learning_rate": 3.9598291160843393e-07, | |
| "loss": 0.1557, | |
| "mean_token_accuracy": 0.9655941009521485, | |
| "num_tokens": 7289492586.0, | |
| "step": 68850 | |
| }, | |
| { | |
| "entropy": 1.18875, | |
| "epoch": 1.686988883991969, | |
| "grad_norm": 1.875, | |
| "learning_rate": 3.929728017473213e-07, | |
| "loss": 0.14, | |
| "mean_token_accuracy": 0.969061805009842, | |
| "num_tokens": 7294671673.0, | |
| "step": 68900 | |
| }, | |
| { | |
| "entropy": 1.21671875, | |
| "epoch": 1.6882131139513246, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 3.8997337438662893e-07, | |
| "loss": 0.1628, | |
| "mean_token_accuracy": 0.9643185365200043, | |
| "num_tokens": 7300014488.0, | |
| "step": 68950 | |
| }, | |
| { | |
| "entropy": 1.22359375, | |
| "epoch": 1.6894373439106802, | |
| "grad_norm": 0.01251220703125, | |
| "learning_rate": 3.869846418167452e-07, | |
| "loss": 0.1521, | |
| "mean_token_accuracy": 0.9664946186542511, | |
| "num_tokens": 7305132050.0, | |
| "step": 69000 | |
| }, | |
| { | |
| "entropy": 1.21640625, | |
| "epoch": 1.690661573870036, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 3.840066162842405e-07, | |
| "loss": 0.1518, | |
| "mean_token_accuracy": 0.9676698422431946, | |
| "num_tokens": 7310341663.0, | |
| "step": 69050 | |
| }, | |
| { | |
| "entropy": 1.22984375, | |
| "epoch": 1.6918858038293914, | |
| "grad_norm": 3.625, | |
| "learning_rate": 3.8103930999180936e-07, | |
| "loss": 0.1685, | |
| "mean_token_accuracy": 0.963647495508194, | |
| "num_tokens": 7315713992.0, | |
| "step": 69100 | |
| }, | |
| { | |
| "entropy": 1.2271875, | |
| "epoch": 1.6931100337887468, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 3.780827350982258e-07, | |
| "loss": 0.1558, | |
| "mean_token_accuracy": 0.9662664186954498, | |
| "num_tokens": 7321152260.0, | |
| "step": 69150 | |
| }, | |
| { | |
| "entropy": 1.21296875, | |
| "epoch": 1.6943342637481025, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 3.751369037182869e-07, | |
| "loss": 0.1532, | |
| "mean_token_accuracy": 0.9662709140777588, | |
| "num_tokens": 7326190569.0, | |
| "step": 69200 | |
| }, | |
| { | |
| "entropy": 1.198125, | |
| "epoch": 1.6955584937074581, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 3.722018279227728e-07, | |
| "loss": 0.1412, | |
| "mean_token_accuracy": 0.9689172983169556, | |
| "num_tokens": 7331368151.0, | |
| "step": 69250 | |
| }, | |
| { | |
| "entropy": 1.21125, | |
| "epoch": 1.6967827236668136, | |
| "grad_norm": 3.25, | |
| "learning_rate": 3.6927751973838777e-07, | |
| "loss": 0.1578, | |
| "mean_token_accuracy": 0.9661315476894379, | |
| "num_tokens": 7336566118.0, | |
| "step": 69300 | |
| }, | |
| { | |
| "entropy": 1.2215625, | |
| "epoch": 1.698006953626169, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 3.66363991147716e-07, | |
| "loss": 0.1577, | |
| "mean_token_accuracy": 0.9653751969337463, | |
| "num_tokens": 7341728443.0, | |
| "step": 69350 | |
| }, | |
| { | |
| "entropy": 1.20796875, | |
| "epoch": 1.6992311835855247, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 3.6346125408917155e-07, | |
| "loss": 0.1497, | |
| "mean_token_accuracy": 0.9668842852115631, | |
| "num_tokens": 7346956092.0, | |
| "step": 69400 | |
| }, | |
| { | |
| "entropy": 1.216875, | |
| "epoch": 1.7004554135448804, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 3.605693204569506e-07, | |
| "loss": 0.1547, | |
| "mean_token_accuracy": 0.967246618270874, | |
| "num_tokens": 7352423947.0, | |
| "step": 69450 | |
| }, | |
| { | |
| "entropy": 1.2075, | |
| "epoch": 1.7016796435042358, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 3.576882021009792e-07, | |
| "loss": 0.1489, | |
| "mean_token_accuracy": 0.9667674267292022, | |
| "num_tokens": 7357669096.0, | |
| "step": 69500 | |
| }, | |
| { | |
| "entropy": 1.19796875, | |
| "epoch": 1.7029038734635913, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 3.5481791082686757e-07, | |
| "loss": 0.1421, | |
| "mean_token_accuracy": 0.9695830595493317, | |
| "num_tokens": 7362784518.0, | |
| "step": 69550 | |
| }, | |
| { | |
| "entropy": 1.2278125, | |
| "epoch": 1.704128103422947, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 3.519584583958636e-07, | |
| "loss": 0.162, | |
| "mean_token_accuracy": 0.9651164734363555, | |
| "num_tokens": 7368275670.0, | |
| "step": 69600 | |
| }, | |
| { | |
| "entropy": 1.21578125, | |
| "epoch": 1.7053523333823026, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 3.4910985652479757e-07, | |
| "loss": 0.1506, | |
| "mean_token_accuracy": 0.9667972207069397, | |
| "num_tokens": 7373607544.0, | |
| "step": 69650 | |
| }, | |
| { | |
| "entropy": 1.20625, | |
| "epoch": 1.706576563341658, | |
| "grad_norm": 4.71875, | |
| "learning_rate": 3.462721168860428e-07, | |
| "loss": 0.1492, | |
| "mean_token_accuracy": 0.9675750434398651, | |
| "num_tokens": 7378823181.0, | |
| "step": 69700 | |
| }, | |
| { | |
| "entropy": 1.2265625, | |
| "epoch": 1.7078007933010135, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 3.4344525110746127e-07, | |
| "loss": 0.1603, | |
| "mean_token_accuracy": 0.965987560749054, | |
| "num_tokens": 7384384951.0, | |
| "step": 69750 | |
| }, | |
| { | |
| "entropy": 1.21953125, | |
| "epoch": 1.7090250232603692, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 3.4062927077236106e-07, | |
| "loss": 0.1574, | |
| "mean_token_accuracy": 0.9660314428806305, | |
| "num_tokens": 7389942384.0, | |
| "step": 69800 | |
| }, | |
| { | |
| "entropy": 1.21640625, | |
| "epoch": 1.7102492532197249, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 3.3782418741944244e-07, | |
| "loss": 0.1629, | |
| "mean_token_accuracy": 0.9638810443878174, | |
| "num_tokens": 7395323756.0, | |
| "step": 69850 | |
| }, | |
| { | |
| "entropy": 1.20765625, | |
| "epoch": 1.7114734831790803, | |
| "grad_norm": 2.625, | |
| "learning_rate": 3.350300125427578e-07, | |
| "loss": 0.1384, | |
| "mean_token_accuracy": 0.9689883410930633, | |
| "num_tokens": 7400575411.0, | |
| "step": 69900 | |
| }, | |
| { | |
| "entropy": 1.20546875, | |
| "epoch": 1.7126977131384358, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 3.3224675759166026e-07, | |
| "loss": 0.1515, | |
| "mean_token_accuracy": 0.9666663575172424, | |
| "num_tokens": 7405984120.0, | |
| "step": 69950 | |
| }, | |
| { | |
| "entropy": 1.2203125, | |
| "epoch": 1.7139219430977914, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 3.294744339707564e-07, | |
| "loss": 0.1566, | |
| "mean_token_accuracy": 0.9662071549892426, | |
| "num_tokens": 7411306216.0, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 1.7139219430977914, | |
| "eval_entropy": 1.2108072916666666, | |
| "eval_loss": 0.17756883800029755, | |
| "eval_mean_token_accuracy": 0.9620932574073474, | |
| "eval_num_tokens": 7411306216.0, | |
| "eval_runtime": 601.9385, | |
| "eval_samples_per_second": 16.042, | |
| "eval_steps_per_second": 0.201, | |
| "step": 70000 | |
| }, | |
| { | |
| "entropy": 1.21734375, | |
| "epoch": 1.7151461730571471, | |
| "grad_norm": 0.0033111572265625, | |
| "learning_rate": 3.2671305303986264e-07, | |
| "loss": 0.1546, | |
| "mean_token_accuracy": 0.9665888488292694, | |
| "num_tokens": 7416539172.0, | |
| "step": 70050 | |
| }, | |
| { | |
| "entropy": 1.21734375, | |
| "epoch": 1.7163704030165026, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 3.23962626113956e-07, | |
| "loss": 0.151, | |
| "mean_token_accuracy": 0.9668701207637787, | |
| "num_tokens": 7421707836.0, | |
| "step": 70100 | |
| }, | |
| { | |
| "entropy": 1.20390625, | |
| "epoch": 1.7175946329758582, | |
| "grad_norm": 2.875, | |
| "learning_rate": 3.212231644631286e-07, | |
| "loss": 0.1522, | |
| "mean_token_accuracy": 0.967432736158371, | |
| "num_tokens": 7427044054.0, | |
| "step": 70150 | |
| }, | |
| { | |
| "entropy": 1.1990625, | |
| "epoch": 1.7188188629352137, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 3.184946793125406e-07, | |
| "loss": 0.1454, | |
| "mean_token_accuracy": 0.9683572733402253, | |
| "num_tokens": 7432165156.0, | |
| "step": 70200 | |
| }, | |
| { | |
| "entropy": 1.22375, | |
| "epoch": 1.7200430928945694, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 3.157771818423778e-07, | |
| "loss": 0.1574, | |
| "mean_token_accuracy": 0.9646234130859375, | |
| "num_tokens": 7437729163.0, | |
| "step": 70250 | |
| }, | |
| { | |
| "entropy": 1.2253125, | |
| "epoch": 1.721267322853925, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 3.130706831877993e-07, | |
| "loss": 0.1583, | |
| "mean_token_accuracy": 0.965836591720581, | |
| "num_tokens": 7443255376.0, | |
| "step": 70300 | |
| }, | |
| { | |
| "entropy": 1.21734375, | |
| "epoch": 1.7224915528132805, | |
| "grad_norm": 3.8125, | |
| "learning_rate": 3.1037519443889927e-07, | |
| "loss": 0.1502, | |
| "mean_token_accuracy": 0.967227201461792, | |
| "num_tokens": 7448723374.0, | |
| "step": 70350 | |
| }, | |
| { | |
| "entropy": 1.1978125, | |
| "epoch": 1.723715782772636, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 3.07690726640655e-07, | |
| "loss": 0.1386, | |
| "mean_token_accuracy": 0.9692979896068573, | |
| "num_tokens": 7453945048.0, | |
| "step": 70400 | |
| }, | |
| { | |
| "entropy": 1.21671875, | |
| "epoch": 1.7249400127319916, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 3.050172907928872e-07, | |
| "loss": 0.1601, | |
| "mean_token_accuracy": 0.9648488080501556, | |
| "num_tokens": 7459709955.0, | |
| "step": 70450 | |
| }, | |
| { | |
| "entropy": 1.194375, | |
| "epoch": 1.7261642426913473, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 3.0235489785021073e-07, | |
| "loss": 0.1429, | |
| "mean_token_accuracy": 0.968617148399353, | |
| "num_tokens": 7464731391.0, | |
| "step": 70500 | |
| }, | |
| { | |
| "entropy": 1.21328125, | |
| "epoch": 1.7273884726507027, | |
| "grad_norm": 4.1875, | |
| "learning_rate": 2.997035587219911e-07, | |
| "loss": 0.1509, | |
| "mean_token_accuracy": 0.9667483043670654, | |
| "num_tokens": 7470148354.0, | |
| "step": 70550 | |
| }, | |
| { | |
| "entropy": 1.21015625, | |
| "epoch": 1.7286127026100582, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 2.970632842723001e-07, | |
| "loss": 0.1537, | |
| "mean_token_accuracy": 0.9668030095100403, | |
| "num_tokens": 7475597114.0, | |
| "step": 70600 | |
| }, | |
| { | |
| "entropy": 1.21203125, | |
| "epoch": 1.7298369325694138, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 2.944340853198715e-07, | |
| "loss": 0.1489, | |
| "mean_token_accuracy": 0.9677174651622772, | |
| "num_tokens": 7480924480.0, | |
| "step": 70650 | |
| }, | |
| { | |
| "entropy": 1.1978125, | |
| "epoch": 1.7310611625287695, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 2.9181597263805703e-07, | |
| "loss": 0.1381, | |
| "mean_token_accuracy": 0.9692902910709381, | |
| "num_tokens": 7485944672.0, | |
| "step": 70700 | |
| }, | |
| { | |
| "entropy": 1.2234375, | |
| "epoch": 1.732285392488125, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 2.8920895695478036e-07, | |
| "loss": 0.1575, | |
| "mean_token_accuracy": 0.9657765531539917, | |
| "num_tokens": 7491484223.0, | |
| "step": 70750 | |
| }, | |
| { | |
| "entropy": 1.21984375, | |
| "epoch": 1.7335096224474804, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 2.866130489524946e-07, | |
| "loss": 0.1497, | |
| "mean_token_accuracy": 0.9674056577682495, | |
| "num_tokens": 7496915236.0, | |
| "step": 70800 | |
| }, | |
| { | |
| "entropy": 1.2109375, | |
| "epoch": 1.734733852406836, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 2.8402825926813793e-07, | |
| "loss": 0.1541, | |
| "mean_token_accuracy": 0.9666642725467682, | |
| "num_tokens": 7502068005.0, | |
| "step": 70850 | |
| }, | |
| { | |
| "entropy": 1.22796875, | |
| "epoch": 1.7359580823661918, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 2.814545984930923e-07, | |
| "loss": 0.1643, | |
| "mean_token_accuracy": 0.9640646266937256, | |
| "num_tokens": 7507947357.0, | |
| "step": 70900 | |
| }, | |
| { | |
| "entropy": 1.2171875, | |
| "epoch": 1.7371823123255472, | |
| "grad_norm": 3.78125, | |
| "learning_rate": 2.788920771731344e-07, | |
| "loss": 0.1515, | |
| "mean_token_accuracy": 0.96691251039505, | |
| "num_tokens": 7513464788.0, | |
| "step": 70950 | |
| }, | |
| { | |
| "entropy": 1.21421875, | |
| "epoch": 1.7384065422849027, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 2.763407058083999e-07, | |
| "loss": 0.1562, | |
| "mean_token_accuracy": 0.9653972661495209, | |
| "num_tokens": 7518965009.0, | |
| "step": 71000 | |
| }, | |
| { | |
| "entropy": 1.22109375, | |
| "epoch": 1.7396307722442583, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 2.738004948533338e-07, | |
| "loss": 0.1553, | |
| "mean_token_accuracy": 0.9661720776557923, | |
| "num_tokens": 7524509007.0, | |
| "step": 71050 | |
| }, | |
| { | |
| "entropy": 1.2178125, | |
| "epoch": 1.740855002203614, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 2.712714547166534e-07, | |
| "loss": 0.1494, | |
| "mean_token_accuracy": 0.9680777621269226, | |
| "num_tokens": 7529983645.0, | |
| "step": 71100 | |
| }, | |
| { | |
| "entropy": 1.22078125, | |
| "epoch": 1.7420792321629694, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 2.6875359576129975e-07, | |
| "loss": 0.1604, | |
| "mean_token_accuracy": 0.9644283270835876, | |
| "num_tokens": 7535464039.0, | |
| "step": 71150 | |
| }, | |
| { | |
| "entropy": 1.206875, | |
| "epoch": 1.743303462122325, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 2.662469283043991e-07, | |
| "loss": 0.1434, | |
| "mean_token_accuracy": 0.9683542418479919, | |
| "num_tokens": 7540523414.0, | |
| "step": 71200 | |
| }, | |
| { | |
| "entropy": 1.214375, | |
| "epoch": 1.7445276920816806, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 2.637514626172213e-07, | |
| "loss": 0.1549, | |
| "mean_token_accuracy": 0.9665893888473511, | |
| "num_tokens": 7545849728.0, | |
| "step": 71250 | |
| }, | |
| { | |
| "entropy": 1.2040625, | |
| "epoch": 1.7457519220410362, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 2.6126720892513277e-07, | |
| "loss": 0.1487, | |
| "mean_token_accuracy": 0.9680774366855621, | |
| "num_tokens": 7551159210.0, | |
| "step": 71300 | |
| }, | |
| { | |
| "entropy": 1.19421875, | |
| "epoch": 1.746976152000392, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 2.5879417740756093e-07, | |
| "loss": 0.1363, | |
| "mean_token_accuracy": 0.9701401054859161, | |
| "num_tokens": 7556078762.0, | |
| "step": 71350 | |
| }, | |
| { | |
| "entropy": 1.218125, | |
| "epoch": 1.7482003819597474, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 2.563323781979482e-07, | |
| "loss": 0.1656, | |
| "mean_token_accuracy": 0.9642888736724854, | |
| "num_tokens": 7561736323.0, | |
| "step": 71400 | |
| }, | |
| { | |
| "entropy": 1.21859375, | |
| "epoch": 1.7494246119191028, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 2.5388182138371173e-07, | |
| "loss": 0.1517, | |
| "mean_token_accuracy": 0.966708824634552, | |
| "num_tokens": 7567328811.0, | |
| "step": 71450 | |
| }, | |
| { | |
| "entropy": 1.22109375, | |
| "epoch": 1.7506488418784585, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 2.5144251700620135e-07, | |
| "loss": 0.1629, | |
| "mean_token_accuracy": 0.9650636351108551, | |
| "num_tokens": 7572752827.0, | |
| "step": 71500 | |
| }, | |
| { | |
| "entropy": 1.21, | |
| "epoch": 1.7518730718378142, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 2.4901447506066133e-07, | |
| "loss": 0.1599, | |
| "mean_token_accuracy": 0.9643032836914063, | |
| "num_tokens": 7578362509.0, | |
| "step": 71550 | |
| }, | |
| { | |
| "entropy": 1.2090625, | |
| "epoch": 1.7530973017971696, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 2.465977054961852e-07, | |
| "loss": 0.1493, | |
| "mean_token_accuracy": 0.9673759829998017, | |
| "num_tokens": 7583839931.0, | |
| "step": 71600 | |
| }, | |
| { | |
| "entropy": 1.21171875, | |
| "epoch": 1.754321531756525, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 2.441922182156775e-07, | |
| "loss": 0.1518, | |
| "mean_token_accuracy": 0.9662256014347076, | |
| "num_tokens": 7589236608.0, | |
| "step": 71650 | |
| }, | |
| { | |
| "entropy": 1.209375, | |
| "epoch": 1.7555457617158807, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 2.4179802307581234e-07, | |
| "loss": 0.1495, | |
| "mean_token_accuracy": 0.9674426424503326, | |
| "num_tokens": 7594652077.0, | |
| "step": 71700 | |
| }, | |
| { | |
| "entropy": 1.20265625, | |
| "epoch": 1.7567699916752364, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 2.394151298869952e-07, | |
| "loss": 0.1451, | |
| "mean_token_accuracy": 0.9673744821548462, | |
| "num_tokens": 7599701409.0, | |
| "step": 71750 | |
| }, | |
| { | |
| "entropy": 1.2153125, | |
| "epoch": 1.7579942216345918, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 2.3704354841331932e-07, | |
| "loss": 0.1505, | |
| "mean_token_accuracy": 0.9669674754142761, | |
| "num_tokens": 7605091932.0, | |
| "step": 71800 | |
| }, | |
| { | |
| "entropy": 1.2065625, | |
| "epoch": 1.7592184515939473, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 2.3468328837252628e-07, | |
| "loss": 0.1478, | |
| "mean_token_accuracy": 0.9676505529880524, | |
| "num_tokens": 7610186489.0, | |
| "step": 71850 | |
| }, | |
| { | |
| "entropy": 1.20890625, | |
| "epoch": 1.760442681553303, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 2.3233435943597114e-07, | |
| "loss": 0.1503, | |
| "mean_token_accuracy": 0.9671880280971528, | |
| "num_tokens": 7615665531.0, | |
| "step": 71900 | |
| }, | |
| { | |
| "entropy": 1.20375, | |
| "epoch": 1.7616669115126586, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 2.299967712285731e-07, | |
| "loss": 0.1423, | |
| "mean_token_accuracy": 0.9683215701580048, | |
| "num_tokens": 7620773654.0, | |
| "step": 71950 | |
| }, | |
| { | |
| "entropy": 1.19234375, | |
| "epoch": 1.762891141472014, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 2.276705333287875e-07, | |
| "loss": 0.1315, | |
| "mean_token_accuracy": 0.9702609395980835, | |
| "num_tokens": 7625470551.0, | |
| "step": 72000 | |
| }, | |
| { | |
| "entropy": 1.21046875, | |
| "epoch": 1.7641153714313695, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 2.253556552685573e-07, | |
| "loss": 0.1433, | |
| "mean_token_accuracy": 0.9681813132762909, | |
| "num_tokens": 7630517430.0, | |
| "step": 72050 | |
| }, | |
| { | |
| "entropy": 1.21, | |
| "epoch": 1.7653396013907252, | |
| "grad_norm": 3.125, | |
| "learning_rate": 2.2305214653327855e-07, | |
| "loss": 0.1406, | |
| "mean_token_accuracy": 0.9686529791355133, | |
| "num_tokens": 7635763079.0, | |
| "step": 72100 | |
| }, | |
| { | |
| "entropy": 1.201875, | |
| "epoch": 1.7665638313500809, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 2.207600165617607e-07, | |
| "loss": 0.1475, | |
| "mean_token_accuracy": 0.9678330075740814, | |
| "num_tokens": 7641423146.0, | |
| "step": 72150 | |
| }, | |
| { | |
| "entropy": 1.176875, | |
| "epoch": 1.7677880613094363, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 2.1847927474618846e-07, | |
| "loss": 0.1314, | |
| "mean_token_accuracy": 0.9702327287197113, | |
| "num_tokens": 7646275038.0, | |
| "step": 72200 | |
| }, | |
| { | |
| "entropy": 1.205, | |
| "epoch": 1.7690122912687918, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 2.1620993043208182e-07, | |
| "loss": 0.1371, | |
| "mean_token_accuracy": 0.9702345824241638, | |
| "num_tokens": 7651591457.0, | |
| "step": 72250 | |
| }, | |
| { | |
| "entropy": 1.2225, | |
| "epoch": 1.7702365212281475, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 2.139519929182585e-07, | |
| "loss": 0.1507, | |
| "mean_token_accuracy": 0.9666866302490235, | |
| "num_tokens": 7656975261.0, | |
| "step": 72300 | |
| }, | |
| { | |
| "entropy": 1.1996875, | |
| "epoch": 1.7714607511875031, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 2.1170547145679665e-07, | |
| "loss": 0.1492, | |
| "mean_token_accuracy": 0.966531822681427, | |
| "num_tokens": 7662430438.0, | |
| "step": 72350 | |
| }, | |
| { | |
| "entropy": 1.21703125, | |
| "epoch": 1.7726849811468586, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 2.0947037525299606e-07, | |
| "loss": 0.1501, | |
| "mean_token_accuracy": 0.9673058640956879, | |
| "num_tokens": 7667987024.0, | |
| "step": 72400 | |
| }, | |
| { | |
| "entropy": 1.20890625, | |
| "epoch": 1.7739092111062142, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 2.0724671346533975e-07, | |
| "loss": 0.1483, | |
| "mean_token_accuracy": 0.9672919237613677, | |
| "num_tokens": 7673092874.0, | |
| "step": 72450 | |
| }, | |
| { | |
| "entropy": 1.21171875, | |
| "epoch": 1.7751334410655697, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 2.0503449520545814e-07, | |
| "loss": 0.1454, | |
| "mean_token_accuracy": 0.9677470910549164, | |
| "num_tokens": 7678350890.0, | |
| "step": 72500 | |
| }, | |
| { | |
| "entropy": 1.21125, | |
| "epoch": 1.7763576710249254, | |
| "grad_norm": 4.03125, | |
| "learning_rate": 2.0283372953809187e-07, | |
| "loss": 0.1506, | |
| "mean_token_accuracy": 0.9673129177093506, | |
| "num_tokens": 7683768054.0, | |
| "step": 72550 | |
| }, | |
| { | |
| "entropy": 1.19046875, | |
| "epoch": 1.777581900984281, | |
| "grad_norm": 0.010009765625, | |
| "learning_rate": 2.0064442548105078e-07, | |
| "loss": 0.1311, | |
| "mean_token_accuracy": 0.9706909394264222, | |
| "num_tokens": 7688732517.0, | |
| "step": 72600 | |
| }, | |
| { | |
| "entropy": 1.20234375, | |
| "epoch": 1.7788061309436365, | |
| "grad_norm": 2.625, | |
| "learning_rate": 1.9846659200518323e-07, | |
| "loss": 0.1443, | |
| "mean_token_accuracy": 0.9685131824016571, | |
| "num_tokens": 7693833105.0, | |
| "step": 72650 | |
| }, | |
| { | |
| "entropy": 1.1996875, | |
| "epoch": 1.780030360902992, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 1.963002380343336e-07, | |
| "loss": 0.1372, | |
| "mean_token_accuracy": 0.9696123468875885, | |
| "num_tokens": 7698671416.0, | |
| "step": 72700 | |
| }, | |
| { | |
| "entropy": 1.2096875, | |
| "epoch": 1.7812545908623476, | |
| "grad_norm": 3.46875, | |
| "learning_rate": 1.9414537244530883e-07, | |
| "loss": 0.1447, | |
| "mean_token_accuracy": 0.9681323492527008, | |
| "num_tokens": 7704099695.0, | |
| "step": 72750 | |
| }, | |
| { | |
| "entropy": 1.209375, | |
| "epoch": 1.7824788208217033, | |
| "grad_norm": 3.8125, | |
| "learning_rate": 1.9200200406784084e-07, | |
| "loss": 0.1471, | |
| "mean_token_accuracy": 0.9671408832073212, | |
| "num_tokens": 7709413054.0, | |
| "step": 72800 | |
| }, | |
| { | |
| "entropy": 1.22046875, | |
| "epoch": 1.7837030507810587, | |
| "grad_norm": 2.375, | |
| "learning_rate": 1.8987014168455263e-07, | |
| "loss": 0.1513, | |
| "mean_token_accuracy": 0.9667081344127655, | |
| "num_tokens": 7714999778.0, | |
| "step": 72850 | |
| }, | |
| { | |
| "entropy": 1.21765625, | |
| "epoch": 1.7849272807404142, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 1.8774979403091852e-07, | |
| "loss": 0.1467, | |
| "mean_token_accuracy": 0.9685576283931732, | |
| "num_tokens": 7720722054.0, | |
| "step": 72900 | |
| }, | |
| { | |
| "entropy": 1.18796875, | |
| "epoch": 1.7861515106997699, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 1.8564096979523027e-07, | |
| "loss": 0.1448, | |
| "mean_token_accuracy": 0.9685378670692444, | |
| "num_tokens": 7726037284.0, | |
| "step": 72950 | |
| }, | |
| { | |
| "entropy": 1.21359375, | |
| "epoch": 1.7873757406591255, | |
| "grad_norm": 2.75, | |
| "learning_rate": 1.835436776185634e-07, | |
| "loss": 0.1305, | |
| "mean_token_accuracy": 0.9697797727584839, | |
| "num_tokens": 7731254143.0, | |
| "step": 73000 | |
| }, | |
| { | |
| "entropy": 1.189375, | |
| "epoch": 1.788599970618481, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 1.814579260947379e-07, | |
| "loss": 0.1367, | |
| "mean_token_accuracy": 0.969087952375412, | |
| "num_tokens": 7736558719.0, | |
| "step": 73050 | |
| }, | |
| { | |
| "entropy": 1.20109375, | |
| "epoch": 1.7898242005778364, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 1.7938372377028622e-07, | |
| "loss": 0.1265, | |
| "mean_token_accuracy": 0.9715298664569855, | |
| "num_tokens": 7741441296.0, | |
| "step": 73100 | |
| }, | |
| { | |
| "entropy": 1.1953125, | |
| "epoch": 1.791048430537192, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.773210791444161e-07, | |
| "loss": 0.131, | |
| "mean_token_accuracy": 0.9706771004199982, | |
| "num_tokens": 7746461885.0, | |
| "step": 73150 | |
| }, | |
| { | |
| "entropy": 1.2090625, | |
| "epoch": 1.7922726604965478, | |
| "grad_norm": 3.375, | |
| "learning_rate": 1.7527000066897837e-07, | |
| "loss": 0.1469, | |
| "mean_token_accuracy": 0.9673126399517059, | |
| "num_tokens": 7752002392.0, | |
| "step": 73200 | |
| }, | |
| { | |
| "entropy": 1.1975, | |
| "epoch": 1.7934968904559032, | |
| "grad_norm": 1.5, | |
| "learning_rate": 1.7323049674842783e-07, | |
| "loss": 0.1437, | |
| "mean_token_accuracy": 0.9683597016334534, | |
| "num_tokens": 7756991548.0, | |
| "step": 73250 | |
| }, | |
| { | |
| "entropy": 1.2171875, | |
| "epoch": 1.7947211204152587, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.7120257573979492e-07, | |
| "loss": 0.1454, | |
| "mean_token_accuracy": 0.968316274881363, | |
| "num_tokens": 7762203324.0, | |
| "step": 73300 | |
| }, | |
| { | |
| "entropy": 1.1959375, | |
| "epoch": 1.7959453503746143, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.6918624595264597e-07, | |
| "loss": 0.1366, | |
| "mean_token_accuracy": 0.9702933692932129, | |
| "num_tokens": 7767460924.0, | |
| "step": 73350 | |
| }, | |
| { | |
| "entropy": 1.199375, | |
| "epoch": 1.79716958033397, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 1.671815156490517e-07, | |
| "loss": 0.143, | |
| "mean_token_accuracy": 0.9685783159732818, | |
| "num_tokens": 7772824486.0, | |
| "step": 73400 | |
| }, | |
| { | |
| "entropy": 1.21921875, | |
| "epoch": 1.7983938102933255, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 1.651883930435535e-07, | |
| "loss": 0.1362, | |
| "mean_token_accuracy": 0.9696711504459381, | |
| "num_tokens": 7778088634.0, | |
| "step": 73450 | |
| }, | |
| { | |
| "entropy": 1.2078125, | |
| "epoch": 1.799618040252681, | |
| "grad_norm": 0.004302978515625, | |
| "learning_rate": 1.6320688630312908e-07, | |
| "loss": 0.1363, | |
| "mean_token_accuracy": 0.9695776212215423, | |
| "num_tokens": 7783380087.0, | |
| "step": 73500 | |
| }, | |
| { | |
| "entropy": 1.22859375, | |
| "epoch": 1.8008422702120366, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 1.6123700354716032e-07, | |
| "loss": 0.1559, | |
| "mean_token_accuracy": 0.9663217055797577, | |
| "num_tokens": 7789343726.0, | |
| "step": 73550 | |
| }, | |
| { | |
| "entropy": 1.21328125, | |
| "epoch": 1.8020665001713922, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.5927875284739546e-07, | |
| "loss": 0.1356, | |
| "mean_token_accuracy": 0.9702400255203247, | |
| "num_tokens": 7794792440.0, | |
| "step": 73600 | |
| }, | |
| { | |
| "entropy": 1.21484375, | |
| "epoch": 1.803290730130748, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.5733214222792392e-07, | |
| "loss": 0.1418, | |
| "mean_token_accuracy": 0.9687067580223083, | |
| "num_tokens": 7800254887.0, | |
| "step": 73650 | |
| }, | |
| { | |
| "entropy": 1.21421875, | |
| "epoch": 1.8045149600901034, | |
| "grad_norm": 3.625, | |
| "learning_rate": 1.5539717966513623e-07, | |
| "loss": 0.1361, | |
| "mean_token_accuracy": 0.969369399547577, | |
| "num_tokens": 7805607043.0, | |
| "step": 73700 | |
| }, | |
| { | |
| "entropy": 1.20984375, | |
| "epoch": 1.8057391900494588, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 1.5347387308769478e-07, | |
| "loss": 0.1326, | |
| "mean_token_accuracy": 0.9703532266616821, | |
| "num_tokens": 7810964969.0, | |
| "step": 73750 | |
| }, | |
| { | |
| "entropy": 1.20515625, | |
| "epoch": 1.8069634200088145, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.5156223037649985e-07, | |
| "loss": 0.1506, | |
| "mean_token_accuracy": 0.9663440334796906, | |
| "num_tokens": 7816484836.0, | |
| "step": 73800 | |
| }, | |
| { | |
| "entropy": 1.1890625, | |
| "epoch": 1.8081876499681702, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 1.4966225936465993e-07, | |
| "loss": 0.1304, | |
| "mean_token_accuracy": 0.9708381593227386, | |
| "num_tokens": 7821459721.0, | |
| "step": 73850 | |
| }, | |
| { | |
| "entropy": 1.19953125, | |
| "epoch": 1.8094118799275256, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.4777396783745612e-07, | |
| "loss": 0.128, | |
| "mean_token_accuracy": 0.9713588643074036, | |
| "num_tokens": 7826287539.0, | |
| "step": 73900 | |
| }, | |
| { | |
| "entropy": 1.1978125, | |
| "epoch": 1.810636109886881, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.4589736353231308e-07, | |
| "loss": 0.1202, | |
| "mean_token_accuracy": 0.9729771482944488, | |
| "num_tokens": 7831387963.0, | |
| "step": 73950 | |
| }, | |
| { | |
| "entropy": 1.195, | |
| "epoch": 1.8118603398462367, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.4403245413876486e-07, | |
| "loss": 0.1344, | |
| "mean_token_accuracy": 0.9699731683731079, | |
| "num_tokens": 7836315700.0, | |
| "step": 74000 | |
| }, | |
| { | |
| "entropy": 1.18796875, | |
| "epoch": 1.8130845698055924, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.4217924729842513e-07, | |
| "loss": 0.1381, | |
| "mean_token_accuracy": 0.9699892640113831, | |
| "num_tokens": 7841453471.0, | |
| "step": 74050 | |
| }, | |
| { | |
| "entropy": 1.2075, | |
| "epoch": 1.8143087997649479, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 1.403377506049569e-07, | |
| "loss": 0.1451, | |
| "mean_token_accuracy": 0.9681575572490693, | |
| "num_tokens": 7846798475.0, | |
| "step": 74100 | |
| }, | |
| { | |
| "entropy": 1.1890625, | |
| "epoch": 1.8155330297243033, | |
| "grad_norm": 3.328125, | |
| "learning_rate": 1.385079716040376e-07, | |
| "loss": 0.1253, | |
| "mean_token_accuracy": 0.9720281398296357, | |
| "num_tokens": 7851768429.0, | |
| "step": 74150 | |
| }, | |
| { | |
| "entropy": 1.19671875, | |
| "epoch": 1.816757259683659, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.3668991779333308e-07, | |
| "loss": 0.1218, | |
| "mean_token_accuracy": 0.9725555181503296, | |
| "num_tokens": 7856881793.0, | |
| "step": 74200 | |
| }, | |
| { | |
| "entropy": 1.19890625, | |
| "epoch": 1.8179814896430146, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.3488359662246087e-07, | |
| "loss": 0.1272, | |
| "mean_token_accuracy": 0.9715735244750977, | |
| "num_tokens": 7861890257.0, | |
| "step": 74250 | |
| }, | |
| { | |
| "entropy": 1.20390625, | |
| "epoch": 1.81920571960237, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 1.3308901549296604e-07, | |
| "loss": 0.1275, | |
| "mean_token_accuracy": 0.9717478513717651, | |
| "num_tokens": 7867074576.0, | |
| "step": 74300 | |
| }, | |
| { | |
| "entropy": 1.20203125, | |
| "epoch": 1.8204299495617255, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.3130618175828713e-07, | |
| "loss": 0.1367, | |
| "mean_token_accuracy": 0.9701256167888641, | |
| "num_tokens": 7872381109.0, | |
| "step": 74350 | |
| }, | |
| { | |
| "entropy": 1.20828125, | |
| "epoch": 1.8216541795210812, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 1.2953510272372647e-07, | |
| "loss": 0.1287, | |
| "mean_token_accuracy": 0.9719671607017517, | |
| "num_tokens": 7877881928.0, | |
| "step": 74400 | |
| }, | |
| { | |
| "entropy": 1.199375, | |
| "epoch": 1.822878409480437, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 1.2777578564641969e-07, | |
| "loss": 0.1309, | |
| "mean_token_accuracy": 0.9707298684120178, | |
| "num_tokens": 7882820168.0, | |
| "step": 74450 | |
| }, | |
| { | |
| "entropy": 1.21734375, | |
| "epoch": 1.8241026394397923, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 1.2602823773530915e-07, | |
| "loss": 0.1426, | |
| "mean_token_accuracy": 0.9688560748100281, | |
| "num_tokens": 7888372934.0, | |
| "step": 74500 | |
| }, | |
| { | |
| "entropy": 1.2046875, | |
| "epoch": 1.8253268693991478, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 1.2429246615111024e-07, | |
| "loss": 0.1331, | |
| "mean_token_accuracy": 0.970300270318985, | |
| "num_tokens": 7893801088.0, | |
| "step": 74550 | |
| }, | |
| { | |
| "entropy": 1.21171875, | |
| "epoch": 1.8265510993585035, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.2256847800628425e-07, | |
| "loss": 0.1223, | |
| "mean_token_accuracy": 0.973189731836319, | |
| "num_tokens": 7898852778.0, | |
| "step": 74600 | |
| }, | |
| { | |
| "entropy": 1.20671875, | |
| "epoch": 1.8277753293178591, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.2085628036501007e-07, | |
| "loss": 0.123, | |
| "mean_token_accuracy": 0.9726410353183746, | |
| "num_tokens": 7903818883.0, | |
| "step": 74650 | |
| }, | |
| { | |
| "entropy": 1.19265625, | |
| "epoch": 1.8289995592772146, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 1.1915588024315194e-07, | |
| "loss": 0.1278, | |
| "mean_token_accuracy": 0.9702788054943084, | |
| "num_tokens": 7908897679.0, | |
| "step": 74700 | |
| }, | |
| { | |
| "entropy": 1.20984375, | |
| "epoch": 1.83022378923657, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 1.1746728460823508e-07, | |
| "loss": 0.1303, | |
| "mean_token_accuracy": 0.9711257565021515, | |
| "num_tokens": 7914006448.0, | |
| "step": 74750 | |
| }, | |
| { | |
| "entropy": 1.2140625, | |
| "epoch": 1.8314480191959257, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.1579050037941275e-07, | |
| "loss": 0.1362, | |
| "mean_token_accuracy": 0.969500253200531, | |
| "num_tokens": 7919510157.0, | |
| "step": 74800 | |
| }, | |
| { | |
| "entropy": 1.21421875, | |
| "epoch": 1.8326722491552814, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.1412553442744255e-07, | |
| "loss": 0.132, | |
| "mean_token_accuracy": 0.970678209066391, | |
| "num_tokens": 7924726404.0, | |
| "step": 74850 | |
| }, | |
| { | |
| "entropy": 1.1996875, | |
| "epoch": 1.833896479114637, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 1.1247239357465255e-07, | |
| "loss": 0.13, | |
| "mean_token_accuracy": 0.9713816094398499, | |
| "num_tokens": 7929934384.0, | |
| "step": 74900 | |
| }, | |
| { | |
| "entropy": 1.18921875, | |
| "epoch": 1.8351207090739925, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.1083108459491986e-07, | |
| "loss": 0.1256, | |
| "mean_token_accuracy": 0.9721748220920563, | |
| "num_tokens": 7935196457.0, | |
| "step": 74950 | |
| }, | |
| { | |
| "entropy": 1.2003125, | |
| "epoch": 1.836344939033348, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 1.0920161421363773e-07, | |
| "loss": 0.119, | |
| "mean_token_accuracy": 0.9733594739437104, | |
| "num_tokens": 7940201367.0, | |
| "step": 75000 | |
| }, | |
| { | |
| "entropy": 1.22375, | |
| "epoch": 1.8375691689927036, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.0758398910768951e-07, | |
| "loss": 0.1373, | |
| "mean_token_accuracy": 0.9692693221569061, | |
| "num_tokens": 7945635438.0, | |
| "step": 75050 | |
| }, | |
| { | |
| "entropy": 1.20890625, | |
| "epoch": 1.8387933989520593, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 1.0597821590542211e-07, | |
| "loss": 0.1282, | |
| "mean_token_accuracy": 0.9722434699535369, | |
| "num_tokens": 7951091367.0, | |
| "step": 75100 | |
| }, | |
| { | |
| "entropy": 1.18828125, | |
| "epoch": 1.8400176289114147, | |
| "grad_norm": 0.004425048828125, | |
| "learning_rate": 1.0438430118661924e-07, | |
| "loss": 0.124, | |
| "mean_token_accuracy": 0.9725795328617096, | |
| "num_tokens": 7956255217.0, | |
| "step": 75150 | |
| }, | |
| { | |
| "entropy": 1.1903125, | |
| "epoch": 1.8412418588707702, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.0280225148247213e-07, | |
| "loss": 0.1179, | |
| "mean_token_accuracy": 0.9743827605247497, | |
| "num_tokens": 7961236486.0, | |
| "step": 75200 | |
| }, | |
| { | |
| "entropy": 1.1996875, | |
| "epoch": 1.8424660888301259, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 1.0123207327555462e-07, | |
| "loss": 0.1156, | |
| "mean_token_accuracy": 0.9743783438205719, | |
| "num_tokens": 7966324215.0, | |
| "step": 75250 | |
| }, | |
| { | |
| "entropy": 1.2090625, | |
| "epoch": 1.8436903187894815, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 9.967377299979708e-08, | |
| "loss": 0.134, | |
| "mean_token_accuracy": 0.9705902481079102, | |
| "num_tokens": 7971817863.0, | |
| "step": 75300 | |
| }, | |
| { | |
| "entropy": 1.19578125, | |
| "epoch": 1.844914548748837, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 9.812735704045684e-08, | |
| "loss": 0.1185, | |
| "mean_token_accuracy": 0.9737985277175903, | |
| "num_tokens": 7977008142.0, | |
| "step": 75350 | |
| }, | |
| { | |
| "entropy": 1.190625, | |
| "epoch": 1.8461387787081924, | |
| "grad_norm": 1.75, | |
| "learning_rate": 9.65928317340975e-08, | |
| "loss": 0.1201, | |
| "mean_token_accuracy": 0.9731456315517426, | |
| "num_tokens": 7982011592.0, | |
| "step": 75400 | |
| }, | |
| { | |
| "entropy": 1.20875, | |
| "epoch": 1.847363008667548, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 9.507020336855632e-08, | |
| "loss": 0.1221, | |
| "mean_token_accuracy": 0.9724456059932709, | |
| "num_tokens": 7987367141.0, | |
| "step": 75450 | |
| }, | |
| { | |
| "entropy": 1.20234375, | |
| "epoch": 1.8485872386269038, | |
| "grad_norm": 1.625, | |
| "learning_rate": 9.355947818292554e-08, | |
| "loss": 0.1149, | |
| "mean_token_accuracy": 0.9738513994216919, | |
| "num_tokens": 7992500198.0, | |
| "step": 75500 | |
| }, | |
| { | |
| "entropy": 1.21625, | |
| "epoch": 1.8498114685862592, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 9.206066236751943e-08, | |
| "loss": 0.1328, | |
| "mean_token_accuracy": 0.9707795882225037, | |
| "num_tokens": 7998217427.0, | |
| "step": 75550 | |
| }, | |
| { | |
| "entropy": 1.1975, | |
| "epoch": 1.8510356985456147, | |
| "grad_norm": 2.125, | |
| "learning_rate": 9.057376206385559e-08, | |
| "loss": 0.1175, | |
| "mean_token_accuracy": 0.9741839158535004, | |
| "num_tokens": 8003308568.0, | |
| "step": 75600 | |
| }, | |
| { | |
| "entropy": 1.1878125, | |
| "epoch": 1.8522599285049703, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 8.90987833646254e-08, | |
| "loss": 0.1077, | |
| "mean_token_accuracy": 0.9759363722801209, | |
| "num_tokens": 8008259087.0, | |
| "step": 75650 | |
| }, | |
| { | |
| "entropy": 1.20125, | |
| "epoch": 1.853484158464326, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 8.763573231367062e-08, | |
| "loss": 0.1256, | |
| "mean_token_accuracy": 0.9727174258232116, | |
| "num_tokens": 8013653351.0, | |
| "step": 75700 | |
| }, | |
| { | |
| "entropy": 1.20078125, | |
| "epoch": 1.8547083884236815, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 8.618461490595975e-08, | |
| "loss": 0.1214, | |
| "mean_token_accuracy": 0.9735188388824463, | |
| "num_tokens": 8018956628.0, | |
| "step": 75750 | |
| }, | |
| { | |
| "entropy": 1.209375, | |
| "epoch": 1.855932618383037, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 8.474543708756044e-08, | |
| "loss": 0.1225, | |
| "mean_token_accuracy": 0.9721533727645874, | |
| "num_tokens": 8024197226.0, | |
| "step": 75800 | |
| }, | |
| { | |
| "entropy": 1.19015625, | |
| "epoch": 1.8571568483423926, | |
| "grad_norm": 0.005462646484375, | |
| "learning_rate": 8.33182047556178e-08, | |
| "loss": 0.1076, | |
| "mean_token_accuracy": 0.9760002064704895, | |
| "num_tokens": 8029024717.0, | |
| "step": 75850 | |
| }, | |
| { | |
| "entropy": 1.1953125, | |
| "epoch": 1.8583810783017483, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 8.190292375832975e-08, | |
| "loss": 0.1274, | |
| "mean_token_accuracy": 0.971969587802887, | |
| "num_tokens": 8034254868.0, | |
| "step": 75900 | |
| }, | |
| { | |
| "entropy": 1.20546875, | |
| "epoch": 1.859605308261104, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 8.049959989492239e-08, | |
| "loss": 0.1248, | |
| "mean_token_accuracy": 0.9728272747993469, | |
| "num_tokens": 8039555218.0, | |
| "step": 75950 | |
| }, | |
| { | |
| "entropy": 1.21359375, | |
| "epoch": 1.8608295382204594, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 7.910823891562536e-08, | |
| "loss": 0.131, | |
| "mean_token_accuracy": 0.9710195803642273, | |
| "num_tokens": 8044915571.0, | |
| "step": 76000 | |
| }, | |
| { | |
| "entropy": 1.19625, | |
| "epoch": 1.8620537681798148, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 7.77288465216518e-08, | |
| "loss": 0.1189, | |
| "mean_token_accuracy": 0.9735661280155182, | |
| "num_tokens": 8050222763.0, | |
| "step": 76050 | |
| }, | |
| { | |
| "entropy": 1.1953125, | |
| "epoch": 1.8632779981391705, | |
| "grad_norm": 2.375, | |
| "learning_rate": 7.636142836517013e-08, | |
| "loss": 0.1211, | |
| "mean_token_accuracy": 0.9737051403522492, | |
| "num_tokens": 8055473678.0, | |
| "step": 76100 | |
| }, | |
| { | |
| "entropy": 1.196875, | |
| "epoch": 1.8645022280985262, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 7.500599004928565e-08, | |
| "loss": 0.1122, | |
| "mean_token_accuracy": 0.974678498506546, | |
| "num_tokens": 8060311800.0, | |
| "step": 76150 | |
| }, | |
| { | |
| "entropy": 1.18984375, | |
| "epoch": 1.8657264580578816, | |
| "grad_norm": 2.5, | |
| "learning_rate": 7.36625371280133e-08, | |
| "loss": 0.1164, | |
| "mean_token_accuracy": 0.9736955296993256, | |
| "num_tokens": 8065567322.0, | |
| "step": 76200 | |
| }, | |
| { | |
| "entropy": 1.211875, | |
| "epoch": 1.866950688017237, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 7.233107510625858e-08, | |
| "loss": 0.1262, | |
| "mean_token_accuracy": 0.9716404461860657, | |
| "num_tokens": 8070882224.0, | |
| "step": 76250 | |
| }, | |
| { | |
| "entropy": 1.20234375, | |
| "epoch": 1.8681749179765927, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 7.101160943979201e-08, | |
| "loss": 0.1242, | |
| "mean_token_accuracy": 0.9728803491592407, | |
| "num_tokens": 8075963376.0, | |
| "step": 76300 | |
| }, | |
| { | |
| "entropy": 1.20921875, | |
| "epoch": 1.8693991479359484, | |
| "grad_norm": 1.625, | |
| "learning_rate": 6.970414553522842e-08, | |
| "loss": 0.1223, | |
| "mean_token_accuracy": 0.9728834819793701, | |
| "num_tokens": 8081448166.0, | |
| "step": 76350 | |
| }, | |
| { | |
| "entropy": 1.1978125, | |
| "epoch": 1.8706233778953039, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 6.840868875000561e-08, | |
| "loss": 0.1146, | |
| "mean_token_accuracy": 0.9747687363624573, | |
| "num_tokens": 8086285902.0, | |
| "step": 76400 | |
| }, | |
| { | |
| "entropy": 1.200625, | |
| "epoch": 1.8718476078546593, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 6.712524439235978e-08, | |
| "loss": 0.1171, | |
| "mean_token_accuracy": 0.9743122577667236, | |
| "num_tokens": 8091436927.0, | |
| "step": 76450 | |
| }, | |
| { | |
| "entropy": 1.211875, | |
| "epoch": 1.873071837814015, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 6.585381772130584e-08, | |
| "loss": 0.1327, | |
| "mean_token_accuracy": 0.9712537932395935, | |
| "num_tokens": 8097048708.0, | |
| "step": 76500 | |
| }, | |
| { | |
| "entropy": 1.2128125, | |
| "epoch": 1.8742960677733707, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 6.459441394661536e-08, | |
| "loss": 0.1342, | |
| "mean_token_accuracy": 0.9702994549274444, | |
| "num_tokens": 8102302631.0, | |
| "step": 76550 | |
| }, | |
| { | |
| "entropy": 1.20875, | |
| "epoch": 1.875520297732726, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 6.334703822879506e-08, | |
| "loss": 0.1337, | |
| "mean_token_accuracy": 0.970585721731186, | |
| "num_tokens": 8107702374.0, | |
| "step": 76600 | |
| }, | |
| { | |
| "entropy": 1.208125, | |
| "epoch": 1.8767445276920816, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 6.211169567906572e-08, | |
| "loss": 0.1419, | |
| "mean_token_accuracy": 0.9687972629070282, | |
| "num_tokens": 8113119431.0, | |
| "step": 76650 | |
| }, | |
| { | |
| "entropy": 1.20546875, | |
| "epoch": 1.8779687576514372, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 6.08883913593412e-08, | |
| "loss": 0.1354, | |
| "mean_token_accuracy": 0.9701398539543152, | |
| "num_tokens": 8118309412.0, | |
| "step": 76700 | |
| }, | |
| { | |
| "entropy": 1.19796875, | |
| "epoch": 1.879192987610793, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 5.967713028220756e-08, | |
| "loss": 0.1334, | |
| "mean_token_accuracy": 0.9713104116916657, | |
| "num_tokens": 8123346693.0, | |
| "step": 76750 | |
| }, | |
| { | |
| "entropy": 1.2065625, | |
| "epoch": 1.8804172175701483, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 5.8477917410903914e-08, | |
| "loss": 0.1449, | |
| "mean_token_accuracy": 0.968209480047226, | |
| "num_tokens": 8128745782.0, | |
| "step": 76800 | |
| }, | |
| { | |
| "entropy": 1.19703125, | |
| "epoch": 1.8816414475295038, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 5.729075765929925e-08, | |
| "loss": 0.1602, | |
| "mean_token_accuracy": 0.9653090810775757, | |
| "num_tokens": 8133734566.0, | |
| "step": 76850 | |
| }, | |
| { | |
| "entropy": 1.2078125, | |
| "epoch": 1.8828656774888595, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 5.61156558918744e-08, | |
| "loss": 0.1748, | |
| "mean_token_accuracy": 0.9636254405975342, | |
| "num_tokens": 8139112182.0, | |
| "step": 76900 | |
| }, | |
| { | |
| "entropy": 1.19765625, | |
| "epoch": 1.8840899074482151, | |
| "grad_norm": 3.125, | |
| "learning_rate": 5.4952616923703014e-08, | |
| "loss": 0.1508, | |
| "mean_token_accuracy": 0.9667049193382263, | |
| "num_tokens": 8144120297.0, | |
| "step": 76950 | |
| }, | |
| { | |
| "entropy": 1.20921875, | |
| "epoch": 1.8853141374075706, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 5.380164552042832e-08, | |
| "loss": 0.1581, | |
| "mean_token_accuracy": 0.9663659358024597, | |
| "num_tokens": 8149360110.0, | |
| "step": 77000 | |
| }, | |
| { | |
| "entropy": 1.2215625, | |
| "epoch": 1.886538367366926, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 5.266274639824742e-08, | |
| "loss": 0.1807, | |
| "mean_token_accuracy": 0.9613511979579925, | |
| "num_tokens": 8154930968.0, | |
| "step": 77050 | |
| }, | |
| { | |
| "entropy": 1.1940625, | |
| "epoch": 1.8877625973262817, | |
| "grad_norm": 3.390625, | |
| "learning_rate": 5.1535924223889305e-08, | |
| "loss": 0.1593, | |
| "mean_token_accuracy": 0.9654444575309753, | |
| "num_tokens": 8159971112.0, | |
| "step": 77100 | |
| }, | |
| { | |
| "entropy": 1.2128125, | |
| "epoch": 1.8889868272856374, | |
| "grad_norm": 3.328125, | |
| "learning_rate": 5.042118361459724e-08, | |
| "loss": 0.1693, | |
| "mean_token_accuracy": 0.964167617559433, | |
| "num_tokens": 8165136464.0, | |
| "step": 77150 | |
| }, | |
| { | |
| "entropy": 1.20234375, | |
| "epoch": 1.890211057244993, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 4.931852913810875e-08, | |
| "loss": 0.1597, | |
| "mean_token_accuracy": 0.9660988628864289, | |
| "num_tokens": 8170440548.0, | |
| "step": 77200 | |
| }, | |
| { | |
| "entropy": 1.2046875, | |
| "epoch": 1.8914352872043485, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 4.822796531263862e-08, | |
| "loss": 0.163, | |
| "mean_token_accuracy": 0.9647459161281585, | |
| "num_tokens": 8175965156.0, | |
| "step": 77250 | |
| }, | |
| { | |
| "entropy": 1.21484375, | |
| "epoch": 1.892659517163704, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 4.7149496606857966e-08, | |
| "loss": 0.1777, | |
| "mean_token_accuracy": 0.9630069530010223, | |
| "num_tokens": 8181436041.0, | |
| "step": 77300 | |
| }, | |
| { | |
| "entropy": 1.20734375, | |
| "epoch": 1.8938837471230596, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 4.608312743987819e-08, | |
| "loss": 0.1646, | |
| "mean_token_accuracy": 0.9651682090759277, | |
| "num_tokens": 8186577107.0, | |
| "step": 77350 | |
| }, | |
| { | |
| "entropy": 1.2134375, | |
| "epoch": 1.8951079770824153, | |
| "grad_norm": 4.21875, | |
| "learning_rate": 4.50288621812307e-08, | |
| "loss": 0.1701, | |
| "mean_token_accuracy": 0.9638711404800415, | |
| "num_tokens": 8191908989.0, | |
| "step": 77400 | |
| }, | |
| { | |
| "entropy": 1.1978125, | |
| "epoch": 1.8963322070417707, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 4.398670515085157e-08, | |
| "loss": 0.1672, | |
| "mean_token_accuracy": 0.964127391576767, | |
| "num_tokens": 8197252149.0, | |
| "step": 77450 | |
| }, | |
| { | |
| "entropy": 1.2015625, | |
| "epoch": 1.8975564370011262, | |
| "grad_norm": 2.75, | |
| "learning_rate": 4.295666061906156e-08, | |
| "loss": 0.1741, | |
| "mean_token_accuracy": 0.9626425766944885, | |
| "num_tokens": 8202870180.0, | |
| "step": 77500 | |
| }, | |
| { | |
| "entropy": 1.20109375, | |
| "epoch": 1.8987806669604819, | |
| "grad_norm": 4.0625, | |
| "learning_rate": 4.193873280654914e-08, | |
| "loss": 0.1645, | |
| "mean_token_accuracy": 0.964863383769989, | |
| "num_tokens": 8208065173.0, | |
| "step": 77550 | |
| }, | |
| { | |
| "entropy": 1.20234375, | |
| "epoch": 1.9000048969198375, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 4.093292588435549e-08, | |
| "loss": 0.1605, | |
| "mean_token_accuracy": 0.965006741285324, | |
| "num_tokens": 8213242226.0, | |
| "step": 77600 | |
| }, | |
| { | |
| "entropy": 1.20734375, | |
| "epoch": 1.901229126879193, | |
| "grad_norm": 2.0, | |
| "learning_rate": 3.993924397385251e-08, | |
| "loss": 0.1693, | |
| "mean_token_accuracy": 0.9635647284984589, | |
| "num_tokens": 8218628064.0, | |
| "step": 77650 | |
| }, | |
| { | |
| "entropy": 1.21203125, | |
| "epoch": 1.9024533568385484, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 3.895769114673187e-08, | |
| "loss": 0.1657, | |
| "mean_token_accuracy": 0.9649439096450806, | |
| "num_tokens": 8223851321.0, | |
| "step": 77700 | |
| }, | |
| { | |
| "entropy": 1.18859375, | |
| "epoch": 1.903677586797904, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 3.798827142498329e-08, | |
| "loss": 0.1508, | |
| "mean_token_accuracy": 0.9679539859294891, | |
| "num_tokens": 8228778299.0, | |
| "step": 77750 | |
| }, | |
| { | |
| "entropy": 1.20296875, | |
| "epoch": 1.9049018167572598, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 3.7030988780880957e-08, | |
| "loss": 0.1541, | |
| "mean_token_accuracy": 0.966580958366394, | |
| "num_tokens": 8233727662.0, | |
| "step": 77800 | |
| }, | |
| { | |
| "entropy": 1.21453125, | |
| "epoch": 1.9061260467166152, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 3.6085847136966164e-08, | |
| "loss": 0.1622, | |
| "mean_token_accuracy": 0.9650613677501678, | |
| "num_tokens": 8239365249.0, | |
| "step": 77850 | |
| }, | |
| { | |
| "entropy": 1.22, | |
| "epoch": 1.9073502766759707, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 3.515285036603233e-08, | |
| "loss": 0.1736, | |
| "mean_token_accuracy": 0.9626342761516571, | |
| "num_tokens": 8244922468.0, | |
| "step": 77900 | |
| }, | |
| { | |
| "entropy": 1.21125, | |
| "epoch": 1.9085745066353264, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 3.423200229110701e-08, | |
| "loss": 0.1665, | |
| "mean_token_accuracy": 0.9643622922897339, | |
| "num_tokens": 8250033392.0, | |
| "step": 77950 | |
| }, | |
| { | |
| "entropy": 1.20125, | |
| "epoch": 1.909798736594682, | |
| "grad_norm": 3.546875, | |
| "learning_rate": 3.3323306685437926e-08, | |
| "loss": 0.1587, | |
| "mean_token_accuracy": 0.9665237700939179, | |
| "num_tokens": 8255293579.0, | |
| "step": 78000 | |
| }, | |
| { | |
| "entropy": 1.189375, | |
| "epoch": 1.9110229665540375, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 3.242676727247795e-08, | |
| "loss": 0.146, | |
| "mean_token_accuracy": 0.9674337708950043, | |
| "num_tokens": 8260317228.0, | |
| "step": 78050 | |
| }, | |
| { | |
| "entropy": 1.2103125, | |
| "epoch": 1.912247196513393, | |
| "grad_norm": 4.0, | |
| "learning_rate": 3.1542387725868146e-08, | |
| "loss": 0.1651, | |
| "mean_token_accuracy": 0.9643155598640442, | |
| "num_tokens": 8265716396.0, | |
| "step": 78100 | |
| }, | |
| { | |
| "entropy": 1.20078125, | |
| "epoch": 1.9134714264727486, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 3.0670171669423764e-08, | |
| "loss": 0.1625, | |
| "mean_token_accuracy": 0.9650612294673919, | |
| "num_tokens": 8270999547.0, | |
| "step": 78150 | |
| }, | |
| { | |
| "entropy": 1.2115625, | |
| "epoch": 1.9146956564321043, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 2.981012267711858e-08, | |
| "loss": 0.1725, | |
| "mean_token_accuracy": 0.9635538387298584, | |
| "num_tokens": 8276439622.0, | |
| "step": 78200 | |
| }, | |
| { | |
| "entropy": 1.203125, | |
| "epoch": 1.91591988639146, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 2.896224427307226e-08, | |
| "loss": 0.1649, | |
| "mean_token_accuracy": 0.9643189585208893, | |
| "num_tokens": 8281629841.0, | |
| "step": 78250 | |
| }, | |
| { | |
| "entropy": 1.20921875, | |
| "epoch": 1.9171441163508154, | |
| "grad_norm": 3.5, | |
| "learning_rate": 2.8126539931533023e-08, | |
| "loss": 0.1601, | |
| "mean_token_accuracy": 0.9657320499420166, | |
| "num_tokens": 8286850296.0, | |
| "step": 78300 | |
| }, | |
| { | |
| "entropy": 1.2075, | |
| "epoch": 1.9183683463101708, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 2.7303013076866335e-08, | |
| "loss": 0.1675, | |
| "mean_token_accuracy": 0.964200325012207, | |
| "num_tokens": 8292528304.0, | |
| "step": 78350 | |
| }, | |
| { | |
| "entropy": 1.21671875, | |
| "epoch": 1.9195925762695265, | |
| "grad_norm": 4.125, | |
| "learning_rate": 2.6491667083537896e-08, | |
| "loss": 0.1674, | |
| "mean_token_accuracy": 0.9635697185993195, | |
| "num_tokens": 8297851717.0, | |
| "step": 78400 | |
| }, | |
| { | |
| "entropy": 1.203125, | |
| "epoch": 1.9208168062288822, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 2.5692505276102673e-08, | |
| "loss": 0.1639, | |
| "mean_token_accuracy": 0.9647056591510773, | |
| "num_tokens": 8302822545.0, | |
| "step": 78450 | |
| }, | |
| { | |
| "entropy": 1.20234375, | |
| "epoch": 1.9220410361882376, | |
| "grad_norm": 4.125, | |
| "learning_rate": 2.490553092918957e-08, | |
| "loss": 0.167, | |
| "mean_token_accuracy": 0.9645107495784759, | |
| "num_tokens": 8308044186.0, | |
| "step": 78500 | |
| }, | |
| { | |
| "entropy": 1.20390625, | |
| "epoch": 1.923265266147593, | |
| "grad_norm": 4.1875, | |
| "learning_rate": 2.4130747267488096e-08, | |
| "loss": 0.1587, | |
| "mean_token_accuracy": 0.9651757764816284, | |
| "num_tokens": 8313261711.0, | |
| "step": 78550 | |
| }, | |
| { | |
| "entropy": 1.20625, | |
| "epoch": 1.9244894961069487, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 2.3368157465735727e-08, | |
| "loss": 0.1729, | |
| "mean_token_accuracy": 0.9643122732639313, | |
| "num_tokens": 8318954245.0, | |
| "step": 78600 | |
| }, | |
| { | |
| "entropy": 1.21640625, | |
| "epoch": 1.9257137260663044, | |
| "grad_norm": 5.46875, | |
| "learning_rate": 2.261776464870424e-08, | |
| "loss": 0.1712, | |
| "mean_token_accuracy": 0.9633339118957519, | |
| "num_tokens": 8324544756.0, | |
| "step": 78650 | |
| }, | |
| { | |
| "entropy": 1.21515625, | |
| "epoch": 1.9269379560256599, | |
| "grad_norm": 2.875, | |
| "learning_rate": 2.1879571891188054e-08, | |
| "loss": 0.1751, | |
| "mean_token_accuracy": 0.9626336395740509, | |
| "num_tokens": 8329948691.0, | |
| "step": 78700 | |
| }, | |
| { | |
| "entropy": 1.20515625, | |
| "epoch": 1.9281621859850153, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 2.1153582217990574e-08, | |
| "loss": 0.1655, | |
| "mean_token_accuracy": 0.964772834777832, | |
| "num_tokens": 8335173517.0, | |
| "step": 78750 | |
| }, | |
| { | |
| "entropy": 1.2015625, | |
| "epoch": 1.929386415944371, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 2.043979860391154e-08, | |
| "loss": 0.1711, | |
| "mean_token_accuracy": 0.9635234928131103, | |
| "num_tokens": 8340379735.0, | |
| "step": 78800 | |
| }, | |
| { | |
| "entropy": 1.1909375, | |
| "epoch": 1.9306106459037267, | |
| "grad_norm": 3.703125, | |
| "learning_rate": 1.9738223973735702e-08, | |
| "loss": 0.1559, | |
| "mean_token_accuracy": 0.9672637641429901, | |
| "num_tokens": 8345381104.0, | |
| "step": 78850 | |
| }, | |
| { | |
| "entropy": 1.21375, | |
| "epoch": 1.9318348758630821, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 1.9048861202221823e-08, | |
| "loss": 0.1681, | |
| "mean_token_accuracy": 0.9651576709747315, | |
| "num_tokens": 8350559447.0, | |
| "step": 78900 | |
| }, | |
| { | |
| "entropy": 1.21234375, | |
| "epoch": 1.9330591058224376, | |
| "grad_norm": 3.9375, | |
| "learning_rate": 1.8371713114086697e-08, | |
| "loss": 0.1652, | |
| "mean_token_accuracy": 0.9637591278553009, | |
| "num_tokens": 8355928028.0, | |
| "step": 78950 | |
| }, | |
| { | |
| "entropy": 1.20640625, | |
| "epoch": 1.9342833357817932, | |
| "grad_norm": 0.4453125, | |
| "learning_rate": 1.770678248399982e-08, | |
| "loss": 0.1621, | |
| "mean_token_accuracy": 0.9652046132087707, | |
| "num_tokens": 8361366979.0, | |
| "step": 79000 | |
| }, | |
| { | |
| "entropy": 1.20453125, | |
| "epoch": 1.935507565741149, | |
| "grad_norm": 4.1875, | |
| "learning_rate": 1.7054072036566394e-08, | |
| "loss": 0.1685, | |
| "mean_token_accuracy": 0.9641025936603547, | |
| "num_tokens": 8366288409.0, | |
| "step": 79050 | |
| }, | |
| { | |
| "entropy": 1.2125, | |
| "epoch": 1.9367317957005044, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.6413584446319018e-08, | |
| "loss": 0.1632, | |
| "mean_token_accuracy": 0.9653700625896454, | |
| "num_tokens": 8371880970.0, | |
| "step": 79100 | |
| }, | |
| { | |
| "entropy": 1.20578125, | |
| "epoch": 1.9379560256598598, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 1.5785322337706688e-08, | |
| "loss": 0.164, | |
| "mean_token_accuracy": 0.9650757694244385, | |
| "num_tokens": 8377110509.0, | |
| "step": 79150 | |
| }, | |
| { | |
| "entropy": 1.20046875, | |
| "epoch": 1.9391802556192155, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.5169288285082793e-08, | |
| "loss": 0.1631, | |
| "mean_token_accuracy": 0.9651304471492768, | |
| "num_tokens": 8382268459.0, | |
| "step": 79200 | |
| }, | |
| { | |
| "entropy": 1.2075, | |
| "epoch": 1.9404044855785711, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 1.4565484812696151e-08, | |
| "loss": 0.155, | |
| "mean_token_accuracy": 0.9661095356941223, | |
| "num_tokens": 8387474552.0, | |
| "step": 79250 | |
| }, | |
| { | |
| "entropy": 1.1728125, | |
| "epoch": 1.9416287155379266, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 1.3973914394678655e-08, | |
| "loss": 0.1379, | |
| "mean_token_accuracy": 0.9702671027183533, | |
| "num_tokens": 8392280218.0, | |
| "step": 79300 | |
| }, | |
| { | |
| "entropy": 1.21859375, | |
| "epoch": 1.942852945497282, | |
| "grad_norm": 3.828125, | |
| "learning_rate": 1.3394579455037637e-08, | |
| "loss": 0.1586, | |
| "mean_token_accuracy": 0.9652379488945008, | |
| "num_tokens": 8397377045.0, | |
| "step": 79350 | |
| }, | |
| { | |
| "entropy": 1.20515625, | |
| "epoch": 1.9440771754566377, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 1.2827482367643862e-08, | |
| "loss": 0.1537, | |
| "mean_token_accuracy": 0.9671237909793854, | |
| "num_tokens": 8402675630.0, | |
| "step": 79400 | |
| }, | |
| { | |
| "entropy": 1.18984375, | |
| "epoch": 1.9453014054159934, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 1.2272625456221875e-08, | |
| "loss": 0.1511, | |
| "mean_token_accuracy": 0.9674056422710419, | |
| "num_tokens": 8407470922.0, | |
| "step": 79450 | |
| }, | |
| { | |
| "entropy": 1.22015625, | |
| "epoch": 1.946525635375349, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 1.1730010994342344e-08, | |
| "loss": 0.1683, | |
| "mean_token_accuracy": 0.963656575679779, | |
| "num_tokens": 8413030681.0, | |
| "step": 79500 | |
| }, | |
| { | |
| "entropy": 1.2075, | |
| "epoch": 1.9477498653347045, | |
| "grad_norm": 3.703125, | |
| "learning_rate": 1.1199641205410727e-08, | |
| "loss": 0.1676, | |
| "mean_token_accuracy": 0.9641730666160584, | |
| "num_tokens": 8418435608.0, | |
| "step": 79550 | |
| }, | |
| { | |
| "entropy": 1.20109375, | |
| "epoch": 1.94897409529406, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 1.0681518262659618e-08, | |
| "loss": 0.1612, | |
| "mean_token_accuracy": 0.9652375304698944, | |
| "num_tokens": 8423410030.0, | |
| "step": 79600 | |
| }, | |
| { | |
| "entropy": 1.19328125, | |
| "epoch": 1.9501983252534156, | |
| "grad_norm": 4.3125, | |
| "learning_rate": 1.0175644289138419e-08, | |
| "loss": 0.1565, | |
| "mean_token_accuracy": 0.9664306437969208, | |
| "num_tokens": 8428505734.0, | |
| "step": 79650 | |
| }, | |
| { | |
| "entropy": 1.19609375, | |
| "epoch": 1.9514225552127713, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 9.682021357706018e-09, | |
| "loss": 0.1491, | |
| "mean_token_accuracy": 0.968139351606369, | |
| "num_tokens": 8433821851.0, | |
| "step": 79700 | |
| }, | |
| { | |
| "entropy": 1.19390625, | |
| "epoch": 1.9526467851721268, | |
| "grad_norm": 3.53125, | |
| "learning_rate": 9.20065149102145e-09, | |
| "loss": 0.1566, | |
| "mean_token_accuracy": 0.9661858582496643, | |
| "num_tokens": 8438988974.0, | |
| "step": 79750 | |
| }, | |
| { | |
| "entropy": 1.20546875, | |
| "epoch": 1.9538710151314822, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 8.731536661535588e-09, | |
| "loss": 0.1691, | |
| "mean_token_accuracy": 0.9629131543636322, | |
| "num_tokens": 8444297546.0, | |
| "step": 79800 | |
| }, | |
| { | |
| "entropy": 1.20671875, | |
| "epoch": 1.9550952450908379, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 8.274678791484136e-09, | |
| "loss": 0.1603, | |
| "mean_token_accuracy": 0.9652000117301941, | |
| "num_tokens": 8449852335.0, | |
| "step": 79850 | |
| }, | |
| { | |
| "entropy": 1.18640625, | |
| "epoch": 1.9563194750501935, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 7.830079752877973e-09, | |
| "loss": 0.1394, | |
| "mean_token_accuracy": 0.9697746348381042, | |
| "num_tokens": 8454775071.0, | |
| "step": 79900 | |
| }, | |
| { | |
| "entropy": 1.2021875, | |
| "epoch": 1.957543705009549, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 7.397741367497157e-09, | |
| "loss": 0.1613, | |
| "mean_token_accuracy": 0.9663744091987609, | |
| "num_tokens": 8460122393.0, | |
| "step": 79950 | |
| }, | |
| { | |
| "entropy": 1.21125, | |
| "epoch": 1.9587679349689044, | |
| "grad_norm": 3.125, | |
| "learning_rate": 6.977665406882272e-09, | |
| "loss": 0.1689, | |
| "mean_token_accuracy": 0.9630868649482727, | |
| "num_tokens": 8465752957.0, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 1.9587679349689044, | |
| "eval_entropy": 1.2009765625, | |
| "eval_loss": 0.17771507799625397, | |
| "eval_mean_token_accuracy": 0.9620104561249415, | |
| "eval_num_tokens": 8465752957.0, | |
| "eval_runtime": 611.9165, | |
| "eval_samples_per_second": 15.78, | |
| "eval_steps_per_second": 0.198, | |
| "step": 80000 | |
| }, | |
| { | |
| "entropy": 1.21203125, | |
| "epoch": 1.9599921649282601, | |
| "grad_norm": 3.421875, | |
| "learning_rate": 6.569853592327757e-09, | |
| "loss": 0.1792, | |
| "mean_token_accuracy": 0.9620552754402161, | |
| "num_tokens": 8471172063.0, | |
| "step": 80050 | |
| }, | |
| { | |
| "entropy": 1.200625, | |
| "epoch": 1.9612163948876158, | |
| "grad_norm": 3.28125, | |
| "learning_rate": 6.174307594874917e-09, | |
| "loss": 0.1558, | |
| "mean_token_accuracy": 0.9663512742519379, | |
| "num_tokens": 8476017366.0, | |
| "step": 80100 | |
| }, | |
| { | |
| "entropy": 1.195, | |
| "epoch": 1.9624406248469712, | |
| "grad_norm": 3.65625, | |
| "learning_rate": 5.7910290353049285e-09, | |
| "loss": 0.1529, | |
| "mean_token_accuracy": 0.9670116317272186, | |
| "num_tokens": 8481042255.0, | |
| "step": 80150 | |
| }, | |
| { | |
| "entropy": 1.18890625, | |
| "epoch": 1.9636648548063267, | |
| "grad_norm": 2.0, | |
| "learning_rate": 5.420019484131844e-09, | |
| "loss": 0.1608, | |
| "mean_token_accuracy": 0.9656985890865326, | |
| "num_tokens": 8486092212.0, | |
| "step": 80200 | |
| }, | |
| { | |
| "entropy": 1.209375, | |
| "epoch": 1.9648890847656824, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 5.061280461596929e-09, | |
| "loss": 0.1747, | |
| "mean_token_accuracy": 0.962527574300766, | |
| "num_tokens": 8491676835.0, | |
| "step": 80250 | |
| }, | |
| { | |
| "entropy": 1.20671875, | |
| "epoch": 1.966113314725038, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 4.714813437661336e-09, | |
| "loss": 0.1636, | |
| "mean_token_accuracy": 0.9649911904335022, | |
| "num_tokens": 8497110970.0, | |
| "step": 80300 | |
| }, | |
| { | |
| "entropy": 1.2090625, | |
| "epoch": 1.9673375446843935, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 4.380619832001775e-09, | |
| "loss": 0.1698, | |
| "mean_token_accuracy": 0.9634296333789826, | |
| "num_tokens": 8502537441.0, | |
| "step": 80350 | |
| }, | |
| { | |
| "entropy": 1.20875, | |
| "epoch": 1.968561774643749, | |
| "grad_norm": 4.9375, | |
| "learning_rate": 4.058701014002187e-09, | |
| "loss": 0.1637, | |
| "mean_token_accuracy": 0.9648308408260345, | |
| "num_tokens": 8507630732.0, | |
| "step": 80400 | |
| }, | |
| { | |
| "entropy": 1.20390625, | |
| "epoch": 1.9697860046031046, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 3.749058302751074e-09, | |
| "loss": 0.1531, | |
| "mean_token_accuracy": 0.9671729254722595, | |
| "num_tokens": 8512848795.0, | |
| "step": 80450 | |
| }, | |
| { | |
| "entropy": 1.2090625, | |
| "epoch": 1.9710102345624603, | |
| "grad_norm": 3.984375, | |
| "learning_rate": 3.451692967033848e-09, | |
| "loss": 0.1643, | |
| "mean_token_accuracy": 0.9643464314937592, | |
| "num_tokens": 8518368412.0, | |
| "step": 80500 | |
| }, | |
| { | |
| "entropy": 1.20875, | |
| "epoch": 1.972234464521816, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 3.1666062253284942e-09, | |
| "loss": 0.1677, | |
| "mean_token_accuracy": 0.9644135737419128, | |
| "num_tokens": 8523653890.0, | |
| "step": 80550 | |
| }, | |
| { | |
| "entropy": 1.2159375, | |
| "epoch": 1.9734586944811714, | |
| "grad_norm": 3.875, | |
| "learning_rate": 2.893799245800244e-09, | |
| "loss": 0.166, | |
| "mean_token_accuracy": 0.9650824117660523, | |
| "num_tokens": 8529015592.0, | |
| "step": 80600 | |
| }, | |
| { | |
| "entropy": 1.2, | |
| "epoch": 1.9746829244405268, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 2.633273146297577e-09, | |
| "loss": 0.1591, | |
| "mean_token_accuracy": 0.9664743864536285, | |
| "num_tokens": 8534375896.0, | |
| "step": 80650 | |
| }, | |
| { | |
| "entropy": 1.21421875, | |
| "epoch": 1.9759071543998825, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 2.385028994346894e-09, | |
| "loss": 0.1685, | |
| "mean_token_accuracy": 0.9637958765029907, | |
| "num_tokens": 8539783916.0, | |
| "step": 80700 | |
| }, | |
| { | |
| "entropy": 1.19375, | |
| "epoch": 1.9771313843592382, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 2.149067807147853e-09, | |
| "loss": 0.1589, | |
| "mean_token_accuracy": 0.9659115636348724, | |
| "num_tokens": 8544696959.0, | |
| "step": 80750 | |
| }, | |
| { | |
| "entropy": 1.21125, | |
| "epoch": 1.9783556143185936, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.925390551570705e-09, | |
| "loss": 0.1649, | |
| "mean_token_accuracy": 0.9644637072086334, | |
| "num_tokens": 8550159905.0, | |
| "step": 80800 | |
| }, | |
| { | |
| "entropy": 1.19796875, | |
| "epoch": 1.979579844277949, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.7139981441502973e-09, | |
| "loss": 0.1526, | |
| "mean_token_accuracy": 0.9669871032238007, | |
| "num_tokens": 8555193317.0, | |
| "step": 80850 | |
| }, | |
| { | |
| "entropy": 1.1953125, | |
| "epoch": 1.9808040742373048, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 1.514891451083744e-09, | |
| "loss": 0.1676, | |
| "mean_token_accuracy": 0.9646944868564605, | |
| "num_tokens": 8560635630.0, | |
| "step": 80900 | |
| }, | |
| { | |
| "entropy": 1.21921875, | |
| "epoch": 1.9820283041966604, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 1.328071288226762e-09, | |
| "loss": 0.1694, | |
| "mean_token_accuracy": 0.9641423618793488, | |
| "num_tokens": 8566246965.0, | |
| "step": 80950 | |
| }, | |
| { | |
| "entropy": 1.1896875, | |
| "epoch": 1.9832525341560159, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.1535384210893395e-09, | |
| "loss": 0.1436, | |
| "mean_token_accuracy": 0.9696673655509949, | |
| "num_tokens": 8571430149.0, | |
| "step": 81000 | |
| }, | |
| { | |
| "entropy": 1.2009375, | |
| "epoch": 1.9844767641153713, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 9.912935648344057e-10, | |
| "loss": 0.1667, | |
| "mean_token_accuracy": 0.9643544840812683, | |
| "num_tokens": 8576922262.0, | |
| "step": 81050 | |
| }, | |
| { | |
| "entropy": 1.208125, | |
| "epoch": 1.985700994074727, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 8.413373842721672e-10, | |
| "loss": 0.1569, | |
| "mean_token_accuracy": 0.9652732384204864, | |
| "num_tokens": 8582076472.0, | |
| "step": 81100 | |
| }, | |
| { | |
| "entropy": 1.20875, | |
| "epoch": 1.9869252240340827, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 7.036704938611083e-10, | |
| "loss": 0.1691, | |
| "mean_token_accuracy": 0.9644241857528687, | |
| "num_tokens": 8587501762.0, | |
| "step": 81150 | |
| }, | |
| { | |
| "entropy": 1.17515625, | |
| "epoch": 1.9881494539934381, | |
| "grad_norm": 3.0, | |
| "learning_rate": 5.782934577009957e-10, | |
| "loss": 0.1391, | |
| "mean_token_accuracy": 0.9696795284748078, | |
| "num_tokens": 8592212871.0, | |
| "step": 81200 | |
| }, | |
| { | |
| "entropy": 1.195, | |
| "epoch": 1.9893736839527936, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 4.652067895352108e-10, | |
| "loss": 0.1522, | |
| "mean_token_accuracy": 0.9666969799995422, | |
| "num_tokens": 8597378423.0, | |
| "step": 81250 | |
| }, | |
| { | |
| "entropy": 1.198125, | |
| "epoch": 1.9905979139121492, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 3.644109527447537e-10, | |
| "loss": 0.1695, | |
| "mean_token_accuracy": 0.9640089082717895, | |
| "num_tokens": 8602514679.0, | |
| "step": 81300 | |
| }, | |
| { | |
| "entropy": 1.20984375, | |
| "epoch": 1.991822143871505, | |
| "grad_norm": 4.5, | |
| "learning_rate": 2.7590636034857675e-10, | |
| "loss": 0.1634, | |
| "mean_token_accuracy": 0.964598093032837, | |
| "num_tokens": 8607950288.0, | |
| "step": 81350 | |
| }, | |
| { | |
| "entropy": 1.21296875, | |
| "epoch": 1.9930463738308604, | |
| "grad_norm": 3.625, | |
| "learning_rate": 1.9969337500125308e-10, | |
| "loss": 0.166, | |
| "mean_token_accuracy": 0.9647457122802734, | |
| "num_tokens": 8613281748.0, | |
| "step": 81400 | |
| }, | |
| { | |
| "entropy": 1.21078125, | |
| "epoch": 1.9942706037902158, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.3577230899197712e-10, | |
| "loss": 0.1541, | |
| "mean_token_accuracy": 0.9666912174224853, | |
| "num_tokens": 8618635116.0, | |
| "step": 81450 | |
| }, | |
| { | |
| "entropy": 1.18859375, | |
| "epoch": 1.9954948337495715, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 8.414342424156729e-11, | |
| "loss": 0.149, | |
| "mean_token_accuracy": 0.9679227757453919, | |
| "num_tokens": 8623674169.0, | |
| "step": 81500 | |
| }, | |
| { | |
| "entropy": 1.18640625, | |
| "epoch": 1.9967190637089272, | |
| "grad_norm": 2.75, | |
| "learning_rate": 4.48069323044642e-11, | |
| "loss": 0.1535, | |
| "mean_token_accuracy": 0.966040461063385, | |
| "num_tokens": 8628848521.0, | |
| "step": 81550 | |
| }, | |
| { | |
| "entropy": 1.1934375, | |
| "epoch": 1.9979432936682826, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 1.776299436406781e-11, | |
| "loss": 0.1668, | |
| "mean_token_accuracy": 0.9646425199508667, | |
| "num_tokens": 8634212914.0, | |
| "step": 81600 | |
| }, | |
| { | |
| "entropy": 1.1825, | |
| "epoch": 1.999167523627638, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 3.0117212357350098e-12, | |
| "loss": 0.147, | |
| "mean_token_accuracy": 0.9685306799411774, | |
| "num_tokens": 8639315101.0, | |
| "step": 81650 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 81684, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 10000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.811857454193967e+19, | |
| "train_batch_size": 12, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |