Instructions to use roonbug/m630hfii with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use roonbug/m630hfii with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="roonbug/m630hfii")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)
```

```python
# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("roonbug/m630hfii")
model = AutoModelForImageTextToText.from_pretrained("roonbug/m630hfii")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use roonbug/m630hfii with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "roonbug/m630hfii"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "roonbug/m630hfii",
    "messages": [
      {
        "role": "user",
        "content": [
          { "type": "text", "text": "Describe this image in one sentence." },
          { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
        ]
      }
    ]
  }'
```
Use Docker
docker model run hf.co/roonbug/m630hfii
- SGLang
How to use roonbug/m630hfii with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "roonbug/m630hfii" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "roonbug/m630hfii",
    "messages": [
      {
        "role": "user",
        "content": [
          { "type": "text", "text": "Describe this image in one sentence." },
          { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
        ]
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "roonbug/m630hfii" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "roonbug/m630hfii",
    "messages": [
      {
        "role": "user",
        "content": [
          { "type": "text", "text": "Describe this image in one sentence." },
          { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
        ]
      }
    ]
  }'
```
- Docker Model Runner
How to use roonbug/m630hfii with Docker Model Runner:
docker model run hf.co/roonbug/m630hfii
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.8, | |
| "eval_steps": 100, | |
| "global_step": 2000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 0.6734127338975668, | |
| "epoch": 0.004, | |
| "grad_norm": 1000.0, | |
| "learning_rate": 1.8e-07, | |
| "loss": 28.5463, | |
| "mean_token_accuracy": 0.7051723279058933, | |
| "num_tokens": 89176.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 0.7554282372817397, | |
| "epoch": 0.008, | |
| "grad_norm": 848.0, | |
| "learning_rate": 3.8e-07, | |
| "loss": 30.5507, | |
| "mean_token_accuracy": 0.6821797413751483, | |
| "num_tokens": 173886.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 0.7452052749693394, | |
| "epoch": 0.012, | |
| "grad_norm": 700.0, | |
| "learning_rate": 5.800000000000001e-07, | |
| "loss": 28.0843, | |
| "mean_token_accuracy": 0.6968917248770594, | |
| "num_tokens": 258970.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 0.8406938400119544, | |
| "epoch": 0.016, | |
| "grad_norm": 588.0, | |
| "learning_rate": 7.8e-07, | |
| "loss": 28.3624, | |
| "mean_token_accuracy": 0.6884651094675064, | |
| "num_tokens": 338352.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 0.8782084910199046, | |
| "epoch": 0.02, | |
| "grad_norm": 556.0, | |
| "learning_rate": 9.800000000000001e-07, | |
| "loss": 25.7081, | |
| "mean_token_accuracy": 0.7061796801164746, | |
| "num_tokens": 424243.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.9611807337030769, | |
| "epoch": 0.024, | |
| "grad_norm": 456.0, | |
| "learning_rate": 1.1800000000000001e-06, | |
| "loss": 25.387, | |
| "mean_token_accuracy": 0.6989632658660412, | |
| "num_tokens": 505613.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.9614389628171921, | |
| "epoch": 0.028, | |
| "grad_norm": 290.0, | |
| "learning_rate": 1.3800000000000001e-06, | |
| "loss": 22.9903, | |
| "mean_token_accuracy": 0.7096886133775115, | |
| "num_tokens": 584157.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 0.9275373946875334, | |
| "epoch": 0.032, | |
| "grad_norm": 161.0, | |
| "learning_rate": 1.5800000000000001e-06, | |
| "loss": 20.0946, | |
| "mean_token_accuracy": 0.7300519198179245, | |
| "num_tokens": 671193.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 1.0108212113380433, | |
| "epoch": 0.036, | |
| "grad_norm": 217.0, | |
| "learning_rate": 1.7800000000000001e-06, | |
| "loss": 20.6725, | |
| "mean_token_accuracy": 0.7165856244042516, | |
| "num_tokens": 753947.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.9366745728999376, | |
| "epoch": 0.04, | |
| "grad_norm": 162.0, | |
| "learning_rate": 1.98e-06, | |
| "loss": 18.0538, | |
| "mean_token_accuracy": 0.7409927822649479, | |
| "num_tokens": 835868.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "eval_biology_entropy": 1.5465301005840302, | |
| "eval_biology_loss": 1.9649326801300049, | |
| "eval_biology_mean_token_accuracy": 0.5973824737071991, | |
| "eval_biology_num_tokens": 835868.0, | |
| "eval_biology_runtime": 35.5773, | |
| "eval_biology_samples_per_second": 14.054, | |
| "eval_biology_steps_per_second": 14.054, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "eval_chemistry_entropy": 1.0573365586400032, | |
| "eval_chemistry_loss": 1.2894866466522217, | |
| "eval_chemistry_mean_token_accuracy": 0.7121288641691208, | |
| "eval_chemistry_num_tokens": 835868.0, | |
| "eval_chemistry_runtime": 39.6571, | |
| "eval_chemistry_samples_per_second": 12.608, | |
| "eval_chemistry_steps_per_second": 12.608, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "eval_math_entropy": 0.8270023443698883, | |
| "eval_math_loss": 1.1661900281906128, | |
| "eval_math_mean_token_accuracy": 0.737815534889698, | |
| "eval_math_num_tokens": 835868.0, | |
| "eval_math_runtime": 40.5371, | |
| "eval_math_samples_per_second": 12.334, | |
| "eval_math_steps_per_second": 12.334, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "eval_physics_entropy": 0.9262812133431435, | |
| "eval_physics_loss": 1.1149247884750366, | |
| "eval_physics_mean_token_accuracy": 0.7413808185458183, | |
| "eval_physics_num_tokens": 835868.0, | |
| "eval_physics_runtime": 45.1205, | |
| "eval_physics_samples_per_second": 11.081, | |
| "eval_physics_steps_per_second": 11.081, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.8625543506816029, | |
| "epoch": 0.044, | |
| "grad_norm": 141.0, | |
| "learning_rate": 2.1800000000000003e-06, | |
| "loss": 16.1132, | |
| "mean_token_accuracy": 0.7616209197789431, | |
| "num_tokens": 917534.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 0.8609532468020916, | |
| "epoch": 0.048, | |
| "grad_norm": 142.0, | |
| "learning_rate": 2.38e-06, | |
| "loss": 15.8038, | |
| "mean_token_accuracy": 0.7642258133739233, | |
| "num_tokens": 1001008.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.8615515833720565, | |
| "epoch": 0.052, | |
| "grad_norm": 110.0, | |
| "learning_rate": 2.5800000000000003e-06, | |
| "loss": 15.6256, | |
| "mean_token_accuracy": 0.7626194018870592, | |
| "num_tokens": 1085098.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 0.9037158312276006, | |
| "epoch": 0.056, | |
| "grad_norm": 122.5, | |
| "learning_rate": 2.7800000000000005e-06, | |
| "loss": 16.4938, | |
| "mean_token_accuracy": 0.7565715182572603, | |
| "num_tokens": 1166615.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.8950596721842885, | |
| "epoch": 0.06, | |
| "grad_norm": 134.0, | |
| "learning_rate": 2.9800000000000003e-06, | |
| "loss": 14.5001, | |
| "mean_token_accuracy": 0.7665418200194836, | |
| "num_tokens": 1248387.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.961725745536387, | |
| "epoch": 0.064, | |
| "grad_norm": 100.0, | |
| "learning_rate": 3.1800000000000005e-06, | |
| "loss": 15.127, | |
| "mean_token_accuracy": 0.7604502853006124, | |
| "num_tokens": 1331208.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 0.881209403835237, | |
| "epoch": 0.068, | |
| "grad_norm": 98.5, | |
| "learning_rate": 3.3800000000000007e-06, | |
| "loss": 13.694, | |
| "mean_token_accuracy": 0.7777196705341339, | |
| "num_tokens": 1416841.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 0.8587444640696049, | |
| "epoch": 0.072, | |
| "grad_norm": 109.0, | |
| "learning_rate": 3.58e-06, | |
| "loss": 13.3488, | |
| "mean_token_accuracy": 0.7776382105425, | |
| "num_tokens": 1499449.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 0.8419531056657433, | |
| "epoch": 0.076, | |
| "grad_norm": 89.5, | |
| "learning_rate": 3.7800000000000002e-06, | |
| "loss": 13.184, | |
| "mean_token_accuracy": 0.7732796184718609, | |
| "num_tokens": 1585982.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 0.8316561704501509, | |
| "epoch": 0.08, | |
| "grad_norm": 83.0, | |
| "learning_rate": 3.980000000000001e-06, | |
| "loss": 13.2192, | |
| "mean_token_accuracy": 0.7721541322767734, | |
| "num_tokens": 1667532.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "eval_biology_entropy": 1.3391264139413834, | |
| "eval_biology_loss": 1.495794653892517, | |
| "eval_biology_mean_token_accuracy": 0.6435496712327003, | |
| "eval_biology_num_tokens": 1667532.0, | |
| "eval_biology_runtime": 35.2419, | |
| "eval_biology_samples_per_second": 14.188, | |
| "eval_biology_steps_per_second": 14.188, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "eval_chemistry_entropy": 0.9186333262324333, | |
| "eval_chemistry_loss": 0.9441949725151062, | |
| "eval_chemistry_mean_token_accuracy": 0.749591316819191, | |
| "eval_chemistry_num_tokens": 1667532.0, | |
| "eval_chemistry_runtime": 39.611, | |
| "eval_chemistry_samples_per_second": 12.623, | |
| "eval_chemistry_steps_per_second": 12.623, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "eval_math_entropy": 0.7541183361709118, | |
| "eval_math_loss": 0.8956651091575623, | |
| "eval_math_mean_token_accuracy": 0.7658155815601349, | |
| "eval_math_num_tokens": 1667532.0, | |
| "eval_math_runtime": 40.4594, | |
| "eval_math_samples_per_second": 12.358, | |
| "eval_math_steps_per_second": 12.358, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "eval_physics_entropy": 0.8000644188523293, | |
| "eval_physics_loss": 0.8156371116638184, | |
| "eval_physics_mean_token_accuracy": 0.7777305005192756, | |
| "eval_physics_num_tokens": 1667532.0, | |
| "eval_physics_runtime": 45.1972, | |
| "eval_physics_samples_per_second": 11.063, | |
| "eval_physics_steps_per_second": 11.063, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.7765617506578565, | |
| "epoch": 0.084, | |
| "grad_norm": 79.0, | |
| "learning_rate": 4.18e-06, | |
| "loss": 12.5271, | |
| "mean_token_accuracy": 0.7859760526567697, | |
| "num_tokens": 1750813.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 0.8027046866714954, | |
| "epoch": 0.088, | |
| "grad_norm": 82.0, | |
| "learning_rate": 4.38e-06, | |
| "loss": 12.8324, | |
| "mean_token_accuracy": 0.7811258573085069, | |
| "num_tokens": 1835489.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.8210387529805303, | |
| "epoch": 0.092, | |
| "grad_norm": 93.0, | |
| "learning_rate": 4.58e-06, | |
| "loss": 13.2166, | |
| "mean_token_accuracy": 0.7749050311744213, | |
| "num_tokens": 1915115.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 0.7716038260608912, | |
| "epoch": 0.096, | |
| "grad_norm": 106.5, | |
| "learning_rate": 4.78e-06, | |
| "loss": 12.3448, | |
| "mean_token_accuracy": 0.7872315399348736, | |
| "num_tokens": 2007176.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 0.7926006538793444, | |
| "epoch": 0.1, | |
| "grad_norm": 73.0, | |
| "learning_rate": 4.980000000000001e-06, | |
| "loss": 12.647, | |
| "mean_token_accuracy": 0.7829932551831007, | |
| "num_tokens": 2088696.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.8025551496073604, | |
| "epoch": 0.104, | |
| "grad_norm": 76.5, | |
| "learning_rate": 5.18e-06, | |
| "loss": 12.8826, | |
| "mean_token_accuracy": 0.7798444323241711, | |
| "num_tokens": 2171335.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.7691786218434572, | |
| "epoch": 0.108, | |
| "grad_norm": 90.5, | |
| "learning_rate": 5.380000000000001e-06, | |
| "loss": 12.3875, | |
| "mean_token_accuracy": 0.7863950747996569, | |
| "num_tokens": 2254168.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 0.7568587090820074, | |
| "epoch": 0.112, | |
| "grad_norm": 102.5, | |
| "learning_rate": 5.580000000000001e-06, | |
| "loss": 12.0152, | |
| "mean_token_accuracy": 0.7904921755194664, | |
| "num_tokens": 2342259.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.7691523030400276, | |
| "epoch": 0.116, | |
| "grad_norm": 81.0, | |
| "learning_rate": 5.78e-06, | |
| "loss": 12.3692, | |
| "mean_token_accuracy": 0.7872932553291321, | |
| "num_tokens": 2426289.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 0.8102743584662676, | |
| "epoch": 0.12, | |
| "grad_norm": 72.5, | |
| "learning_rate": 5.98e-06, | |
| "loss": 12.869, | |
| "mean_token_accuracy": 0.7774799339473247, | |
| "num_tokens": 2504279.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "eval_biology_entropy": 1.2733513461351396, | |
| "eval_biology_loss": 1.4364651441574097, | |
| "eval_biology_mean_token_accuracy": 0.6550095283985138, | |
| "eval_biology_num_tokens": 2504279.0, | |
| "eval_biology_runtime": 35.2012, | |
| "eval_biology_samples_per_second": 14.204, | |
| "eval_biology_steps_per_second": 14.204, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "eval_chemistry_entropy": 0.8685060097575188, | |
| "eval_chemistry_loss": 0.891410768032074, | |
| "eval_chemistry_mean_token_accuracy": 0.7606868423819542, | |
| "eval_chemistry_num_tokens": 2504279.0, | |
| "eval_chemistry_runtime": 39.6393, | |
| "eval_chemistry_samples_per_second": 12.614, | |
| "eval_chemistry_steps_per_second": 12.614, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "eval_math_entropy": 0.7212547466754914, | |
| "eval_math_loss": 0.8686274886131287, | |
| "eval_math_mean_token_accuracy": 0.7728212839365005, | |
| "eval_math_num_tokens": 2504279.0, | |
| "eval_math_runtime": 40.496, | |
| "eval_math_samples_per_second": 12.347, | |
| "eval_math_steps_per_second": 12.347, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "eval_physics_entropy": 0.7448382738828659, | |
| "eval_physics_loss": 0.75321364402771, | |
| "eval_physics_mean_token_accuracy": 0.7901370082497596, | |
| "eval_physics_num_tokens": 2504279.0, | |
| "eval_physics_runtime": 45.1204, | |
| "eval_physics_samples_per_second": 11.081, | |
| "eval_physics_steps_per_second": 11.081, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.785609095916152, | |
| "epoch": 0.124, | |
| "grad_norm": 72.5, | |
| "learning_rate": 6.18e-06, | |
| "loss": 12.587, | |
| "mean_token_accuracy": 0.7802519340068101, | |
| "num_tokens": 2583599.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 0.7176813660189509, | |
| "epoch": 0.128, | |
| "grad_norm": 68.0, | |
| "learning_rate": 6.380000000000001e-06, | |
| "loss": 11.598, | |
| "mean_token_accuracy": 0.7966837007552385, | |
| "num_tokens": 2663987.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.7790673697367311, | |
| "epoch": 0.132, | |
| "grad_norm": 80.0, | |
| "learning_rate": 6.5800000000000005e-06, | |
| "loss": 12.5265, | |
| "mean_token_accuracy": 0.7850641790777445, | |
| "num_tokens": 2743933.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 0.7665272958576679, | |
| "epoch": 0.136, | |
| "grad_norm": 58.0, | |
| "learning_rate": 6.780000000000001e-06, | |
| "loss": 12.1612, | |
| "mean_token_accuracy": 0.792558753862977, | |
| "num_tokens": 2829720.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.7513320775702595, | |
| "epoch": 0.14, | |
| "grad_norm": 63.5, | |
| "learning_rate": 6.98e-06, | |
| "loss": 12.1271, | |
| "mean_token_accuracy": 0.7889207687228919, | |
| "num_tokens": 2915814.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.7429860392585397, | |
| "epoch": 0.144, | |
| "grad_norm": 93.0, | |
| "learning_rate": 7.180000000000001e-06, | |
| "loss": 11.8571, | |
| "mean_token_accuracy": 0.7946608085185289, | |
| "num_tokens": 2997824.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.7470190897583961, | |
| "epoch": 0.148, | |
| "grad_norm": 78.0, | |
| "learning_rate": 7.3800000000000005e-06, | |
| "loss": 12.0544, | |
| "mean_token_accuracy": 0.7908409368246794, | |
| "num_tokens": 3076422.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 0.705881585367024, | |
| "epoch": 0.152, | |
| "grad_norm": 67.0, | |
| "learning_rate": 7.58e-06, | |
| "loss": 11.3247, | |
| "mean_token_accuracy": 0.799928180873394, | |
| "num_tokens": 3159946.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.7021555813029409, | |
| "epoch": 0.156, | |
| "grad_norm": 71.0, | |
| "learning_rate": 7.78e-06, | |
| "loss": 11.1229, | |
| "mean_token_accuracy": 0.8013802103698253, | |
| "num_tokens": 3247218.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.7140809871256352, | |
| "epoch": 0.16, | |
| "grad_norm": 70.5, | |
| "learning_rate": 7.980000000000002e-06, | |
| "loss": 11.5837, | |
| "mean_token_accuracy": 0.7982432089745999, | |
| "num_tokens": 3330597.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_biology_entropy": 1.2522204325199127, | |
| "eval_biology_loss": 1.4104059934616089, | |
| "eval_biology_mean_token_accuracy": 0.6601863117814064, | |
| "eval_biology_num_tokens": 3330597.0, | |
| "eval_biology_runtime": 35.3004, | |
| "eval_biology_samples_per_second": 14.164, | |
| "eval_biology_steps_per_second": 14.164, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_chemistry_entropy": 0.8451869760751725, | |
| "eval_chemistry_loss": 0.8661983609199524, | |
| "eval_chemistry_mean_token_accuracy": 0.7661204301714897, | |
| "eval_chemistry_num_tokens": 3330597.0, | |
| "eval_chemistry_runtime": 39.7099, | |
| "eval_chemistry_samples_per_second": 12.591, | |
| "eval_chemistry_steps_per_second": 12.591, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_math_entropy": 0.7037465732693672, | |
| "eval_math_loss": 0.8592318296432495, | |
| "eval_math_mean_token_accuracy": 0.7752037541866302, | |
| "eval_math_num_tokens": 3330597.0, | |
| "eval_math_runtime": 40.5913, | |
| "eval_math_samples_per_second": 12.318, | |
| "eval_math_steps_per_second": 12.318, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_physics_entropy": 0.7166930376887322, | |
| "eval_physics_loss": 0.7220315933227539, | |
| "eval_physics_mean_token_accuracy": 0.797553953409195, | |
| "eval_physics_num_tokens": 3330597.0, | |
| "eval_physics_runtime": 45.1326, | |
| "eval_physics_samples_per_second": 11.078, | |
| "eval_physics_steps_per_second": 11.078, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.7665172951295972, | |
| "epoch": 0.164, | |
| "grad_norm": 76.0, | |
| "learning_rate": 8.18e-06, | |
| "loss": 12.0954, | |
| "mean_token_accuracy": 0.7892688918858767, | |
| "num_tokens": 3414445.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 0.7116625974886119, | |
| "epoch": 0.168, | |
| "grad_norm": 82.0, | |
| "learning_rate": 8.380000000000001e-06, | |
| "loss": 11.522, | |
| "mean_token_accuracy": 0.7954209111630917, | |
| "num_tokens": 3496104.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.7325116093270481, | |
| "epoch": 0.172, | |
| "grad_norm": 63.75, | |
| "learning_rate": 8.580000000000001e-06, | |
| "loss": 11.8769, | |
| "mean_token_accuracy": 0.7929634388536215, | |
| "num_tokens": 3576979.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 0.713999939430505, | |
| "epoch": 0.176, | |
| "grad_norm": 87.5, | |
| "learning_rate": 8.78e-06, | |
| "loss": 11.4054, | |
| "mean_token_accuracy": 0.7975817665457725, | |
| "num_tokens": 3658264.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.7327057829126715, | |
| "epoch": 0.18, | |
| "grad_norm": 80.5, | |
| "learning_rate": 8.98e-06, | |
| "loss": 11.7006, | |
| "mean_token_accuracy": 0.7944537442177534, | |
| "num_tokens": 3739142.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.7196770435199141, | |
| "epoch": 0.184, | |
| "grad_norm": 72.5, | |
| "learning_rate": 9.180000000000002e-06, | |
| "loss": 11.5975, | |
| "mean_token_accuracy": 0.7943054404109716, | |
| "num_tokens": 3821352.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 0.6925434624776244, | |
| "epoch": 0.188, | |
| "grad_norm": 65.0, | |
| "learning_rate": 9.38e-06, | |
| "loss": 11.1089, | |
| "mean_token_accuracy": 0.8022922430187464, | |
| "num_tokens": 3905291.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 0.6977481247857213, | |
| "epoch": 0.192, | |
| "grad_norm": 59.5, | |
| "learning_rate": 9.58e-06, | |
| "loss": 11.1687, | |
| "mean_token_accuracy": 0.803613081201911, | |
| "num_tokens": 3995568.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.7005923493765295, | |
| "epoch": 0.196, | |
| "grad_norm": 73.5, | |
| "learning_rate": 9.780000000000001e-06, | |
| "loss": 11.3477, | |
| "mean_token_accuracy": 0.8010011687874794, | |
| "num_tokens": 4075670.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 0.6876647426746786, | |
| "epoch": 0.2, | |
| "grad_norm": 70.0, | |
| "learning_rate": 9.980000000000001e-06, | |
| "loss": 10.9642, | |
| "mean_token_accuracy": 0.8049134809523821, | |
| "num_tokens": 4160918.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "eval_biology_entropy": 1.2019921021461486, | |
| "eval_biology_loss": 1.389614462852478, | |
| "eval_biology_mean_token_accuracy": 0.6654457822442055, | |
| "eval_biology_num_tokens": 4160918.0, | |
| "eval_biology_runtime": 35.0884, | |
| "eval_biology_samples_per_second": 14.25, | |
| "eval_biology_steps_per_second": 14.25, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "eval_chemistry_entropy": 0.8132011856138707, | |
| "eval_chemistry_loss": 0.8497000932693481, | |
| "eval_chemistry_mean_token_accuracy": 0.7708191834092141, | |
| "eval_chemistry_num_tokens": 4160918.0, | |
| "eval_chemistry_runtime": 39.8091, | |
| "eval_chemistry_samples_per_second": 12.56, | |
| "eval_chemistry_steps_per_second": 12.56, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "eval_math_entropy": 0.6891986537277699, | |
| "eval_math_loss": 0.8553754687309265, | |
| "eval_math_mean_token_accuracy": 0.7759535777568817, | |
| "eval_math_num_tokens": 4160918.0, | |
| "eval_math_runtime": 40.7076, | |
| "eval_math_samples_per_second": 12.283, | |
| "eval_math_steps_per_second": 12.283, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "eval_physics_entropy": 0.6950600004792213, | |
| "eval_physics_loss": 0.7005596160888672, | |
| "eval_physics_mean_token_accuracy": 0.8021351820230485, | |
| "eval_physics_num_tokens": 4160918.0, | |
| "eval_physics_runtime": 45.2305, | |
| "eval_physics_samples_per_second": 11.054, | |
| "eval_physics_steps_per_second": 11.054, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.6887986609712243, | |
| "epoch": 0.204, | |
| "grad_norm": 61.75, | |
| "learning_rate": 1.018e-05, | |
| "loss": 11.1871, | |
| "mean_token_accuracy": 0.8051418982446193, | |
| "num_tokens": 4238810.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 0.6818487482145429, | |
| "epoch": 0.208, | |
| "grad_norm": 64.0, | |
| "learning_rate": 1.038e-05, | |
| "loss": 10.9951, | |
| "mean_token_accuracy": 0.8058457836508751, | |
| "num_tokens": 4321436.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 0.7008362987078727, | |
| "epoch": 0.212, | |
| "grad_norm": 67.5, | |
| "learning_rate": 1.0580000000000002e-05, | |
| "loss": 11.0955, | |
| "mean_token_accuracy": 0.8010448336601257, | |
| "num_tokens": 4401710.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 0.7055313820019364, | |
| "epoch": 0.216, | |
| "grad_norm": 62.25, | |
| "learning_rate": 1.0780000000000002e-05, | |
| "loss": 11.4932, | |
| "mean_token_accuracy": 0.79723966345191, | |
| "num_tokens": 4482225.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.6803635071963072, | |
| "epoch": 0.22, | |
| "grad_norm": 68.5, | |
| "learning_rate": 1.0980000000000002e-05, | |
| "loss": 11.0034, | |
| "mean_token_accuracy": 0.804273284226656, | |
| "num_tokens": 4564662.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.667565680295229, | |
| "epoch": 0.224, | |
| "grad_norm": 52.0, | |
| "learning_rate": 1.1180000000000001e-05, | |
| "loss": 10.6657, | |
| "mean_token_accuracy": 0.8107417456805706, | |
| "num_tokens": 4648491.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 0.6490520000457763, | |
| "epoch": 0.228, | |
| "grad_norm": 56.5, | |
| "learning_rate": 1.138e-05, | |
| "loss": 10.5546, | |
| "mean_token_accuracy": 0.8112937267869711, | |
| "num_tokens": 4731596.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 0.7194359874352813, | |
| "epoch": 0.232, | |
| "grad_norm": 58.25, | |
| "learning_rate": 1.1580000000000001e-05, | |
| "loss": 11.4255, | |
| "mean_token_accuracy": 0.7990602564066649, | |
| "num_tokens": 4813449.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.6862584982067347, | |
| "epoch": 0.236, | |
| "grad_norm": 50.25, | |
| "learning_rate": 1.178e-05, | |
| "loss": 11.0007, | |
| "mean_token_accuracy": 0.8042842660099268, | |
| "num_tokens": 4899144.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 0.6895454197190702, | |
| "epoch": 0.24, | |
| "grad_norm": 58.5, | |
| "learning_rate": 1.198e-05, | |
| "loss": 11.1171, | |
| "mean_token_accuracy": 0.8028610210865736, | |
| "num_tokens": 4986175.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "eval_biology_entropy": 1.2410894528031349, | |
| "eval_biology_loss": 1.3766074180603027, | |
| "eval_biology_mean_token_accuracy": 0.6672532483935356, | |
| "eval_biology_num_tokens": 4986175.0, | |
| "eval_biology_runtime": 35.1026, | |
| "eval_biology_samples_per_second": 14.244, | |
| "eval_biology_steps_per_second": 14.244, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "eval_chemistry_entropy": 0.8104133826196194, | |
| "eval_chemistry_loss": 0.8397422432899475, | |
| "eval_chemistry_mean_token_accuracy": 0.7722517918944359, | |
| "eval_chemistry_num_tokens": 4986175.0, | |
| "eval_chemistry_runtime": 39.6397, | |
| "eval_chemistry_samples_per_second": 12.614, | |
| "eval_chemistry_steps_per_second": 12.614, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "eval_math_entropy": 0.6969810444712639, | |
| "eval_math_loss": 0.8505039215087891, | |
| "eval_math_mean_token_accuracy": 0.7772268445491791, | |
| "eval_math_num_tokens": 4986175.0, | |
| "eval_math_runtime": 40.4861, | |
| "eval_math_samples_per_second": 12.35, | |
| "eval_math_steps_per_second": 12.35, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "eval_physics_entropy": 0.6853992799520493, | |
| "eval_physics_loss": 0.6912311911582947, | |
| "eval_physics_mean_token_accuracy": 0.8048652278184891, | |
| "eval_physics_num_tokens": 4986175.0, | |
| "eval_physics_runtime": 45.4233, | |
| "eval_physics_samples_per_second": 11.008, | |
| "eval_physics_steps_per_second": 11.008, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.6969290849752724, | |
| "epoch": 0.244, | |
| "grad_norm": 65.0, | |
| "learning_rate": 1.218e-05, | |
| "loss": 11.151, | |
| "mean_token_accuracy": 0.8016246553510428, | |
| "num_tokens": 5071994.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 0.6583109201863409, | |
| "epoch": 0.248, | |
| "grad_norm": 57.5, | |
| "learning_rate": 1.2380000000000002e-05, | |
| "loss": 10.5979, | |
| "mean_token_accuracy": 0.8095750134438277, | |
| "num_tokens": 5164741.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 0.7111584974452854, | |
| "epoch": 0.252, | |
| "grad_norm": 81.5, | |
| "learning_rate": 1.2580000000000002e-05, | |
| "loss": 11.3705, | |
| "mean_token_accuracy": 0.7996403712779283, | |
| "num_tokens": 5245878.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 0.6798671390861273, | |
| "epoch": 0.256, | |
| "grad_norm": 65.5, | |
| "learning_rate": 1.2780000000000001e-05, | |
| "loss": 11.0375, | |
| "mean_token_accuracy": 0.802841005846858, | |
| "num_tokens": 5329320.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 0.6868512833490967, | |
| "epoch": 0.26, | |
| "grad_norm": 55.0, | |
| "learning_rate": 1.2980000000000001e-05, | |
| "loss": 10.9356, | |
| "mean_token_accuracy": 0.8048643987625838, | |
| "num_tokens": 5412557.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.660436293296516, | |
| "epoch": 0.264, | |
| "grad_norm": 65.5, | |
| "learning_rate": 1.3180000000000001e-05, | |
| "loss": 10.8415, | |
| "mean_token_accuracy": 0.8079600531607867, | |
| "num_tokens": 5497259.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 0.7050647180527448, | |
| "epoch": 0.268, | |
| "grad_norm": 60.5, | |
| "learning_rate": 1.3380000000000002e-05, | |
| "loss": 11.106, | |
| "mean_token_accuracy": 0.8046324852854013, | |
| "num_tokens": 5576721.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 0.7028336159884929, | |
| "epoch": 0.272, | |
| "grad_norm": 61.25, | |
| "learning_rate": 1.3580000000000002e-05, | |
| "loss": 11.3741, | |
| "mean_token_accuracy": 0.7989816222339868, | |
| "num_tokens": 5658796.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 0.713030344620347, | |
| "epoch": 0.276, | |
| "grad_norm": 62.0, | |
| "learning_rate": 1.378e-05, | |
| "loss": 11.4372, | |
| "mean_token_accuracy": 0.797571299597621, | |
| "num_tokens": 5737499.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 0.6879976620897651, | |
| "epoch": 0.28, | |
| "grad_norm": 78.5, | |
| "learning_rate": 1.398e-05, | |
| "loss": 11.1153, | |
| "mean_token_accuracy": 0.8056350216269493, | |
| "num_tokens": 5819027.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "eval_biology_entropy": 1.2951917058229447, | |
| "eval_biology_loss": 1.374377727508545, | |
| "eval_biology_mean_token_accuracy": 0.6666482761502266, | |
| "eval_biology_num_tokens": 5819027.0, | |
| "eval_biology_runtime": 35.3803, | |
| "eval_biology_samples_per_second": 14.132, | |
| "eval_biology_steps_per_second": 14.132, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "eval_chemistry_entropy": 0.8508687050342559, | |
| "eval_chemistry_loss": 0.8420527577400208, | |
| "eval_chemistry_mean_token_accuracy": 0.7695601610541344, | |
| "eval_chemistry_num_tokens": 5819027.0, | |
| "eval_chemistry_runtime": 39.6948, | |
| "eval_chemistry_samples_per_second": 12.596, | |
| "eval_chemistry_steps_per_second": 12.596, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "eval_math_entropy": 0.7296031129956245, | |
| "eval_math_loss": 0.8581300377845764, | |
| "eval_math_mean_token_accuracy": 0.7747315044403076, | |
| "eval_math_num_tokens": 5819027.0, | |
| "eval_math_runtime": 40.6002, | |
| "eval_math_samples_per_second": 12.315, | |
| "eval_math_steps_per_second": 12.315, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "eval_physics_entropy": 0.7137228677272797, | |
| "eval_physics_loss": 0.6899552941322327, | |
| "eval_physics_mean_token_accuracy": 0.804570896744728, | |
| "eval_physics_num_tokens": 5819027.0, | |
| "eval_physics_runtime": 45.1753, | |
| "eval_physics_samples_per_second": 11.068, | |
| "eval_physics_steps_per_second": 11.068, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.7131443534046411, | |
| "epoch": 0.284, | |
| "grad_norm": 76.5, | |
| "learning_rate": 1.418e-05, | |
| "loss": 11.5042, | |
| "mean_token_accuracy": 0.7973460745066404, | |
| "num_tokens": 5898693.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 0.6842026851139963, | |
| "epoch": 0.288, | |
| "grad_norm": 58.5, | |
| "learning_rate": 1.4380000000000001e-05, | |
| "loss": 10.9569, | |
| "mean_token_accuracy": 0.8062832679599523, | |
| "num_tokens": 5980201.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 0.6580634312704206, | |
| "epoch": 0.292, | |
| "grad_norm": 58.0, | |
| "learning_rate": 1.4580000000000001e-05, | |
| "loss": 10.6812, | |
| "mean_token_accuracy": 0.8109602797776461, | |
| "num_tokens": 6059507.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 0.6885316792875529, | |
| "epoch": 0.296, | |
| "grad_norm": 59.0, | |
| "learning_rate": 1.478e-05, | |
| "loss": 11.1324, | |
| "mean_token_accuracy": 0.8033004485070705, | |
| "num_tokens": 6139830.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 0.6904184087179601, | |
| "epoch": 0.3, | |
| "grad_norm": 54.0, | |
| "learning_rate": 1.498e-05, | |
| "loss": 10.9676, | |
| "mean_token_accuracy": 0.8058249101042747, | |
| "num_tokens": 6218134.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.693754238076508, | |
| "epoch": 0.304, | |
| "grad_norm": 59.5, | |
| "learning_rate": 1.5180000000000002e-05, | |
| "loss": 11.1453, | |
| "mean_token_accuracy": 0.804497017711401, | |
| "num_tokens": 6298858.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 0.7230889040045441, | |
| "epoch": 0.308, | |
| "grad_norm": 66.5, | |
| "learning_rate": 1.5380000000000002e-05, | |
| "loss": 11.775, | |
| "mean_token_accuracy": 0.7913905128836631, | |
| "num_tokens": 6375761.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 0.6630848026834428, | |
| "epoch": 0.312, | |
| "grad_norm": 50.0, | |
| "learning_rate": 1.5580000000000003e-05, | |
| "loss": 10.6675, | |
| "mean_token_accuracy": 0.8106919646263122, | |
| "num_tokens": 6456502.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 0.6884183536283672, | |
| "epoch": 0.316, | |
| "grad_norm": 65.5, | |
| "learning_rate": 1.578e-05, | |
| "loss": 11.2036, | |
| "mean_token_accuracy": 0.8040056221187115, | |
| "num_tokens": 6537521.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 0.7094144577160477, | |
| "epoch": 0.32, | |
| "grad_norm": 51.75, | |
| "learning_rate": 1.5980000000000003e-05, | |
| "loss": 11.3893, | |
| "mean_token_accuracy": 0.7988939873874188, | |
| "num_tokens": 6622798.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_biology_entropy": 1.272662544965744, | |
| "eval_biology_loss": 1.3795855045318604, | |
| "eval_biology_mean_token_accuracy": 0.6654499787688255, | |
| "eval_biology_num_tokens": 6622798.0, | |
| "eval_biology_runtime": 35.1835, | |
| "eval_biology_samples_per_second": 14.211, | |
| "eval_biology_steps_per_second": 14.211, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_chemistry_entropy": 0.8212787560224533, | |
| "eval_chemistry_loss": 0.8387787938117981, | |
| "eval_chemistry_mean_token_accuracy": 0.7721875888109208, | |
| "eval_chemistry_num_tokens": 6622798.0, | |
| "eval_chemistry_runtime": 39.5827, | |
| "eval_chemistry_samples_per_second": 12.632, | |
| "eval_chemistry_steps_per_second": 12.632, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_math_entropy": 0.6975619449019432, | |
| "eval_math_loss": 0.8566721081733704, | |
| "eval_math_mean_token_accuracy": 0.7762556113004685, | |
| "eval_math_num_tokens": 6622798.0, | |
| "eval_math_runtime": 40.4554, | |
| "eval_math_samples_per_second": 12.359, | |
| "eval_math_steps_per_second": 12.359, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_physics_entropy": 0.6843295809030533, | |
| "eval_physics_loss": 0.6862691640853882, | |
| "eval_physics_mean_token_accuracy": 0.8047665995955467, | |
| "eval_physics_num_tokens": 6622798.0, | |
| "eval_physics_runtime": 45.1382, | |
| "eval_physics_samples_per_second": 11.077, | |
| "eval_physics_steps_per_second": 11.077, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.6872260123491287, | |
| "epoch": 0.324, | |
| "grad_norm": 61.75, | |
| "learning_rate": 1.618e-05, | |
| "loss": 11.1453, | |
| "mean_token_accuracy": 0.8037573281675577, | |
| "num_tokens": 6699373.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 0.7004622181877493, | |
| "epoch": 0.328, | |
| "grad_norm": 53.0, | |
| "learning_rate": 1.638e-05, | |
| "loss": 11.2569, | |
| "mean_token_accuracy": 0.8005565378814936, | |
| "num_tokens": 6782985.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 0.683347325026989, | |
| "epoch": 0.332, | |
| "grad_norm": 67.0, | |
| "learning_rate": 1.658e-05, | |
| "loss": 10.9209, | |
| "mean_token_accuracy": 0.8052662838250398, | |
| "num_tokens": 6868578.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 0.6550024008378387, | |
| "epoch": 0.336, | |
| "grad_norm": 49.0, | |
| "learning_rate": 1.6780000000000002e-05, | |
| "loss": 10.4463, | |
| "mean_token_accuracy": 0.8156119327992201, | |
| "num_tokens": 6953611.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 0.6940796528942883, | |
| "epoch": 0.34, | |
| "grad_norm": 60.75, | |
| "learning_rate": 1.698e-05, | |
| "loss": 11.4328, | |
| "mean_token_accuracy": 0.7994465328752994, | |
| "num_tokens": 7037831.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 0.6509441762231291, | |
| "epoch": 0.344, | |
| "grad_norm": 68.0, | |
| "learning_rate": 1.718e-05, | |
| "loss": 10.5903, | |
| "mean_token_accuracy": 0.8100398641079665, | |
| "num_tokens": 7121949.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 0.68049411540851, | |
| "epoch": 0.348, | |
| "grad_norm": 50.75, | |
| "learning_rate": 1.7380000000000003e-05, | |
| "loss": 10.7995, | |
| "mean_token_accuracy": 0.8092381667345763, | |
| "num_tokens": 7206056.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 0.6681747000664473, | |
| "epoch": 0.352, | |
| "grad_norm": 59.0, | |
| "learning_rate": 1.758e-05, | |
| "loss": 10.7902, | |
| "mean_token_accuracy": 0.8085832469165325, | |
| "num_tokens": 7290378.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 0.6910556216724217, | |
| "epoch": 0.356, | |
| "grad_norm": 46.5, | |
| "learning_rate": 1.7780000000000003e-05, | |
| "loss": 11.072, | |
| "mean_token_accuracy": 0.8025941159576178, | |
| "num_tokens": 7367952.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 0.6846336699090898, | |
| "epoch": 0.36, | |
| "grad_norm": 69.0, | |
| "learning_rate": 1.798e-05, | |
| "loss": 10.9941, | |
| "mean_token_accuracy": 0.8069840248674154, | |
| "num_tokens": 7452920.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "eval_biology_entropy": 1.3079615205526351, | |
| "eval_biology_loss": 1.3845115900039673, | |
| "eval_biology_mean_token_accuracy": 0.6659645406007767, | |
| "eval_biology_num_tokens": 7452920.0, | |
| "eval_biology_runtime": 35.2058, | |
| "eval_biology_samples_per_second": 14.202, | |
| "eval_biology_steps_per_second": 14.202, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "eval_chemistry_entropy": 0.855868073105812, | |
| "eval_chemistry_loss": 0.8524179458618164, | |
| "eval_chemistry_mean_token_accuracy": 0.7686776904463768, | |
| "eval_chemistry_num_tokens": 7452920.0, | |
| "eval_chemistry_runtime": 39.6971, | |
| "eval_chemistry_samples_per_second": 12.595, | |
| "eval_chemistry_steps_per_second": 12.595, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "eval_math_entropy": 0.7238874210417271, | |
| "eval_math_loss": 0.8637869954109192, | |
| "eval_math_mean_token_accuracy": 0.774171804189682, | |
| "eval_math_num_tokens": 7452920.0, | |
| "eval_math_runtime": 40.502, | |
| "eval_math_samples_per_second": 12.345, | |
| "eval_math_steps_per_second": 12.345, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "eval_physics_entropy": 0.703471056163311, | |
| "eval_physics_loss": 0.6876726150512695, | |
| "eval_physics_mean_token_accuracy": 0.8049070924520493, | |
| "eval_physics_num_tokens": 7452920.0, | |
| "eval_physics_runtime": 45.1347, | |
| "eval_physics_samples_per_second": 11.078, | |
| "eval_physics_steps_per_second": 11.078, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 0.6624960891902447, | |
| "epoch": 0.364, | |
| "grad_norm": 50.25, | |
| "learning_rate": 1.8180000000000002e-05, | |
| "loss": 10.6947, | |
| "mean_token_accuracy": 0.8089043792337179, | |
| "num_tokens": 7535961.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 0.6663541577756404, | |
| "epoch": 0.368, | |
| "grad_norm": 51.0, | |
| "learning_rate": 1.8380000000000004e-05, | |
| "loss": 10.5684, | |
| "mean_token_accuracy": 0.810727647319436, | |
| "num_tokens": 7621082.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 0.6552547904662788, | |
| "epoch": 0.372, | |
| "grad_norm": 51.5, | |
| "learning_rate": 1.858e-05, | |
| "loss": 10.6444, | |
| "mean_token_accuracy": 0.8090339493006468, | |
| "num_tokens": 7705651.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 0.7489686900749802, | |
| "epoch": 0.376, | |
| "grad_norm": 51.5, | |
| "learning_rate": 1.878e-05, | |
| "loss": 12.0319, | |
| "mean_token_accuracy": 0.7858695086091757, | |
| "num_tokens": 7787036.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 0.6922270046547055, | |
| "epoch": 0.38, | |
| "grad_norm": 63.0, | |
| "learning_rate": 1.898e-05, | |
| "loss": 11.1499, | |
| "mean_token_accuracy": 0.7991227209568024, | |
| "num_tokens": 7869437.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 0.6693172385916114, | |
| "epoch": 0.384, | |
| "grad_norm": 45.75, | |
| "learning_rate": 1.918e-05, | |
| "loss": 10.7571, | |
| "mean_token_accuracy": 0.8078656267374754, | |
| "num_tokens": 7955570.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 0.6446146180853247, | |
| "epoch": 0.388, | |
| "grad_norm": 43.5, | |
| "learning_rate": 1.938e-05, | |
| "loss": 10.4611, | |
| "mean_token_accuracy": 0.8115016505122185, | |
| "num_tokens": 8038818.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 0.6791700626723468, | |
| "epoch": 0.392, | |
| "grad_norm": 45.0, | |
| "learning_rate": 1.9580000000000002e-05, | |
| "loss": 10.8502, | |
| "mean_token_accuracy": 0.8048314619809389, | |
| "num_tokens": 8123527.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 0.684117188770324, | |
| "epoch": 0.396, | |
| "grad_norm": 48.75, | |
| "learning_rate": 1.978e-05, | |
| "loss": 10.9286, | |
| "mean_token_accuracy": 0.8045736275613308, | |
| "num_tokens": 8206501.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 0.6638183539733291, | |
| "epoch": 0.4, | |
| "grad_norm": 45.25, | |
| "learning_rate": 1.9980000000000002e-05, | |
| "loss": 10.7878, | |
| "mean_token_accuracy": 0.8090838421136141, | |
| "num_tokens": 8291049.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_biology_entropy": 1.3014859555363656, | |
| "eval_biology_loss": 1.4013315439224243, | |
| "eval_biology_mean_token_accuracy": 0.6621385813355446, | |
| "eval_biology_num_tokens": 8291049.0, | |
| "eval_biology_runtime": 35.3423, | |
| "eval_biology_samples_per_second": 14.147, | |
| "eval_biology_steps_per_second": 14.147, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_chemistry_entropy": 0.8515431408882141, | |
| "eval_chemistry_loss": 0.8530916571617126, | |
| "eval_chemistry_mean_token_accuracy": 0.7688064026236534, | |
| "eval_chemistry_num_tokens": 8291049.0, | |
| "eval_chemistry_runtime": 39.7131, | |
| "eval_chemistry_samples_per_second": 12.59, | |
| "eval_chemistry_steps_per_second": 12.59, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_math_entropy": 0.7349339348077774, | |
| "eval_math_loss": 0.8596640825271606, | |
| "eval_math_mean_token_accuracy": 0.7750666145086288, | |
| "eval_math_num_tokens": 8291049.0, | |
| "eval_math_runtime": 40.5711, | |
| "eval_math_samples_per_second": 12.324, | |
| "eval_math_steps_per_second": 12.324, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_physics_entropy": 0.698657899916172, | |
| "eval_physics_loss": 0.690846860408783, | |
| "eval_physics_mean_token_accuracy": 0.8048570597171784, | |
| "eval_physics_num_tokens": 8291049.0, | |
| "eval_physics_runtime": 45.18, | |
| "eval_physics_samples_per_second": 11.067, | |
| "eval_physics_steps_per_second": 11.067, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 0.6724585631862283, | |
| "epoch": 0.404, | |
| "grad_norm": 47.0, | |
| "learning_rate": 1.9980000000000002e-05, | |
| "loss": 10.7513, | |
| "mean_token_accuracy": 0.8090190943330526, | |
| "num_tokens": 8373687.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "entropy": 0.6880609506741167, | |
| "epoch": 0.408, | |
| "grad_norm": 49.0, | |
| "learning_rate": 1.995777777777778e-05, | |
| "loss": 11.0507, | |
| "mean_token_accuracy": 0.8037724006921053, | |
| "num_tokens": 8459795.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "entropy": 0.6609848150983453, | |
| "epoch": 0.412, | |
| "grad_norm": 50.75, | |
| "learning_rate": 1.9935555555555557e-05, | |
| "loss": 10.803, | |
| "mean_token_accuracy": 0.8078120674937963, | |
| "num_tokens": 8544948.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "entropy": 0.6650119775906205, | |
| "epoch": 0.416, | |
| "grad_norm": 49.25, | |
| "learning_rate": 1.9913333333333335e-05, | |
| "loss": 10.794, | |
| "mean_token_accuracy": 0.8062687937170268, | |
| "num_tokens": 8627310.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "entropy": 0.7647646913304925, | |
| "epoch": 0.42, | |
| "grad_norm": 50.25, | |
| "learning_rate": 1.9891111111111112e-05, | |
| "loss": 12.2348, | |
| "mean_token_accuracy": 0.7883469216525555, | |
| "num_tokens": 8707798.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 0.6614232028834521, | |
| "epoch": 0.424, | |
| "grad_norm": 46.0, | |
| "learning_rate": 1.986888888888889e-05, | |
| "loss": 10.6404, | |
| "mean_token_accuracy": 0.8107621461153031, | |
| "num_tokens": 8792574.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "entropy": 0.6635355862788856, | |
| "epoch": 0.428, | |
| "grad_norm": 54.25, | |
| "learning_rate": 1.9846666666666668e-05, | |
| "loss": 10.7408, | |
| "mean_token_accuracy": 0.8067193511873484, | |
| "num_tokens": 8878637.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "entropy": 0.6831195020116866, | |
| "epoch": 0.432, | |
| "grad_norm": 56.75, | |
| "learning_rate": 1.9824444444444445e-05, | |
| "loss": 10.9817, | |
| "mean_token_accuracy": 0.8036571219563484, | |
| "num_tokens": 8958371.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "entropy": 0.6557174487970769, | |
| "epoch": 0.436, | |
| "grad_norm": 56.25, | |
| "learning_rate": 1.9802222222222226e-05, | |
| "loss": 10.5087, | |
| "mean_token_accuracy": 0.8101301193237305, | |
| "num_tokens": 9040649.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "entropy": 0.6296142281033099, | |
| "epoch": 0.44, | |
| "grad_norm": 46.25, | |
| "learning_rate": 1.978e-05, | |
| "loss": 10.1617, | |
| "mean_token_accuracy": 0.8164261173456907, | |
| "num_tokens": 9125564.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "eval_biology_entropy": 1.2857338542938233, | |
| "eval_biology_loss": 1.4026042222976685, | |
| "eval_biology_mean_token_accuracy": 0.6611523522734642, | |
| "eval_biology_num_tokens": 9125564.0, | |
| "eval_biology_runtime": 35.3074, | |
| "eval_biology_samples_per_second": 14.161, | |
| "eval_biology_steps_per_second": 14.161, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "eval_chemistry_entropy": 0.8301562021374702, | |
| "eval_chemistry_loss": 0.8592454195022583, | |
| "eval_chemistry_mean_token_accuracy": 0.7674569791555405, | |
| "eval_chemistry_num_tokens": 9125564.0, | |
| "eval_chemistry_runtime": 39.7025, | |
| "eval_chemistry_samples_per_second": 12.594, | |
| "eval_chemistry_steps_per_second": 12.594, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "eval_math_entropy": 0.704224185705185, | |
| "eval_math_loss": 0.8685066103935242, | |
| "eval_math_mean_token_accuracy": 0.7726707045435905, | |
| "eval_math_num_tokens": 9125564.0, | |
| "eval_math_runtime": 40.5031, | |
| "eval_math_samples_per_second": 12.345, | |
| "eval_math_steps_per_second": 12.345, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "eval_physics_entropy": 0.676539529800415, | |
| "eval_physics_loss": 0.6832760572433472, | |
| "eval_physics_mean_token_accuracy": 0.8052293303608894, | |
| "eval_physics_num_tokens": 9125564.0, | |
| "eval_physics_runtime": 45.1101, | |
| "eval_physics_samples_per_second": 11.084, | |
| "eval_physics_steps_per_second": 11.084, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 0.7054770017042756, | |
| "epoch": 0.444, | |
| "grad_norm": 47.75, | |
| "learning_rate": 1.975777777777778e-05, | |
| "loss": 11.3801, | |
| "mean_token_accuracy": 0.8028026439249516, | |
| "num_tokens": 9210443.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "entropy": 0.6552120961248875, | |
| "epoch": 0.448, | |
| "grad_norm": 42.0, | |
| "learning_rate": 1.9735555555555556e-05, | |
| "loss": 10.7148, | |
| "mean_token_accuracy": 0.8113164242357016, | |
| "num_tokens": 9294388.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "entropy": 0.6474918988533318, | |
| "epoch": 0.452, | |
| "grad_norm": 49.0, | |
| "learning_rate": 1.9713333333333337e-05, | |
| "loss": 10.1143, | |
| "mean_token_accuracy": 0.8196653757244349, | |
| "num_tokens": 9371203.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "entropy": 0.6594600110314787, | |
| "epoch": 0.456, | |
| "grad_norm": 43.5, | |
| "learning_rate": 1.969111111111111e-05, | |
| "loss": 10.7859, | |
| "mean_token_accuracy": 0.806324940547347, | |
| "num_tokens": 9455428.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "entropy": 0.6459505766630173, | |
| "epoch": 0.46, | |
| "grad_norm": 41.25, | |
| "learning_rate": 1.9668888888888892e-05, | |
| "loss": 10.2758, | |
| "mean_token_accuracy": 0.8145925845950842, | |
| "num_tokens": 9539605.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 0.6633252296596766, | |
| "epoch": 0.464, | |
| "grad_norm": 46.75, | |
| "learning_rate": 1.9646666666666666e-05, | |
| "loss": 10.746, | |
| "mean_token_accuracy": 0.8067185960710048, | |
| "num_tokens": 9620366.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "entropy": 0.6536956983618438, | |
| "epoch": 0.468, | |
| "grad_norm": 54.75, | |
| "learning_rate": 1.9624444444444447e-05, | |
| "loss": 10.5255, | |
| "mean_token_accuracy": 0.8116643182933331, | |
| "num_tokens": 9706621.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "entropy": 0.6528704442083836, | |
| "epoch": 0.472, | |
| "grad_norm": 42.5, | |
| "learning_rate": 1.9602222222222225e-05, | |
| "loss": 10.4167, | |
| "mean_token_accuracy": 0.8110667478293181, | |
| "num_tokens": 9788925.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "entropy": 0.6303915046155453, | |
| "epoch": 0.476, | |
| "grad_norm": 40.75, | |
| "learning_rate": 1.9580000000000002e-05, | |
| "loss": 10.2211, | |
| "mean_token_accuracy": 0.8153577871620655, | |
| "num_tokens": 9869908.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "entropy": 0.7140287417918444, | |
| "epoch": 0.48, | |
| "grad_norm": 50.5, | |
| "learning_rate": 1.955777777777778e-05, | |
| "loss": 11.4393, | |
| "mean_token_accuracy": 0.7977926902472973, | |
| "num_tokens": 9955431.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_biology_entropy": 1.2901287813782691, | |
| "eval_biology_loss": 1.4074944257736206, | |
| "eval_biology_mean_token_accuracy": 0.6630856924653054, | |
| "eval_biology_num_tokens": 9955431.0, | |
| "eval_biology_runtime": 35.1485, | |
| "eval_biology_samples_per_second": 14.225, | |
| "eval_biology_steps_per_second": 14.225, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_chemistry_entropy": 0.8402542336285115, | |
| "eval_chemistry_loss": 0.8475250005722046, | |
| "eval_chemistry_mean_token_accuracy": 0.7706258528828621, | |
| "eval_chemistry_num_tokens": 9955431.0, | |
| "eval_chemistry_runtime": 39.5963, | |
| "eval_chemistry_samples_per_second": 12.627, | |
| "eval_chemistry_steps_per_second": 12.627, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_math_entropy": 0.7095566186904907, | |
| "eval_math_loss": 0.8719534873962402, | |
| "eval_math_mean_token_accuracy": 0.7730017136335373, | |
| "eval_math_num_tokens": 9955431.0, | |
| "eval_math_runtime": 40.4499, | |
| "eval_math_samples_per_second": 12.361, | |
| "eval_math_steps_per_second": 12.361, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_physics_entropy": 0.6804338894784451, | |
| "eval_physics_loss": 0.6772474646568298, | |
| "eval_physics_mean_token_accuracy": 0.8075065380334854, | |
| "eval_physics_num_tokens": 9955431.0, | |
| "eval_physics_runtime": 45.0788, | |
| "eval_physics_samples_per_second": 11.092, | |
| "eval_physics_steps_per_second": 11.092, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 0.6472635091282427, | |
| "epoch": 0.484, | |
| "grad_norm": 50.75, | |
| "learning_rate": 1.9535555555555557e-05, | |
| "loss": 10.4858, | |
| "mean_token_accuracy": 0.8127345737069845, | |
| "num_tokens": 10038293.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "entropy": 0.6718290301039815, | |
| "epoch": 0.488, | |
| "grad_norm": 43.25, | |
| "learning_rate": 1.9513333333333335e-05, | |
| "loss": 10.7595, | |
| "mean_token_accuracy": 0.8104257199913263, | |
| "num_tokens": 10119599.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "entropy": 0.7013039434328675, | |
| "epoch": 0.492, | |
| "grad_norm": 49.25, | |
| "learning_rate": 1.9491111111111113e-05, | |
| "loss": 11.2505, | |
| "mean_token_accuracy": 0.8000239260494709, | |
| "num_tokens": 10204514.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "entropy": 0.6948929887264967, | |
| "epoch": 0.496, | |
| "grad_norm": 42.25, | |
| "learning_rate": 1.946888888888889e-05, | |
| "loss": 11.0562, | |
| "mean_token_accuracy": 0.8022828463464975, | |
| "num_tokens": 10289883.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "entropy": 0.6491774883121252, | |
| "epoch": 0.5, | |
| "grad_norm": 48.25, | |
| "learning_rate": 1.9446666666666668e-05, | |
| "loss": 10.5893, | |
| "mean_token_accuracy": 0.8121375739574432, | |
| "num_tokens": 10368762.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 0.6618938606232405, | |
| "epoch": 0.504, | |
| "grad_norm": 43.5, | |
| "learning_rate": 1.9424444444444446e-05, | |
| "loss": 10.5939, | |
| "mean_token_accuracy": 0.8118096552789211, | |
| "num_tokens": 10454683.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "entropy": 0.6799033954739571, | |
| "epoch": 0.508, | |
| "grad_norm": 47.0, | |
| "learning_rate": 1.9402222222222223e-05, | |
| "loss": 11.031, | |
| "mean_token_accuracy": 0.804159976914525, | |
| "num_tokens": 10539515.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "entropy": 0.7304276229813695, | |
| "epoch": 0.512, | |
| "grad_norm": 51.5, | |
| "learning_rate": 1.938e-05, | |
| "loss": 11.6009, | |
| "mean_token_accuracy": 0.7962873011827469, | |
| "num_tokens": 10619468.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "entropy": 0.6799104714766144, | |
| "epoch": 0.516, | |
| "grad_norm": 45.75, | |
| "learning_rate": 1.935777777777778e-05, | |
| "loss": 11.0138, | |
| "mean_token_accuracy": 0.805281289294362, | |
| "num_tokens": 10698439.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "entropy": 0.6859343230724335, | |
| "epoch": 0.52, | |
| "grad_norm": 42.25, | |
| "learning_rate": 1.9335555555555556e-05, | |
| "loss": 11.027, | |
| "mean_token_accuracy": 0.8035755015909671, | |
| "num_tokens": 10781285.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "eval_biology_entropy": 1.2939175048470497, | |
| "eval_biology_loss": 1.4010000228881836, | |
| "eval_biology_mean_token_accuracy": 0.6629679874181748, | |
| "eval_biology_num_tokens": 10781285.0, | |
| "eval_biology_runtime": 35.2822, | |
| "eval_biology_samples_per_second": 14.171, | |
| "eval_biology_steps_per_second": 14.171, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "eval_chemistry_entropy": 0.8264291672706604, | |
| "eval_chemistry_loss": 0.847792387008667, | |
| "eval_chemistry_mean_token_accuracy": 0.7705023067593575, | |
| "eval_chemistry_num_tokens": 10781285.0, | |
| "eval_chemistry_runtime": 39.6216, | |
| "eval_chemistry_samples_per_second": 12.619, | |
| "eval_chemistry_steps_per_second": 12.619, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "eval_math_entropy": 0.7105307767689228, | |
| "eval_math_loss": 0.8610548973083496, | |
| "eval_math_mean_token_accuracy": 0.7741031144857407, | |
| "eval_math_num_tokens": 10781285.0, | |
| "eval_math_runtime": 40.4357, | |
| "eval_math_samples_per_second": 12.365, | |
| "eval_math_steps_per_second": 12.365, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "eval_physics_entropy": 0.6724484626352787, | |
| "eval_physics_loss": 0.6681262254714966, | |
| "eval_physics_mean_token_accuracy": 0.8100028432011604, | |
| "eval_physics_num_tokens": 10781285.0, | |
| "eval_physics_runtime": 45.0631, | |
| "eval_physics_samples_per_second": 11.096, | |
| "eval_physics_steps_per_second": 11.096, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 0.6645524255931378, | |
| "epoch": 0.524, | |
| "grad_norm": 47.0, | |
| "learning_rate": 1.9313333333333334e-05, | |
| "loss": 10.7825, | |
| "mean_token_accuracy": 0.8092559453099966, | |
| "num_tokens": 10864773.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "entropy": 0.6470350125804544, | |
| "epoch": 0.528, | |
| "grad_norm": 41.0, | |
| "learning_rate": 1.9291111111111115e-05, | |
| "loss": 10.3184, | |
| "mean_token_accuracy": 0.815019316598773, | |
| "num_tokens": 10944904.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "entropy": 0.6757032671943307, | |
| "epoch": 0.532, | |
| "grad_norm": 47.75, | |
| "learning_rate": 1.926888888888889e-05, | |
| "loss": 10.8658, | |
| "mean_token_accuracy": 0.8063097450882196, | |
| "num_tokens": 11023958.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "entropy": 0.6243293879553675, | |
| "epoch": 0.536, | |
| "grad_norm": 46.25, | |
| "learning_rate": 1.924666666666667e-05, | |
| "loss": 10.1411, | |
| "mean_token_accuracy": 0.8193823467940092, | |
| "num_tokens": 11106538.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "entropy": 0.6877710536122322, | |
| "epoch": 0.54, | |
| "grad_norm": 54.5, | |
| "learning_rate": 1.9224444444444444e-05, | |
| "loss": 10.9827, | |
| "mean_token_accuracy": 0.8047556940466165, | |
| "num_tokens": 11187142.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 0.7018949103541672, | |
| "epoch": 0.544, | |
| "grad_norm": 55.75, | |
| "learning_rate": 1.9202222222222225e-05, | |
| "loss": 11.3787, | |
| "mean_token_accuracy": 0.7996319092810154, | |
| "num_tokens": 11266409.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "entropy": 0.6630384393967688, | |
| "epoch": 0.548, | |
| "grad_norm": 50.75, | |
| "learning_rate": 1.918e-05, | |
| "loss": 10.6384, | |
| "mean_token_accuracy": 0.8089293787255883, | |
| "num_tokens": 11350094.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "entropy": 0.6659614780917764, | |
| "epoch": 0.552, | |
| "grad_norm": 45.75, | |
| "learning_rate": 1.915777777777778e-05, | |
| "loss": 10.7485, | |
| "mean_token_accuracy": 0.8104357924312353, | |
| "num_tokens": 11438391.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "entropy": 0.6783489585854113, | |
| "epoch": 0.556, | |
| "grad_norm": 54.0, | |
| "learning_rate": 1.9135555555555555e-05, | |
| "loss": 10.738, | |
| "mean_token_accuracy": 0.8090345006436109, | |
| "num_tokens": 11521583.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "entropy": 0.6332698877900839, | |
| "epoch": 0.56, | |
| "grad_norm": 42.0, | |
| "learning_rate": 1.9113333333333336e-05, | |
| "loss": 10.1535, | |
| "mean_token_accuracy": 0.817741920426488, | |
| "num_tokens": 11605544.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "eval_biology_entropy": 1.2966757586598396, | |
| "eval_biology_loss": 1.4016809463500977, | |
| "eval_biology_mean_token_accuracy": 0.6630619642138481, | |
| "eval_biology_num_tokens": 11605544.0, | |
| "eval_biology_runtime": 35.3173, | |
| "eval_biology_samples_per_second": 14.157, | |
| "eval_biology_steps_per_second": 14.157, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "eval_chemistry_entropy": 0.8152842739224434, | |
| "eval_chemistry_loss": 0.8491419553756714, | |
| "eval_chemistry_mean_token_accuracy": 0.7716290903687477, | |
| "eval_chemistry_num_tokens": 11605544.0, | |
| "eval_chemistry_runtime": 39.6109, | |
| "eval_chemistry_samples_per_second": 12.623, | |
| "eval_chemistry_steps_per_second": 12.623, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "eval_math_entropy": 0.7092237346470356, | |
| "eval_math_loss": 0.8697728514671326, | |
| "eval_math_mean_token_accuracy": 0.7739188714027405, | |
| "eval_math_num_tokens": 11605544.0, | |
| "eval_math_runtime": 40.4481, | |
| "eval_math_samples_per_second": 12.362, | |
| "eval_math_steps_per_second": 12.362, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "eval_physics_entropy": 0.6652475365400314, | |
| "eval_physics_loss": 0.6614710688591003, | |
| "eval_physics_mean_token_accuracy": 0.8117705038189889, | |
| "eval_physics_num_tokens": 11605544.0, | |
| "eval_physics_runtime": 45.1827, | |
| "eval_physics_samples_per_second": 11.066, | |
| "eval_physics_steps_per_second": 11.066, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 0.6269684578292072, | |
| "epoch": 0.564, | |
| "grad_norm": 42.5, | |
| "learning_rate": 1.9091111111111113e-05, | |
| "loss": 10.2481, | |
| "mean_token_accuracy": 0.8174567829817534, | |
| "num_tokens": 11686173.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "entropy": 0.6821280302479863, | |
| "epoch": 0.568, | |
| "grad_norm": 38.25, | |
| "learning_rate": 1.906888888888889e-05, | |
| "loss": 10.6848, | |
| "mean_token_accuracy": 0.8115028716623783, | |
| "num_tokens": 11765430.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "entropy": 0.6392442825250327, | |
| "epoch": 0.572, | |
| "grad_norm": 44.0, | |
| "learning_rate": 1.904666666666667e-05, | |
| "loss": 10.5018, | |
| "mean_token_accuracy": 0.8134302724152803, | |
| "num_tokens": 11850973.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "entropy": 0.6551524167880416, | |
| "epoch": 0.576, | |
| "grad_norm": 56.25, | |
| "learning_rate": 1.9024444444444446e-05, | |
| "loss": 10.4141, | |
| "mean_token_accuracy": 0.8127408139407635, | |
| "num_tokens": 11931715.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "entropy": 0.6323688972741366, | |
| "epoch": 0.58, | |
| "grad_norm": 47.75, | |
| "learning_rate": 1.9002222222222224e-05, | |
| "loss": 9.9927, | |
| "mean_token_accuracy": 0.8190583731979132, | |
| "num_tokens": 12012155.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 0.6468079431913794, | |
| "epoch": 0.584, | |
| "grad_norm": 44.25, | |
| "learning_rate": 1.898e-05, | |
| "loss": 10.5496, | |
| "mean_token_accuracy": 0.8117645151913166, | |
| "num_tokens": 12093686.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "entropy": 0.6574975445866584, | |
| "epoch": 0.588, | |
| "grad_norm": 46.5, | |
| "learning_rate": 1.895777777777778e-05, | |
| "loss": 10.4738, | |
| "mean_token_accuracy": 0.81148545704782, | |
| "num_tokens": 12177581.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "entropy": 0.6759683048352599, | |
| "epoch": 0.592, | |
| "grad_norm": 46.25, | |
| "learning_rate": 1.8935555555555556e-05, | |
| "loss": 10.844, | |
| "mean_token_accuracy": 0.8061960492283106, | |
| "num_tokens": 12257243.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "entropy": 0.6535733859054744, | |
| "epoch": 0.596, | |
| "grad_norm": 58.75, | |
| "learning_rate": 1.8913333333333334e-05, | |
| "loss": 10.5604, | |
| "mean_token_accuracy": 0.8106869570910931, | |
| "num_tokens": 12338544.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "entropy": 0.635299691837281, | |
| "epoch": 0.6, | |
| "grad_norm": 45.25, | |
| "learning_rate": 1.8891111111111115e-05, | |
| "loss": 10.1003, | |
| "mean_token_accuracy": 0.8157124288380146, | |
| "num_tokens": 12422008.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "eval_biology_entropy": 1.2644084550142287, | |
| "eval_biology_loss": 1.403598666191101, | |
| "eval_biology_mean_token_accuracy": 0.662936574101448, | |
| "eval_biology_num_tokens": 12422008.0, | |
| "eval_biology_runtime": 35.0607, | |
| "eval_biology_samples_per_second": 14.261, | |
| "eval_biology_steps_per_second": 14.261, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "eval_chemistry_entropy": 0.7927452449202538, | |
| "eval_chemistry_loss": 0.8444129824638367, | |
| "eval_chemistry_mean_token_accuracy": 0.7716013901829719, | |
| "eval_chemistry_num_tokens": 12422008.0, | |
| "eval_chemistry_runtime": 39.6183, | |
| "eval_chemistry_samples_per_second": 12.62, | |
| "eval_chemistry_steps_per_second": 12.62, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "eval_math_entropy": 0.6911805932819843, | |
| "eval_math_loss": 0.8791993260383606, | |
| "eval_math_mean_token_accuracy": 0.7729427699446678, | |
| "eval_math_num_tokens": 12422008.0, | |
| "eval_math_runtime": 40.5514, | |
| "eval_math_samples_per_second": 12.33, | |
| "eval_math_steps_per_second": 12.33, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "eval_physics_entropy": 0.635497625797987, | |
| "eval_physics_loss": 0.6566235423088074, | |
| "eval_physics_mean_token_accuracy": 0.8124136482477188, | |
| "eval_physics_num_tokens": 12422008.0, | |
| "eval_physics_runtime": 45.1105, | |
| "eval_physics_samples_per_second": 11.084, | |
| "eval_physics_steps_per_second": 11.084, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 0.6438517822884023, | |
| "epoch": 0.604, | |
| "grad_norm": 46.25, | |
| "learning_rate": 1.886888888888889e-05, | |
| "loss": 10.4253, | |
| "mean_token_accuracy": 0.8152689874172211, | |
| "num_tokens": 12505134.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "entropy": 0.6165852569974959, | |
| "epoch": 0.608, | |
| "grad_norm": 50.25, | |
| "learning_rate": 1.884666666666667e-05, | |
| "loss": 9.8857, | |
| "mean_token_accuracy": 0.8216613974422217, | |
| "num_tokens": 12583154.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "entropy": 0.6570021582767367, | |
| "epoch": 0.612, | |
| "grad_norm": 42.75, | |
| "learning_rate": 1.8824444444444445e-05, | |
| "loss": 10.4566, | |
| "mean_token_accuracy": 0.8129478100687265, | |
| "num_tokens": 12666601.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "entropy": 0.6499744065105915, | |
| "epoch": 0.616, | |
| "grad_norm": 42.5, | |
| "learning_rate": 1.8802222222222226e-05, | |
| "loss": 10.3697, | |
| "mean_token_accuracy": 0.8139124307781458, | |
| "num_tokens": 12746458.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "entropy": 0.6602890061214566, | |
| "epoch": 0.62, | |
| "grad_norm": 39.0, | |
| "learning_rate": 1.878e-05, | |
| "loss": 10.7086, | |
| "mean_token_accuracy": 0.8104711420834064, | |
| "num_tokens": 12831572.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 0.6643684912472964, | |
| "epoch": 0.624, | |
| "grad_norm": 47.75, | |
| "learning_rate": 1.875777777777778e-05, | |
| "loss": 10.5879, | |
| "mean_token_accuracy": 0.8116141181439162, | |
| "num_tokens": 12905392.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "entropy": 0.6584846514277161, | |
| "epoch": 0.628, | |
| "grad_norm": 43.75, | |
| "learning_rate": 1.873555555555556e-05, | |
| "loss": 10.5432, | |
| "mean_token_accuracy": 0.8107300482690334, | |
| "num_tokens": 12986513.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "entropy": 0.638776770234108, | |
| "epoch": 0.632, | |
| "grad_norm": 39.5, | |
| "learning_rate": 1.8713333333333336e-05, | |
| "loss": 10.2261, | |
| "mean_token_accuracy": 0.814822031930089, | |
| "num_tokens": 13071058.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "entropy": 0.6551781664602458, | |
| "epoch": 0.636, | |
| "grad_norm": 44.0, | |
| "learning_rate": 1.8691111111111114e-05, | |
| "loss": 10.5182, | |
| "mean_token_accuracy": 0.8119142647832632, | |
| "num_tokens": 13151842.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "entropy": 0.7266039110720157, | |
| "epoch": 0.64, | |
| "grad_norm": 46.75, | |
| "learning_rate": 1.866888888888889e-05, | |
| "loss": 11.6713, | |
| "mean_token_accuracy": 0.7968564338982105, | |
| "num_tokens": 13232198.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_biology_entropy": 1.3351151769161225, | |
| "eval_biology_loss": 1.3962006568908691, | |
| "eval_biology_mean_token_accuracy": 0.6624402161240578, | |
| "eval_biology_num_tokens": 13232198.0, | |
| "eval_biology_runtime": 35.2387, | |
| "eval_biology_samples_per_second": 14.189, | |
| "eval_biology_steps_per_second": 14.189, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_chemistry_entropy": 0.8378922098278999, | |
| "eval_chemistry_loss": 0.8399096727371216, | |
| "eval_chemistry_mean_token_accuracy": 0.771498102247715, | |
| "eval_chemistry_num_tokens": 13232198.0, | |
| "eval_chemistry_runtime": 39.6463, | |
| "eval_chemistry_samples_per_second": 12.612, | |
| "eval_chemistry_steps_per_second": 12.612, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_math_entropy": 0.7215032837092876, | |
| "eval_math_loss": 0.8692191243171692, | |
| "eval_math_mean_token_accuracy": 0.7733984060287475, | |
| "eval_math_num_tokens": 13232198.0, | |
| "eval_math_runtime": 40.5046, | |
| "eval_math_samples_per_second": 12.344, | |
| "eval_math_steps_per_second": 12.344, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_physics_entropy": 0.6693824680745601, | |
| "eval_physics_loss": 0.6483619213104248, | |
| "eval_physics_mean_token_accuracy": 0.8146560984253883, | |
| "eval_physics_num_tokens": 13232198.0, | |
| "eval_physics_runtime": 45.1205, | |
| "eval_physics_samples_per_second": 11.081, | |
| "eval_physics_steps_per_second": 11.081, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 0.6329503210261465, | |
| "epoch": 0.644, | |
| "grad_norm": 41.5, | |
| "learning_rate": 1.864666666666667e-05, | |
| "loss": 10.1519, | |
| "mean_token_accuracy": 0.8150339797139168, | |
| "num_tokens": 13318037.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "entropy": 0.6718782410956919, | |
| "epoch": 0.648, | |
| "grad_norm": 48.25, | |
| "learning_rate": 1.8624444444444446e-05, | |
| "loss": 10.6876, | |
| "mean_token_accuracy": 0.8110412795096635, | |
| "num_tokens": 13401064.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "entropy": 0.6298892620950938, | |
| "epoch": 0.652, | |
| "grad_norm": 52.0, | |
| "learning_rate": 1.8602222222222224e-05, | |
| "loss": 10.2865, | |
| "mean_token_accuracy": 0.8168573569506407, | |
| "num_tokens": 13488222.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "entropy": 0.6405643708072603, | |
| "epoch": 0.656, | |
| "grad_norm": 39.75, | |
| "learning_rate": 1.858e-05, | |
| "loss": 10.0944, | |
| "mean_token_accuracy": 0.8184370543807745, | |
| "num_tokens": 13575902.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "entropy": 0.6571455264464021, | |
| "epoch": 0.66, | |
| "grad_norm": 41.25, | |
| "learning_rate": 1.855777777777778e-05, | |
| "loss": 10.5153, | |
| "mean_token_accuracy": 0.8104129206389189, | |
| "num_tokens": 13655826.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 0.6566264102235436, | |
| "epoch": 0.664, | |
| "grad_norm": 55.5, | |
| "learning_rate": 1.8535555555555557e-05, | |
| "loss": 10.5224, | |
| "mean_token_accuracy": 0.8120165556669235, | |
| "num_tokens": 13733192.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "entropy": 0.6292093011550606, | |
| "epoch": 0.668, | |
| "grad_norm": 37.0, | |
| "learning_rate": 1.8513333333333335e-05, | |
| "loss": 10.0179, | |
| "mean_token_accuracy": 0.8215251605957746, | |
| "num_tokens": 13816716.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "entropy": 0.6717162574641407, | |
| "epoch": 0.672, | |
| "grad_norm": 42.5, | |
| "learning_rate": 1.8491111111111112e-05, | |
| "loss": 10.9238, | |
| "mean_token_accuracy": 0.8044366929680109, | |
| "num_tokens": 13895997.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "entropy": 0.6257681073620915, | |
| "epoch": 0.676, | |
| "grad_norm": 41.75, | |
| "learning_rate": 1.846888888888889e-05, | |
| "loss": 9.9056, | |
| "mean_token_accuracy": 0.8205471843481064, | |
| "num_tokens": 13988168.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "entropy": 0.6243995727039874, | |
| "epoch": 0.68, | |
| "grad_norm": 45.5, | |
| "learning_rate": 1.8446666666666667e-05, | |
| "loss": 10.0775, | |
| "mean_token_accuracy": 0.8218194592744112, | |
| "num_tokens": 14074579.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "eval_biology_entropy": 1.3095218172669412, | |
| "eval_biology_loss": 1.3988749980926514, | |
| "eval_biology_mean_token_accuracy": 0.6642519612908363, | |
| "eval_biology_num_tokens": 14074579.0, | |
| "eval_biology_runtime": 35.376, | |
| "eval_biology_samples_per_second": 14.134, | |
| "eval_biology_steps_per_second": 14.134, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "eval_chemistry_entropy": 0.810583723127842, | |
| "eval_chemistry_loss": 0.8362264633178711, | |
| "eval_chemistry_mean_token_accuracy": 0.7733853607773781, | |
| "eval_chemistry_num_tokens": 14074579.0, | |
| "eval_chemistry_runtime": 39.7857, | |
| "eval_chemistry_samples_per_second": 12.567, | |
| "eval_chemistry_steps_per_second": 12.567, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "eval_math_entropy": 0.7038493918180466, | |
| "eval_math_loss": 0.8700717687606812, | |
| "eval_math_mean_token_accuracy": 0.7738546322584152, | |
| "eval_math_num_tokens": 14074579.0, | |
| "eval_math_runtime": 40.5553, | |
| "eval_math_samples_per_second": 12.329, | |
| "eval_math_steps_per_second": 12.329, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "eval_physics_entropy": 0.6500247009396553, | |
| "eval_physics_loss": 0.6439645290374756, | |
| "eval_physics_mean_token_accuracy": 0.8147145962715149, | |
| "eval_physics_num_tokens": 14074579.0, | |
| "eval_physics_runtime": 45.1157, | |
| "eval_physics_samples_per_second": 11.083, | |
| "eval_physics_steps_per_second": 11.083, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 0.701755482237786, | |
| "epoch": 0.684, | |
| "grad_norm": 38.0, | |
| "learning_rate": 1.842444444444445e-05, | |
| "loss": 11.2097, | |
| "mean_token_accuracy": 0.8015010356903076, | |
| "num_tokens": 14153649.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "entropy": 0.6312699088826775, | |
| "epoch": 0.688, | |
| "grad_norm": 41.5, | |
| "learning_rate": 1.8402222222222223e-05, | |
| "loss": 10.0591, | |
| "mean_token_accuracy": 0.8177381068468094, | |
| "num_tokens": 14234888.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "entropy": 0.6341699136421084, | |
| "epoch": 0.692, | |
| "grad_norm": 49.75, | |
| "learning_rate": 1.8380000000000004e-05, | |
| "loss": 10.2054, | |
| "mean_token_accuracy": 0.8159505747258663, | |
| "num_tokens": 14323183.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "entropy": 0.6394278825260699, | |
| "epoch": 0.696, | |
| "grad_norm": 44.0, | |
| "learning_rate": 1.8357777777777778e-05, | |
| "loss": 10.2597, | |
| "mean_token_accuracy": 0.8152242347598075, | |
| "num_tokens": 14411150.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "entropy": 0.6502743780612945, | |
| "epoch": 0.7, | |
| "grad_norm": 38.0, | |
| "learning_rate": 1.833555555555556e-05, | |
| "loss": 10.4132, | |
| "mean_token_accuracy": 0.8118325512856245, | |
| "num_tokens": 14490628.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 0.6663031128235162, | |
| "epoch": 0.704, | |
| "grad_norm": 45.0, | |
| "learning_rate": 1.8313333333333333e-05, | |
| "loss": 10.7733, | |
| "mean_token_accuracy": 0.8083227630704641, | |
| "num_tokens": 14567908.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "entropy": 0.6487898311577738, | |
| "epoch": 0.708, | |
| "grad_norm": 40.0, | |
| "learning_rate": 1.8291111111111114e-05, | |
| "loss": 10.2458, | |
| "mean_token_accuracy": 0.81355558373034, | |
| "num_tokens": 14647291.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "entropy": 0.6143384758383036, | |
| "epoch": 0.712, | |
| "grad_norm": 42.75, | |
| "learning_rate": 1.8268888888888888e-05, | |
| "loss": 9.8792, | |
| "mean_token_accuracy": 0.8224807776510715, | |
| "num_tokens": 14728975.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "entropy": 0.6461778385564685, | |
| "epoch": 0.716, | |
| "grad_norm": 42.5, | |
| "learning_rate": 1.824666666666667e-05, | |
| "loss": 10.16, | |
| "mean_token_accuracy": 0.8192640010267496, | |
| "num_tokens": 14804586.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "entropy": 0.6274462300352752, | |
| "epoch": 0.72, | |
| "grad_norm": 43.5, | |
| "learning_rate": 1.8224444444444447e-05, | |
| "loss": 10.2187, | |
| "mean_token_accuracy": 0.8162837028503418, | |
| "num_tokens": 14882927.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "eval_biology_entropy": 1.26099048101902, | |
| "eval_biology_loss": 1.4029406309127808, | |
| "eval_biology_mean_token_accuracy": 0.6639119000434875, | |
| "eval_biology_num_tokens": 14882927.0, | |
| "eval_biology_runtime": 35.2293, | |
| "eval_biology_samples_per_second": 14.193, | |
| "eval_biology_steps_per_second": 14.193, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "eval_chemistry_entropy": 0.793392687290907, | |
| "eval_chemistry_loss": 0.832188606262207, | |
| "eval_chemistry_mean_token_accuracy": 0.7744238600730896, | |
| "eval_chemistry_num_tokens": 14882927.0, | |
| "eval_chemistry_runtime": 39.6195, | |
| "eval_chemistry_samples_per_second": 12.62, | |
| "eval_chemistry_steps_per_second": 12.62, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "eval_math_entropy": 0.6914696083068848, | |
| "eval_math_loss": 0.8720027804374695, | |
| "eval_math_mean_token_accuracy": 0.7736118195652961, | |
| "eval_math_num_tokens": 14882927.0, | |
| "eval_math_runtime": 40.5243, | |
| "eval_math_samples_per_second": 12.338, | |
| "eval_math_steps_per_second": 12.338, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "eval_physics_entropy": 0.6349251216948032, | |
| "eval_physics_loss": 0.6378234624862671, | |
| "eval_physics_mean_token_accuracy": 0.8165309005975723, | |
| "eval_physics_num_tokens": 14882927.0, | |
| "eval_physics_runtime": 45.1781, | |
| "eval_physics_samples_per_second": 11.067, | |
| "eval_physics_steps_per_second": 11.067, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 0.6608743364922702, | |
| "epoch": 0.724, | |
| "grad_norm": 44.5, | |
| "learning_rate": 1.8202222222222225e-05, | |
| "loss": 10.4493, | |
| "mean_token_accuracy": 0.8107382688671351, | |
| "num_tokens": 14966717.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "entropy": 0.5937761082313955, | |
| "epoch": 0.728, | |
| "grad_norm": 43.25, | |
| "learning_rate": 1.8180000000000002e-05, | |
| "loss": 9.6358, | |
| "mean_token_accuracy": 0.8237139344215393, | |
| "num_tokens": 15052770.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "entropy": 0.6625786048360169, | |
| "epoch": 0.732, | |
| "grad_norm": 37.0, | |
| "learning_rate": 1.815777777777778e-05, | |
| "loss": 10.5736, | |
| "mean_token_accuracy": 0.8131286226212978, | |
| "num_tokens": 15135882.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "entropy": 0.6253975971601904, | |
| "epoch": 0.736, | |
| "grad_norm": 49.5, | |
| "learning_rate": 1.8135555555555557e-05, | |
| "loss": 9.9433, | |
| "mean_token_accuracy": 0.8202268972992897, | |
| "num_tokens": 15217342.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "entropy": 0.6260659927502275, | |
| "epoch": 0.74, | |
| "grad_norm": 45.75, | |
| "learning_rate": 1.8113333333333335e-05, | |
| "loss": 10.0495, | |
| "mean_token_accuracy": 0.819375942274928, | |
| "num_tokens": 15303623.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 0.6029338354244829, | |
| "epoch": 0.744, | |
| "grad_norm": 46.75, | |
| "learning_rate": 1.8091111111111113e-05, | |
| "loss": 9.5649, | |
| "mean_token_accuracy": 0.8268326785415411, | |
| "num_tokens": 15385161.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "entropy": 0.6492635687813163, | |
| "epoch": 0.748, | |
| "grad_norm": 47.0, | |
| "learning_rate": 1.806888888888889e-05, | |
| "loss": 10.568, | |
| "mean_token_accuracy": 0.810694944486022, | |
| "num_tokens": 15466126.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "entropy": 0.5932938390411436, | |
| "epoch": 0.752, | |
| "grad_norm": 43.25, | |
| "learning_rate": 1.8046666666666668e-05, | |
| "loss": 9.4157, | |
| "mean_token_accuracy": 0.8285971570760011, | |
| "num_tokens": 15547710.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "entropy": 0.6472898460924625, | |
| "epoch": 0.756, | |
| "grad_norm": 47.75, | |
| "learning_rate": 1.8024444444444445e-05, | |
| "loss": 10.2805, | |
| "mean_token_accuracy": 0.8147294465452433, | |
| "num_tokens": 15625904.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "entropy": 0.6560673840343952, | |
| "epoch": 0.76, | |
| "grad_norm": 43.75, | |
| "learning_rate": 1.8002222222222223e-05, | |
| "loss": 10.5391, | |
| "mean_token_accuracy": 0.8101910080760717, | |
| "num_tokens": 15704494.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "eval_biology_entropy": 1.3165188397169114, | |
| "eval_biology_loss": 1.4028364419937134, | |
| "eval_biology_mean_token_accuracy": 0.6636078572869301, | |
| "eval_biology_num_tokens": 15704494.0, | |
| "eval_biology_runtime": 35.6648, | |
| "eval_biology_samples_per_second": 14.019, | |
| "eval_biology_steps_per_second": 14.019, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "eval_chemistry_entropy": 0.8100777663588524, | |
| "eval_chemistry_loss": 0.8284242749214172, | |
| "eval_chemistry_mean_token_accuracy": 0.7751321754455567, | |
| "eval_chemistry_num_tokens": 15704494.0, | |
| "eval_chemistry_runtime": 39.604, | |
| "eval_chemistry_samples_per_second": 12.625, | |
| "eval_chemistry_steps_per_second": 12.625, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "eval_math_entropy": 0.7177843990921974, | |
| "eval_math_loss": 0.8692219853401184, | |
| "eval_math_mean_token_accuracy": 0.7739469853639602, | |
| "eval_math_num_tokens": 15704494.0, | |
| "eval_math_runtime": 40.511, | |
| "eval_math_samples_per_second": 12.342, | |
| "eval_math_steps_per_second": 12.342, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "eval_physics_entropy": 0.6514912424087524, | |
| "eval_physics_loss": 0.6343158483505249, | |
| "eval_physics_mean_token_accuracy": 0.8171744772791862, | |
| "eval_physics_num_tokens": 15704494.0, | |
| "eval_physics_runtime": 45.1095, | |
| "eval_physics_samples_per_second": 11.084, | |
| "eval_physics_steps_per_second": 11.084, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 0.6272192076779902, | |
| "epoch": 0.764, | |
| "grad_norm": 38.25, | |
| "learning_rate": 1.798e-05, | |
| "loss": 9.9781, | |
| "mean_token_accuracy": 0.8175393354147673, | |
| "num_tokens": 15792219.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "entropy": 0.619711946696043, | |
| "epoch": 0.768, | |
| "grad_norm": 39.25, | |
| "learning_rate": 1.7957777777777778e-05, | |
| "loss": 10.014, | |
| "mean_token_accuracy": 0.8173364765942097, | |
| "num_tokens": 15877177.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "entropy": 0.6192873228341341, | |
| "epoch": 0.772, | |
| "grad_norm": 38.0, | |
| "learning_rate": 1.7935555555555556e-05, | |
| "loss": 9.8355, | |
| "mean_token_accuracy": 0.8235324405133724, | |
| "num_tokens": 15965749.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "entropy": 0.6311636022292078, | |
| "epoch": 0.776, | |
| "grad_norm": 39.0, | |
| "learning_rate": 1.7913333333333337e-05, | |
| "loss": 10.0725, | |
| "mean_token_accuracy": 0.8187215372920036, | |
| "num_tokens": 16050530.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "entropy": 0.6458089170977473, | |
| "epoch": 0.78, | |
| "grad_norm": 40.25, | |
| "learning_rate": 1.789111111111111e-05, | |
| "loss": 10.399, | |
| "mean_token_accuracy": 0.8167071305215359, | |
| "num_tokens": 16136804.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 0.6231075780466199, | |
| "epoch": 0.784, | |
| "grad_norm": 43.25, | |
| "learning_rate": 1.7868888888888892e-05, | |
| "loss": 9.9612, | |
| "mean_token_accuracy": 0.8209680158644914, | |
| "num_tokens": 16219640.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "entropy": 0.6302494045346976, | |
| "epoch": 0.788, | |
| "grad_norm": 39.0, | |
| "learning_rate": 1.7846666666666666e-05, | |
| "loss": 9.9731, | |
| "mean_token_accuracy": 0.8196312204003334, | |
| "num_tokens": 16299595.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "entropy": 0.6404457478784025, | |
| "epoch": 0.792, | |
| "grad_norm": 51.5, | |
| "learning_rate": 1.7824444444444447e-05, | |
| "loss": 10.3109, | |
| "mean_token_accuracy": 0.8141773957759142, | |
| "num_tokens": 16378820.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "entropy": 0.644137370865792, | |
| "epoch": 0.796, | |
| "grad_norm": 44.25, | |
| "learning_rate": 1.780222222222222e-05, | |
| "loss": 10.3859, | |
| "mean_token_accuracy": 0.8138818342238665, | |
| "num_tokens": 16464611.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "entropy": 0.6414632762782275, | |
| "epoch": 0.8, | |
| "grad_norm": 35.0, | |
| "learning_rate": 1.7780000000000003e-05, | |
| "loss": 10.2348, | |
| "mean_token_accuracy": 0.8169484097510576, | |
| "num_tokens": 16548261.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_biology_entropy": 1.310195390045643, | |
| "eval_biology_loss": 1.3905400037765503, | |
| "eval_biology_mean_token_accuracy": 0.6651642812490464, | |
| "eval_biology_num_tokens": 16548261.0, | |
| "eval_biology_runtime": 35.3115, | |
| "eval_biology_samples_per_second": 14.16, | |
| "eval_biology_steps_per_second": 14.16, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_chemistry_entropy": 0.7988228353559971, | |
| "eval_chemistry_loss": 0.8260599374771118, | |
| "eval_chemistry_mean_token_accuracy": 0.7754636185765267, | |
| "eval_chemistry_num_tokens": 16548261.0, | |
| "eval_chemistry_runtime": 39.4167, | |
| "eval_chemistry_samples_per_second": 12.685, | |
| "eval_chemistry_steps_per_second": 12.685, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_math_entropy": 0.6841731403172017, | |
| "eval_math_loss": 0.87116539478302, | |
| "eval_math_mean_token_accuracy": 0.7759169373512268, | |
| "eval_math_num_tokens": 16548261.0, | |
| "eval_math_runtime": 40.3625, | |
| "eval_math_samples_per_second": 12.388, | |
| "eval_math_steps_per_second": 12.388, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_physics_entropy": 0.6220660482347011, | |
| "eval_physics_loss": 0.6307912468910217, | |
| "eval_physics_mean_token_accuracy": 0.818623253762722, | |
| "eval_physics_num_tokens": 16548261.0, | |
| "eval_physics_runtime": 45.0164, | |
| "eval_physics_samples_per_second": 11.107, | |
| "eval_physics_steps_per_second": 11.107, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 10000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.1096331558472863e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |