Text Generation
Transformers
Safetensors
qwen2
llama-factory
full
Generated from Trainer
conversational
text-generation-inference
Instructions to use lemonhat/Qwen2.5-7B-Instruct-agenttuning with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use lemonhat/Qwen2.5-7B-Instruct-agenttuning with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="lemonhat/Qwen2.5-7B-Instruct-agenttuning")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("lemonhat/Qwen2.5-7B-Instruct-agenttuning")
model = AutoModelForCausalLM.from_pretrained("lemonhat/Qwen2.5-7B-Instruct-agenttuning")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use lemonhat/Qwen2.5-7B-Instruct-agenttuning with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "lemonhat/Qwen2.5-7B-Instruct-agenttuning"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "lemonhat/Qwen2.5-7B-Instruct-agenttuning",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker
docker model run hf.co/lemonhat/Qwen2.5-7B-Instruct-agenttuning
- SGLang
How to use lemonhat/Qwen2.5-7B-Instruct-agenttuning with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "lemonhat/Qwen2.5-7B-Instruct-agenttuning" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "lemonhat/Qwen2.5-7B-Instruct-agenttuning",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker images
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "lemonhat/Qwen2.5-7B-Instruct-agenttuning" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "lemonhat/Qwen2.5-7B-Instruct-agenttuning",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
- Docker Model Runner
How to use lemonhat/Qwen2.5-7B-Instruct-agenttuning with Docker Model Runner:
docker model run hf.co/lemonhat/Qwen2.5-7B-Instruct-agenttuning
| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 100, | |
| "global_step": 466, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.002145922746781116, | |
| "grad_norm": 76.18501490873153, | |
| "learning_rate": 4.999943188496862e-06, | |
| "loss": 1.6093, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.004291845493562232, | |
| "grad_norm": 14.009670404971743, | |
| "learning_rate": 4.999772756569482e-06, | |
| "loss": 0.4008, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.006437768240343348, | |
| "grad_norm": 6.1214346980998515, | |
| "learning_rate": 4.999488711963857e-06, | |
| "loss": 0.3077, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.008583690987124463, | |
| "grad_norm": 14.429243581164625, | |
| "learning_rate": 4.999091067589587e-06, | |
| "loss": 0.4675, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.01072961373390558, | |
| "grad_norm": 7.580155180829176, | |
| "learning_rate": 4.998579841519292e-06, | |
| "loss": 0.4538, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.012875536480686695, | |
| "grad_norm": 7.832206035781183, | |
| "learning_rate": 4.99795505698779e-06, | |
| "loss": 0.2233, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.015021459227467811, | |
| "grad_norm": 6.259035686547294, | |
| "learning_rate": 4.997216742391038e-06, | |
| "loss": 0.2383, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.017167381974248927, | |
| "grad_norm": 6.2512799288622, | |
| "learning_rate": 4.996364931284847e-06, | |
| "loss": 0.2912, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.019313304721030045, | |
| "grad_norm": 7.589384412275137, | |
| "learning_rate": 4.995399662383352e-06, | |
| "loss": 0.2463, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.02145922746781116, | |
| "grad_norm": 10.710971905656887, | |
| "learning_rate": 4.994320979557256e-06, | |
| "loss": 0.3021, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.023605150214592276, | |
| "grad_norm": 9.561570863346544, | |
| "learning_rate": 4.99312893183183e-06, | |
| "loss": 0.3147, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.02575107296137339, | |
| "grad_norm": 38.042377389708484, | |
| "learning_rate": 4.991823573384695e-06, | |
| "loss": 0.8302, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.027896995708154508, | |
| "grad_norm": 8.86173699044075, | |
| "learning_rate": 4.990404963543352e-06, | |
| "loss": 0.4076, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.030042918454935622, | |
| "grad_norm": 21.537285072501607, | |
| "learning_rate": 4.988873166782485e-06, | |
| "loss": 0.7611, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.032188841201716736, | |
| "grad_norm": 6.047109119735588, | |
| "learning_rate": 4.987228252721037e-06, | |
| "loss": 0.3053, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.034334763948497854, | |
| "grad_norm": 14.447362163381339, | |
| "learning_rate": 4.985470296119038e-06, | |
| "loss": 0.5285, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.03648068669527897, | |
| "grad_norm": 7.74290518703786, | |
| "learning_rate": 4.983599376874216e-06, | |
| "loss": 0.2236, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.03862660944206009, | |
| "grad_norm": 5.727455325582489, | |
| "learning_rate": 4.981615580018358e-06, | |
| "loss": 0.204, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.0407725321888412, | |
| "grad_norm": 16.691846221989785, | |
| "learning_rate": 4.979518995713448e-06, | |
| "loss": 0.4196, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.04291845493562232, | |
| "grad_norm": 16.720958919193354, | |
| "learning_rate": 4.977309719247571e-06, | |
| "loss": 0.7494, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.045064377682403435, | |
| "grad_norm": 6.612052621566363, | |
| "learning_rate": 4.974987851030581e-06, | |
| "loss": 0.3287, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.04721030042918455, | |
| "grad_norm": 7.3652166724204315, | |
| "learning_rate": 4.972553496589537e-06, | |
| "loss": 0.3473, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.04935622317596566, | |
| "grad_norm": 5.025111473292376, | |
| "learning_rate": 4.970006766563906e-06, | |
| "loss": 0.3725, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.05150214592274678, | |
| "grad_norm": 11.718331155399566, | |
| "learning_rate": 4.967347776700538e-06, | |
| "loss": 0.6273, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.0536480686695279, | |
| "grad_norm": 4.873321533780134, | |
| "learning_rate": 4.964576647848401e-06, | |
| "loss": 0.5175, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.055793991416309016, | |
| "grad_norm": 4.204769507730203, | |
| "learning_rate": 4.9616935059530915e-06, | |
| "loss": 0.1414, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.05793991416309013, | |
| "grad_norm": 7.206330363974099, | |
| "learning_rate": 4.958698482051109e-06, | |
| "loss": 0.2285, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.060085836909871244, | |
| "grad_norm": 4.7858940585296095, | |
| "learning_rate": 4.955591712263901e-06, | |
| "loss": 0.2037, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.06223175965665236, | |
| "grad_norm": 12.477235363576892, | |
| "learning_rate": 4.952373337791678e-06, | |
| "loss": 0.5673, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.06437768240343347, | |
| "grad_norm": 3.797571893331321, | |
| "learning_rate": 4.9490435049069925e-06, | |
| "loss": 0.2233, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.06652360515021459, | |
| "grad_norm": 3.3156309080116158, | |
| "learning_rate": 4.9456023649480935e-06, | |
| "loss": 0.1981, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.06866952789699571, | |
| "grad_norm": 4.531878367746364, | |
| "learning_rate": 4.942050074312048e-06, | |
| "loss": 0.1946, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.07081545064377683, | |
| "grad_norm": 5.78159856239121, | |
| "learning_rate": 4.9383867944476325e-06, | |
| "loss": 0.2749, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.07296137339055794, | |
| "grad_norm": 3.7408734670014794, | |
| "learning_rate": 4.934612691847995e-06, | |
| "loss": 0.3626, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.07510729613733906, | |
| "grad_norm": 15.815816851091464, | |
| "learning_rate": 4.930727938043091e-06, | |
| "loss": 0.4734, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.07725321888412018, | |
| "grad_norm": 5.5369981240766935, | |
| "learning_rate": 4.926732709591879e-06, | |
| "loss": 0.2659, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.07939914163090128, | |
| "grad_norm": 12.910997188700815, | |
| "learning_rate": 4.9226271880743086e-06, | |
| "loss": 0.4348, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.0815450643776824, | |
| "grad_norm": 8.020574277619785, | |
| "learning_rate": 4.918411560083058e-06, | |
| "loss": 0.3308, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.08369098712446352, | |
| "grad_norm": 6.453487445532046, | |
| "learning_rate": 4.914086017215059e-06, | |
| "loss": 0.4054, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.08583690987124463, | |
| "grad_norm": 6.115766672764467, | |
| "learning_rate": 4.909650756062782e-06, | |
| "loss": 0.332, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.08798283261802575, | |
| "grad_norm": 6.319229087749166, | |
| "learning_rate": 4.9051059782053125e-06, | |
| "loss": 0.2929, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.09012875536480687, | |
| "grad_norm": 14.438350682047952, | |
| "learning_rate": 4.900451890199179e-06, | |
| "loss": 0.7799, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.09227467811158799, | |
| "grad_norm": 6.79883162868288, | |
| "learning_rate": 4.895688703568968e-06, | |
| "loss": 0.3752, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.0944206008583691, | |
| "grad_norm": 5.690387611301978, | |
| "learning_rate": 4.890816634797716e-06, | |
| "loss": 0.2059, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.09656652360515021, | |
| "grad_norm": 4.431043352022045, | |
| "learning_rate": 4.885835905317061e-06, | |
| "loss": 0.2755, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.09871244635193133, | |
| "grad_norm": 6.846760154412704, | |
| "learning_rate": 4.880746741497187e-06, | |
| "loss": 0.319, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.10085836909871244, | |
| "grad_norm": 5.075521155502137, | |
| "learning_rate": 4.87554937463653e-06, | |
| "loss": 0.2944, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.10300429184549356, | |
| "grad_norm": 5.000407636367204, | |
| "learning_rate": 4.87024404095127e-06, | |
| "loss": 0.244, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.10515021459227468, | |
| "grad_norm": 5.5213729215970515, | |
| "learning_rate": 4.8648309815645915e-06, | |
| "loss": 0.3535, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.1072961373390558, | |
| "grad_norm": 5.162325783820257, | |
| "learning_rate": 4.8593104424957275e-06, | |
| "loss": 0.3763, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.10944206008583691, | |
| "grad_norm": 5.241667905913422, | |
| "learning_rate": 4.853682674648775e-06, | |
| "loss": 0.256, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.11158798283261803, | |
| "grad_norm": 5.009026381629991, | |
| "learning_rate": 4.847947933801296e-06, | |
| "loss": 0.3455, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.11373390557939914, | |
| "grad_norm": 12.563430161222314, | |
| "learning_rate": 4.842106480592687e-06, | |
| "loss": 0.6362, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.11587982832618025, | |
| "grad_norm": 6.315768827715915, | |
| "learning_rate": 4.836158580512339e-06, | |
| "loss": 0.2889, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.11802575107296137, | |
| "grad_norm": 5.218075678965081, | |
| "learning_rate": 4.8301045038875665e-06, | |
| "loss": 0.1951, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.12017167381974249, | |
| "grad_norm": 7.3960507355002605, | |
| "learning_rate": 4.823944525871324e-06, | |
| "loss": 0.3214, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.1223175965665236, | |
| "grad_norm": 7.4185340069371986, | |
| "learning_rate": 4.817678926429702e-06, | |
| "loss": 0.3852, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.12446351931330472, | |
| "grad_norm": 4.9490761074888825, | |
| "learning_rate": 4.8113079903291955e-06, | |
| "loss": 0.4695, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.12660944206008584, | |
| "grad_norm": 5.408957891954199, | |
| "learning_rate": 4.804832007123771e-06, | |
| "loss": 0.3385, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.12875536480686695, | |
| "grad_norm": 12.766577641000062, | |
| "learning_rate": 4.7982512711416995e-06, | |
| "loss": 0.4539, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.13090128755364808, | |
| "grad_norm": 6.338330185457858, | |
| "learning_rate": 4.791566081472185e-06, | |
| "loss": 0.4034, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.13304721030042918, | |
| "grad_norm": 6.151730476584575, | |
| "learning_rate": 4.784776741951766e-06, | |
| "loss": 0.3683, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.1351931330472103, | |
| "grad_norm": 4.345928683775522, | |
| "learning_rate": 4.777883561150508e-06, | |
| "loss": 0.2088, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.13733905579399142, | |
| "grad_norm": 6.444874963490508, | |
| "learning_rate": 4.770886852357983e-06, | |
| "loss": 0.2917, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.13948497854077252, | |
| "grad_norm": 5.531328541781978, | |
| "learning_rate": 4.763786933569025e-06, | |
| "loss": 0.279, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.14163090128755365, | |
| "grad_norm": 4.080884948693169, | |
| "learning_rate": 4.75658412746928e-06, | |
| "loss": 0.195, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.14377682403433475, | |
| "grad_norm": 10.700949555936232, | |
| "learning_rate": 4.7492787614205425e-06, | |
| "loss": 0.4479, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.1459227467811159, | |
| "grad_norm": 5.481209106182085, | |
| "learning_rate": 4.7418711674458735e-06, | |
| "loss": 0.2758, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.148068669527897, | |
| "grad_norm": 6.189118615859581, | |
| "learning_rate": 4.734361682214511e-06, | |
| "loss": 0.2218, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.15021459227467812, | |
| "grad_norm": 6.34535278449602, | |
| "learning_rate": 4.726750647026569e-06, | |
| "loss": 0.4419, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.15236051502145923, | |
| "grad_norm": 5.239918053202764, | |
| "learning_rate": 4.719038407797529e-06, | |
| "loss": 0.2557, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.15450643776824036, | |
| "grad_norm": 6.26390385146301, | |
| "learning_rate": 4.711225315042513e-06, | |
| "loss": 0.5612, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.15665236051502146, | |
| "grad_norm": 6.195157191377425, | |
| "learning_rate": 4.703311723860356e-06, | |
| "loss": 0.3057, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.15879828326180256, | |
| "grad_norm": 16.357650722017635, | |
| "learning_rate": 4.695297993917465e-06, | |
| "loss": 0.8227, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.1609442060085837, | |
| "grad_norm": 5.339299574193258, | |
| "learning_rate": 4.687184489431476e-06, | |
| "loss": 0.2878, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.1630901287553648, | |
| "grad_norm": 14.30043548697042, | |
| "learning_rate": 4.678971579154698e-06, | |
| "loss": 0.8272, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.16523605150214593, | |
| "grad_norm": 6.315372884620644, | |
| "learning_rate": 4.670659636357352e-06, | |
| "loss": 0.2812, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.16738197424892703, | |
| "grad_norm": 4.0158091796680075, | |
| "learning_rate": 4.66224903881061e-06, | |
| "loss": 0.3212, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.16952789699570817, | |
| "grad_norm": 4.741126690857027, | |
| "learning_rate": 4.653740168769424e-06, | |
| "loss": 0.2627, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.17167381974248927, | |
| "grad_norm": 4.654686079147113, | |
| "learning_rate": 4.64513341295515e-06, | |
| "loss": 0.1007, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.17381974248927037, | |
| "grad_norm": 2.858109471127793, | |
| "learning_rate": 4.6364291625379785e-06, | |
| "loss": 0.1819, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.1759656652360515, | |
| "grad_norm": 13.540162342975938, | |
| "learning_rate": 4.627627813119147e-06, | |
| "loss": 0.5442, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.1781115879828326, | |
| "grad_norm": 14.029611827699513, | |
| "learning_rate": 4.618729764712969e-06, | |
| "loss": 0.9108, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.18025751072961374, | |
| "grad_norm": 4.457304810520309, | |
| "learning_rate": 4.609735421728647e-06, | |
| "loss": 0.2638, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.18240343347639484, | |
| "grad_norm": 5.935629670933359, | |
| "learning_rate": 4.600645192951898e-06, | |
| "loss": 0.2403, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.18454935622317598, | |
| "grad_norm": 5.136446320474782, | |
| "learning_rate": 4.591459491526371e-06, | |
| "loss": 0.246, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.18669527896995708, | |
| "grad_norm": 5.684377700354834, | |
| "learning_rate": 4.582178734934869e-06, | |
| "loss": 0.2751, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.1888412017167382, | |
| "grad_norm": 15.146436017458708, | |
| "learning_rate": 4.572803344980378e-06, | |
| "loss": 0.8044, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.19098712446351931, | |
| "grad_norm": 4.636355059101864, | |
| "learning_rate": 4.563333747766896e-06, | |
| "loss": 0.2279, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.19313304721030042, | |
| "grad_norm": 4.963353306284944, | |
| "learning_rate": 4.553770373680062e-06, | |
| "loss": 0.1785, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.19527896995708155, | |
| "grad_norm": 3.887562311073866, | |
| "learning_rate": 4.544113657367604e-06, | |
| "loss": 0.1408, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.19742489270386265, | |
| "grad_norm": 4.996113376607759, | |
| "learning_rate": 4.5343640377195766e-06, | |
| "loss": 0.2715, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.19957081545064378, | |
| "grad_norm": 5.484264961569098, | |
| "learning_rate": 4.524521957848416e-06, | |
| "loss": 0.3287, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.2017167381974249, | |
| "grad_norm": 5.08682917163341, | |
| "learning_rate": 4.514587865068806e-06, | |
| "loss": 0.366, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.20386266094420602, | |
| "grad_norm": 6.8043618981585725, | |
| "learning_rate": 4.504562210877338e-06, | |
| "loss": 0.3089, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.20600858369098712, | |
| "grad_norm": 14.009371009062516, | |
| "learning_rate": 4.494445450932003e-06, | |
| "loss": 0.5487, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.20815450643776823, | |
| "grad_norm": 13.77182300710328, | |
| "learning_rate": 4.484238045031471e-06, | |
| "loss": 0.6681, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.21030042918454936, | |
| "grad_norm": 5.373599093252736, | |
| "learning_rate": 4.473940457094199e-06, | |
| "loss": 0.2824, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.21244635193133046, | |
| "grad_norm": 7.7339638883523145, | |
| "learning_rate": 4.463553155137348e-06, | |
| "loss": 0.3773, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.2145922746781116, | |
| "grad_norm": 4.358486041621687, | |
| "learning_rate": 4.453076611255507e-06, | |
| "loss": 0.1968, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2145922746781116, | |
| "eval_loss": 1.61643385887146, | |
| "eval_runtime": 0.5589, | |
| "eval_samples_per_second": 3.578, | |
| "eval_steps_per_second": 1.789, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2167381974248927, | |
| "grad_norm": 10.51123783356939, | |
| "learning_rate": 4.442511301599241e-06, | |
| "loss": 0.6631, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.21888412017167383, | |
| "grad_norm": 6.204285891841481, | |
| "learning_rate": 4.431857706353449e-06, | |
| "loss": 0.1981, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.22103004291845493, | |
| "grad_norm": 3.509886696140534, | |
| "learning_rate": 4.4211163097155375e-06, | |
| "loss": 0.2514, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.22317596566523606, | |
| "grad_norm": 5.794240091360383, | |
| "learning_rate": 4.4102875998734176e-06, | |
| "loss": 0.4743, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.22532188841201717, | |
| "grad_norm": 7.1780216249839475, | |
| "learning_rate": 4.399372068983317e-06, | |
| "loss": 0.3754, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.22746781115879827, | |
| "grad_norm": 3.4853873904835835, | |
| "learning_rate": 4.388370213147409e-06, | |
| "loss": 0.1217, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.2296137339055794, | |
| "grad_norm": 5.282583393202969, | |
| "learning_rate": 4.377282532391267e-06, | |
| "loss": 0.2422, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.2317596566523605, | |
| "grad_norm": 5.917106273735147, | |
| "learning_rate": 4.36610953064114e-06, | |
| "loss": 0.3188, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.23390557939914164, | |
| "grad_norm": 5.49762193840351, | |
| "learning_rate": 4.354851715701046e-06, | |
| "loss": 0.2603, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.23605150214592274, | |
| "grad_norm": 5.113114203550232, | |
| "learning_rate": 4.343509599229697e-06, | |
| "loss": 0.2025, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.23819742489270387, | |
| "grad_norm": 3.9665916022215733, | |
| "learning_rate": 4.332083696717242e-06, | |
| "loss": 0.2608, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.24034334763948498, | |
| "grad_norm": 5.001407701927511, | |
| "learning_rate": 4.3205745274618365e-06, | |
| "loss": 0.3543, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.24248927038626608, | |
| "grad_norm": 4.783668888273014, | |
| "learning_rate": 4.308982614546045e-06, | |
| "loss": 0.1367, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.2446351931330472, | |
| "grad_norm": 13.339939405425257, | |
| "learning_rate": 4.297308484813067e-06, | |
| "loss": 0.4563, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.24678111587982832, | |
| "grad_norm": 6.774593211018968, | |
| "learning_rate": 4.2855526688427875e-06, | |
| "loss": 0.2344, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.24892703862660945, | |
| "grad_norm": 6.032144655648605, | |
| "learning_rate": 4.273715700927666e-06, | |
| "loss": 0.3559, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.2510729613733906, | |
| "grad_norm": 11.663752172897125, | |
| "learning_rate": 4.261798119048456e-06, | |
| "loss": 0.5593, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.2532188841201717, | |
| "grad_norm": 11.248638669710262, | |
| "learning_rate": 4.249800464849751e-06, | |
| "loss": 0.4252, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.2553648068669528, | |
| "grad_norm": 5.542239291058805, | |
| "learning_rate": 4.2377232836153635e-06, | |
| "loss": 0.2671, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.2575107296137339, | |
| "grad_norm": 5.6163735157751935, | |
| "learning_rate": 4.22556712424355e-06, | |
| "loss": 0.2948, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.259656652360515, | |
| "grad_norm": 7.094248362569725, | |
| "learning_rate": 4.21333253922206e-06, | |
| "loss": 0.3536, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.26180257510729615, | |
| "grad_norm": 6.177984494937913, | |
| "learning_rate": 4.201020084603027e-06, | |
| "loss": 0.3318, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.26394849785407726, | |
| "grad_norm": 5.442893696217635, | |
| "learning_rate": 4.1886303199776924e-06, | |
| "loss": 0.3083, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.26609442060085836, | |
| "grad_norm": 5.3971104766895355, | |
| "learning_rate": 4.176163808450978e-06, | |
| "loss": 0.2955, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.26824034334763946, | |
| "grad_norm": 4.524019146701118, | |
| "learning_rate": 4.163621116615892e-06, | |
| "loss": 0.3251, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.2703862660944206, | |
| "grad_norm": 5.615564703579776, | |
| "learning_rate": 4.151002814527774e-06, | |
| "loss": 0.3306, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.27253218884120173, | |
| "grad_norm": 5.126082022354096, | |
| "learning_rate": 4.138309475678393e-06, | |
| "loss": 0.2977, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.27467811158798283, | |
| "grad_norm": 11.215215705069811, | |
| "learning_rate": 4.125541676969876e-06, | |
| "loss": 0.4561, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.27682403433476394, | |
| "grad_norm": 6.185141044554984, | |
| "learning_rate": 4.112699998688492e-06, | |
| "loss": 0.3652, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.27896995708154504, | |
| "grad_norm": 14.23422453207559, | |
| "learning_rate": 4.099785024478276e-06, | |
| "loss": 0.5595, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.2811158798283262, | |
| "grad_norm": 4.138921969139266, | |
| "learning_rate": 4.086797341314509e-06, | |
| "loss": 0.2577, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.2832618025751073, | |
| "grad_norm": 4.766155830963139, | |
| "learning_rate": 4.073737539477033e-06, | |
| "loss": 0.266, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.2854077253218884, | |
| "grad_norm": 16.59914611751421, | |
| "learning_rate": 4.060606212523425e-06, | |
| "loss": 0.7351, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.2875536480686695, | |
| "grad_norm": 12.70203249759127, | |
| "learning_rate": 4.047403957262024e-06, | |
| "loss": 0.7393, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.28969957081545067, | |
| "grad_norm": 5.854857495039337, | |
| "learning_rate": 4.034131373724802e-06, | |
| "loss": 0.3114, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.2918454935622318, | |
| "grad_norm": 7.054371064646026, | |
| "learning_rate": 4.020789065140097e-06, | |
| "loss": 0.3954, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.2939914163090129, | |
| "grad_norm": 5.777740234720859, | |
| "learning_rate": 4.0073776379051945e-06, | |
| "loss": 0.3613, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.296137339055794, | |
| "grad_norm": 6.807856815251271, | |
| "learning_rate": 3.993897701558764e-06, | |
| "loss": 0.2781, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.2982832618025751, | |
| "grad_norm": 4.274362381043211, | |
| "learning_rate": 3.980349868753166e-06, | |
| "loss": 0.2904, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.30042918454935624, | |
| "grad_norm": 4.166342301745076, | |
| "learning_rate": 3.9667347552265945e-06, | |
| "loss": 0.2234, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.30257510729613735, | |
| "grad_norm": 4.702643904412771, | |
| "learning_rate": 3.953052979775103e-06, | |
| "loss": 0.2765, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.30472103004291845, | |
| "grad_norm": 15.066466935467592, | |
| "learning_rate": 3.939305164224474e-06, | |
| "loss": 0.7204, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.30686695278969955, | |
| "grad_norm": 4.817588986196399, | |
| "learning_rate": 3.925491933401961e-06, | |
| "loss": 0.2919, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.3090128755364807, | |
| "grad_norm": 4.841054379645271, | |
| "learning_rate": 3.911613915107888e-06, | |
| "loss": 0.2624, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.3111587982832618, | |
| "grad_norm": 5.54460036441131, | |
| "learning_rate": 3.89767174008712e-06, | |
| "loss": 0.3137, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.3133047210300429, | |
| "grad_norm": 4.6296008910638164, | |
| "learning_rate": 3.883666042000392e-06, | |
| "loss": 0.2172, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.315450643776824, | |
| "grad_norm": 5.738911770360598, | |
| "learning_rate": 3.869597457395514e-06, | |
| "loss": 0.2998, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.31759656652360513, | |
| "grad_norm": 4.284288974195585, | |
| "learning_rate": 3.855466625678435e-06, | |
| "loss": 0.2037, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.3197424892703863, | |
| "grad_norm": 15.839881512200966, | |
| "learning_rate": 3.841274189084189e-06, | |
| "loss": 0.7527, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.3218884120171674, | |
| "grad_norm": 6.689386040699, | |
| "learning_rate": 3.8270207926477e-06, | |
| "loss": 0.3388, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.3240343347639485, | |
| "grad_norm": 5.131159476750089, | |
| "learning_rate": 3.8127070841744695e-06, | |
| "loss": 0.21, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.3261802575107296, | |
| "grad_norm": 12.20701029064903, | |
| "learning_rate": 3.798333714211132e-06, | |
| "loss": 0.5333, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.3283261802575107, | |
| "grad_norm": 2.8374089434590966, | |
| "learning_rate": 3.7839013360158904e-06, | |
| "loss": 0.2125, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.33047210300429186, | |
| "grad_norm": 13.285466340975823, | |
| "learning_rate": 3.769410605528824e-06, | |
| "loss": 0.5934, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.33261802575107297, | |
| "grad_norm": 5.3446998985506875, | |
| "learning_rate": 3.7548621813420765e-06, | |
| "loss": 0.3581, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.33476394849785407, | |
| "grad_norm": 7.274618884484683, | |
| "learning_rate": 3.7402567246699257e-06, | |
| "loss": 0.3321, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.3369098712446352, | |
| "grad_norm": 6.497928899578661, | |
| "learning_rate": 3.72559489931873e-06, | |
| "loss": 0.4336, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.33905579399141633, | |
| "grad_norm": 5.931060877134416, | |
| "learning_rate": 3.710877371656757e-06, | |
| "loss": 0.2499, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.34120171673819744, | |
| "grad_norm": 11.662557566869323, | |
| "learning_rate": 3.696104810583904e-06, | |
| "loss": 0.521, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.34334763948497854, | |
| "grad_norm": 10.738868357386822, | |
| "learning_rate": 3.68127788750129e-06, | |
| "loss": 0.3935, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.34549356223175964, | |
| "grad_norm": 10.27557863180008, | |
| "learning_rate": 3.6663972762807453e-06, | |
| "loss": 0.3763, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.34763948497854075, | |
| "grad_norm": 3.4237960233644866, | |
| "learning_rate": 3.6514636532341825e-06, | |
| "loss": 0.2778, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.3497854077253219, | |
| "grad_norm": 3.6998517744689763, | |
| "learning_rate": 3.6364776970828586e-06, | |
| "loss": 0.1564, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.351931330472103, | |
| "grad_norm": 4.614921333239657, | |
| "learning_rate": 3.621440088926531e-06, | |
| "loss": 0.272, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.3540772532188841, | |
| "grad_norm": 6.050087330401235, | |
| "learning_rate": 3.6063515122124975e-06, | |
| "loss": 0.3652, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.3562231759656652, | |
| "grad_norm": 4.387527359728692, | |
| "learning_rate": 3.5912126527045368e-06, | |
| "loss": 0.2172, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.3583690987124464, | |
| "grad_norm": 15.020253643431802, | |
| "learning_rate": 3.5760241984517397e-06, | |
| "loss": 0.8165, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.3605150214592275, | |
| "grad_norm": 4.553283039366036, | |
| "learning_rate": 3.560786839757242e-06, | |
| "loss": 0.3647, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.3626609442060086, | |
| "grad_norm": 7.402637796397458, | |
| "learning_rate": 3.5455012691468417e-06, | |
| "loss": 0.3558, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.3648068669527897, | |
| "grad_norm": 9.457334627147352, | |
| "learning_rate": 3.5301681813375343e-06, | |
| "loss": 0.3833, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.3669527896995708, | |
| "grad_norm": 4.692028452001265, | |
| "learning_rate": 3.5147882732059323e-06, | |
| "loss": 0.2337, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.36909871244635195, | |
| "grad_norm": 8.372452946921305, | |
| "learning_rate": 3.4993622437565955e-06, | |
| "loss": 0.2836, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.37124463519313305, | |
| "grad_norm": 3.3319166006121783, | |
| "learning_rate": 3.4838907940902607e-06, | |
| "loss": 0.1544, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.37339055793991416, | |
| "grad_norm": 13.281904749983129, | |
| "learning_rate": 3.4683746273719754e-06, | |
| "loss": 0.4684, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.37553648068669526, | |
| "grad_norm": 4.672588539349313, | |
| "learning_rate": 3.4528144487991448e-06, | |
| "loss": 0.2533, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.3776824034334764, | |
| "grad_norm": 9.892923641996738, | |
| "learning_rate": 3.437210965569475e-06, | |
| "loss": 0.2724, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.3798283261802575, | |
| "grad_norm": 11.942875136773626, | |
| "learning_rate": 3.421564886848835e-06, | |
| "loss": 0.5145, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.38197424892703863, | |
| "grad_norm": 5.004292816588128, | |
| "learning_rate": 3.4058769237390254e-06, | |
| "loss": 0.2421, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.38412017167381973, | |
| "grad_norm": 6.348610825705306, | |
| "learning_rate": 3.3901477892454583e-06, | |
| "loss": 0.5266, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.38626609442060084, | |
| "grad_norm": 11.476145919220956, | |
| "learning_rate": 3.3743781982447533e-06, | |
| "loss": 0.5593, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.388412017167382, | |
| "grad_norm": 4.384950541640652, | |
| "learning_rate": 3.3585688674522438e-06, | |
| "loss": 0.1745, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.3905579399141631, | |
| "grad_norm": 12.738421839026058, | |
| "learning_rate": 3.3427205153894088e-06, | |
| "loss": 0.8569, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.3927038626609442, | |
| "grad_norm": 4.770469502184926, | |
| "learning_rate": 3.3268338623512094e-06, | |
| "loss": 0.3608, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.3948497854077253, | |
| "grad_norm": 5.608829333428552, | |
| "learning_rate": 3.3109096303733564e-06, | |
| "loss": 0.2229, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.3969957081545064, | |
| "grad_norm": 14.271945687755156, | |
| "learning_rate": 3.2949485431994945e-06, | |
| "loss": 0.7949, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.39914163090128757, | |
| "grad_norm": 17.320126868800134, | |
| "learning_rate": 3.2789513262483053e-06, | |
| "loss": 0.7768, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.4012875536480687, | |
| "grad_norm": 6.304445282235124, | |
| "learning_rate": 3.2629187065805445e-06, | |
| "loss": 0.3341, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.4034334763948498, | |
| "grad_norm": 4.772846096021666, | |
| "learning_rate": 3.2468514128659884e-06, | |
| "loss": 0.2169, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.4055793991416309, | |
| "grad_norm": 12.095944359895759, | |
| "learning_rate": 3.230750175350324e-06, | |
| "loss": 0.5972, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.40772532188841204, | |
| "grad_norm": 17.046644577375776, | |
| "learning_rate": 3.2146157258219534e-06, | |
| "loss": 0.8746, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.40987124463519314, | |
| "grad_norm": 5.155144213224812, | |
| "learning_rate": 3.1984487975787433e-06, | |
| "loss": 0.403, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.41201716738197425, | |
| "grad_norm": 3.2493116945473273, | |
| "learning_rate": 3.1822501253946875e-06, | |
| "loss": 0.2046, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.41416309012875535, | |
| "grad_norm": 4.232233097969361, | |
| "learning_rate": 3.1660204454865203e-06, | |
| "loss": 0.3811, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.41630901287553645, | |
| "grad_norm": 4.72756957325817, | |
| "learning_rate": 3.1497604954802485e-06, | |
| "loss": 0.3, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.4184549356223176, | |
| "grad_norm": 12.946178614407032, | |
| "learning_rate": 3.1334710143776346e-06, | |
| "loss": 0.5872, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.4206008583690987, | |
| "grad_norm": 4.61301032231011, | |
| "learning_rate": 3.1171527425226027e-06, | |
| "loss": 0.2081, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.4227467811158798, | |
| "grad_norm": 12.907938814433235, | |
| "learning_rate": 3.100806421567596e-06, | |
| "loss": 0.6287, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.4248927038626609, | |
| "grad_norm": 5.546582248819232, | |
| "learning_rate": 3.084432794439865e-06, | |
| "loss": 0.3105, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.4270386266094421, | |
| "grad_norm": 7.245512728978741, | |
| "learning_rate": 3.0680326053077047e-06, | |
| "loss": 0.3016, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.4291845493562232, | |
| "grad_norm": 10.84493687480769, | |
| "learning_rate": 3.0516065995466336e-06, | |
| "loss": 0.3219, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.4291845493562232, | |
| "eval_loss": 1.4745992422103882, | |
| "eval_runtime": 0.5572, | |
| "eval_samples_per_second": 3.59, | |
| "eval_steps_per_second": 1.795, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.4313304721030043, | |
| "grad_norm": 4.10172843012159, | |
| "learning_rate": 3.0351555237055135e-06, | |
| "loss": 0.1799, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.4334763948497854, | |
| "grad_norm": 5.306456302714684, | |
| "learning_rate": 3.0186801254726213e-06, | |
| "loss": 0.2365, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.4356223175965665, | |
| "grad_norm": 5.299147474065381, | |
| "learning_rate": 3.0021811536416676e-06, | |
| "loss": 0.2406, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.43776824034334766, | |
| "grad_norm": 4.14896225307621, | |
| "learning_rate": 2.985659358077765e-06, | |
| "loss": 0.1931, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.43991416309012876, | |
| "grad_norm": 13.03761590039539, | |
| "learning_rate": 2.9691154896833454e-06, | |
| "loss": 0.325, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.44206008583690987, | |
| "grad_norm": 10.573666701230737, | |
| "learning_rate": 2.9525503003640336e-06, | |
| "loss": 0.324, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.44420600858369097, | |
| "grad_norm": 11.926884819820218, | |
| "learning_rate": 2.935964542994475e-06, | |
| "loss": 0.5481, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.44635193133047213, | |
| "grad_norm": 10.151629428230573, | |
| "learning_rate": 2.9193589713841132e-06, | |
| "loss": 0.5082, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.44849785407725323, | |
| "grad_norm": 12.486051719027907, | |
| "learning_rate": 2.902734340242937e-06, | |
| "loss": 0.3599, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.45064377682403434, | |
| "grad_norm": 15.529513845799773, | |
| "learning_rate": 2.8860914051471722e-06, | |
| "loss": 0.7298, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.45278969957081544, | |
| "grad_norm": 5.461786086907061, | |
| "learning_rate": 2.869430922504947e-06, | |
| "loss": 0.2674, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.45493562231759654, | |
| "grad_norm": 5.376224720954328, | |
| "learning_rate": 2.852753649521911e-06, | |
| "loss": 0.3557, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.4570815450643777, | |
| "grad_norm": 17.151457129512085, | |
| "learning_rate": 2.836060344166821e-06, | |
| "loss": 0.6405, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.4592274678111588, | |
| "grad_norm": 4.757476534660833, | |
| "learning_rate": 2.8193517651370934e-06, | |
| "loss": 0.3124, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.4613733905579399, | |
| "grad_norm": 5.64585393975328, | |
| "learning_rate": 2.80262867182432e-06, | |
| "loss": 0.2888, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.463519313304721, | |
| "grad_norm": 13.765994790690012, | |
| "learning_rate": 2.785891824279755e-06, | |
| "loss": 0.5319, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.4656652360515021, | |
| "grad_norm": 3.4645720688637702, | |
| "learning_rate": 2.7691419831797724e-06, | |
| "loss": 0.1354, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.4678111587982833, | |
| "grad_norm": 4.728651796593724, | |
| "learning_rate": 2.7523799097912905e-06, | |
| "loss": 0.2457, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.4699570815450644, | |
| "grad_norm": 5.579222939328203, | |
| "learning_rate": 2.73560636593718e-06, | |
| "loss": 0.3542, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.4721030042918455, | |
| "grad_norm": 5.988497269511321, | |
| "learning_rate": 2.7188221139616303e-06, | |
| "loss": 0.4184, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.4742489270386266, | |
| "grad_norm": 6.257959157029455, | |
| "learning_rate": 2.70202791669551e-06, | |
| "loss": 0.2553, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.47639484978540775, | |
| "grad_norm": 12.558388707787643, | |
| "learning_rate": 2.68522453742169e-06, | |
| "loss": 0.541, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.47854077253218885, | |
| "grad_norm": 5.81966073574648, | |
| "learning_rate": 2.66841273984036e-06, | |
| "loss": 0.3147, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.48068669527896996, | |
| "grad_norm": 12.408285990600806, | |
| "learning_rate": 2.6515932880343103e-06, | |
| "loss": 0.4052, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.48283261802575106, | |
| "grad_norm": 6.652286326696977, | |
| "learning_rate": 2.634766946434214e-06, | |
| "loss": 0.2372, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.48497854077253216, | |
| "grad_norm": 11.13575728119692, | |
| "learning_rate": 2.6179344797838775e-06, | |
| "loss": 0.3067, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.4871244635193133, | |
| "grad_norm": 11.134631212750397, | |
| "learning_rate": 2.6010966531054852e-06, | |
| "loss": 0.8026, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.4892703862660944, | |
| "grad_norm": 3.8667307700150446, | |
| "learning_rate": 2.5842542316648333e-06, | |
| "loss": 0.2596, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.49141630901287553, | |
| "grad_norm": 6.443730023062394, | |
| "learning_rate": 2.5674079809365443e-06, | |
| "loss": 0.288, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.49356223175965663, | |
| "grad_norm": 2.9440291648892987, | |
| "learning_rate": 2.550558666569279e-06, | |
| "loss": 0.1493, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.4957081545064378, | |
| "grad_norm": 4.462532793208141, | |
| "learning_rate": 2.533707054350938e-06, | |
| "loss": 0.1839, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.4978540772532189, | |
| "grad_norm": 4.9063569155392015, | |
| "learning_rate": 2.5168539101738576e-06, | |
| "loss": 0.2802, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 10.022637557306455, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.3349, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.5021459227467812, | |
| "grad_norm": 4.3433358033272, | |
| "learning_rate": 2.4831460898261428e-06, | |
| "loss": 0.2626, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.5042918454935622, | |
| "grad_norm": 9.069133083700263, | |
| "learning_rate": 2.4662929456490633e-06, | |
| "loss": 0.2204, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.5064377682403434, | |
| "grad_norm": 4.611418200226192, | |
| "learning_rate": 2.449441333430722e-06, | |
| "loss": 0.1723, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.5085836909871244, | |
| "grad_norm": 4.915343701063927, | |
| "learning_rate": 2.432592019063456e-06, | |
| "loss": 0.28, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.5107296137339056, | |
| "grad_norm": 4.878529367689273, | |
| "learning_rate": 2.415745768335167e-06, | |
| "loss": 0.3336, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.5128755364806867, | |
| "grad_norm": 5.589922591964495, | |
| "learning_rate": 2.398903346894515e-06, | |
| "loss": 0.2363, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.5150214592274678, | |
| "grad_norm": 4.07202657981208, | |
| "learning_rate": 2.3820655202161237e-06, | |
| "loss": 0.2744, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.5171673819742489, | |
| "grad_norm": 3.9798028023766703, | |
| "learning_rate": 2.365233053565787e-06, | |
| "loss": 0.2538, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.51931330472103, | |
| "grad_norm": 4.2713689569349, | |
| "learning_rate": 2.3484067119656905e-06, | |
| "loss": 0.2072, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.5214592274678111, | |
| "grad_norm": 4.132159038986101, | |
| "learning_rate": 2.331587260159641e-06, | |
| "loss": 0.2537, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.5236051502145923, | |
| "grad_norm": 4.7509182744825695, | |
| "learning_rate": 2.31477546257831e-06, | |
| "loss": 0.2475, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.5257510729613734, | |
| "grad_norm": 12.573882879618065, | |
| "learning_rate": 2.297972083304491e-06, | |
| "loss": 0.5451, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.5278969957081545, | |
| "grad_norm": 4.265194868971971, | |
| "learning_rate": 2.28117788603837e-06, | |
| "loss": 0.2038, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.5300429184549357, | |
| "grad_norm": 5.327237193336404, | |
| "learning_rate": 2.2643936340628205e-06, | |
| "loss": 0.2953, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.5321888412017167, | |
| "grad_norm": 3.830903874010895, | |
| "learning_rate": 2.24762009020871e-06, | |
| "loss": 0.2118, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.5343347639484979, | |
| "grad_norm": 3.692771359304331, | |
| "learning_rate": 2.2308580168202284e-06, | |
| "loss": 0.2862, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.5364806866952789, | |
| "grad_norm": 4.46004020199315, | |
| "learning_rate": 2.214108175720246e-06, | |
| "loss": 0.1485, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5386266094420601, | |
| "grad_norm": 4.194077533120119, | |
| "learning_rate": 2.197371328175681e-06, | |
| "loss": 0.18, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.5407725321888412, | |
| "grad_norm": 4.670466175503201, | |
| "learning_rate": 2.1806482348629065e-06, | |
| "loss": 0.2563, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.5429184549356223, | |
| "grad_norm": 11.620363098035735, | |
| "learning_rate": 2.1639396558331794e-06, | |
| "loss": 0.4257, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.5450643776824035, | |
| "grad_norm": 5.143937972622028, | |
| "learning_rate": 2.1472463504780893e-06, | |
| "loss": 0.2771, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.5472103004291845, | |
| "grad_norm": 4.790083924982343, | |
| "learning_rate": 2.1305690774950543e-06, | |
| "loss": 0.3618, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.5493562231759657, | |
| "grad_norm": 4.122103689477043, | |
| "learning_rate": 2.1139085948528286e-06, | |
| "loss": 0.1102, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.5515021459227468, | |
| "grad_norm": 4.807538148349604, | |
| "learning_rate": 2.097265659757064e-06, | |
| "loss": 0.2713, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.5536480686695279, | |
| "grad_norm": 3.2834156049564003, | |
| "learning_rate": 2.080641028615888e-06, | |
| "loss": 0.1608, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.555793991416309, | |
| "grad_norm": 4.843114090474319, | |
| "learning_rate": 2.064035457005526e-06, | |
| "loss": 0.2053, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.5579399141630901, | |
| "grad_norm": 4.187255961249243, | |
| "learning_rate": 2.0474496996359676e-06, | |
| "loss": 0.3847, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.5600858369098712, | |
| "grad_norm": 4.835958039304007, | |
| "learning_rate": 2.0308845103166555e-06, | |
| "loss": 0.2324, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.5622317596566524, | |
| "grad_norm": 4.612281021599447, | |
| "learning_rate": 2.0143406419222354e-06, | |
| "loss": 0.1589, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.5643776824034334, | |
| "grad_norm": 4.679769267914521, | |
| "learning_rate": 1.997818846358333e-06, | |
| "loss": 0.2734, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.5665236051502146, | |
| "grad_norm": 15.988595800455592, | |
| "learning_rate": 1.98131987452738e-06, | |
| "loss": 0.5932, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.5686695278969958, | |
| "grad_norm": 6.13615019112239, | |
| "learning_rate": 1.964844476294487e-06, | |
| "loss": 0.3048, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.5708154506437768, | |
| "grad_norm": 5.642706295976752, | |
| "learning_rate": 1.948393400453367e-06, | |
| "loss": 0.2217, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.572961373390558, | |
| "grad_norm": 4.00696562989142, | |
| "learning_rate": 1.9319673946922953e-06, | |
| "loss": 0.142, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.575107296137339, | |
| "grad_norm": 4.431532041750428, | |
| "learning_rate": 1.9155672055601364e-06, | |
| "loss": 0.2385, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.5772532188841202, | |
| "grad_norm": 4.7544298296222305, | |
| "learning_rate": 1.8991935784324048e-06, | |
| "loss": 0.2229, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.5793991416309013, | |
| "grad_norm": 14.806103600453497, | |
| "learning_rate": 1.882847257477398e-06, | |
| "loss": 0.6235, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.5815450643776824, | |
| "grad_norm": 3.608004765245334, | |
| "learning_rate": 1.8665289856223662e-06, | |
| "loss": 0.1837, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.5836909871244635, | |
| "grad_norm": 4.269748144395558, | |
| "learning_rate": 1.8502395045197522e-06, | |
| "loss": 0.1686, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.5858369098712446, | |
| "grad_norm": 3.4279537261830897, | |
| "learning_rate": 1.8339795545134814e-06, | |
| "loss": 0.176, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.5879828326180258, | |
| "grad_norm": 3.693149204798503, | |
| "learning_rate": 1.8177498746053129e-06, | |
| "loss": 0.2255, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.5901287553648069, | |
| "grad_norm": 7.0891066905956714, | |
| "learning_rate": 1.8015512024212573e-06, | |
| "loss": 0.3132, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.592274678111588, | |
| "grad_norm": 4.371558241354916, | |
| "learning_rate": 1.7853842741780474e-06, | |
| "loss": 0.2161, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.5944206008583691, | |
| "grad_norm": 3.7439703795742423, | |
| "learning_rate": 1.769249824649677e-06, | |
| "loss": 0.1681, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.5965665236051502, | |
| "grad_norm": 4.954905160767698, | |
| "learning_rate": 1.7531485871340122e-06, | |
| "loss": 0.2169, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.5987124463519313, | |
| "grad_norm": 4.509338306319588, | |
| "learning_rate": 1.7370812934194565e-06, | |
| "loss": 0.3037, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.6008583690987125, | |
| "grad_norm": 3.5630209392348315, | |
| "learning_rate": 1.7210486737516947e-06, | |
| "loss": 0.1491, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.6030042918454935, | |
| "grad_norm": 4.721486742009494, | |
| "learning_rate": 1.7050514568005072e-06, | |
| "loss": 0.2978, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.6051502145922747, | |
| "grad_norm": 4.438705208811552, | |
| "learning_rate": 1.6890903696266447e-06, | |
| "loss": 0.2172, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.6072961373390557, | |
| "grad_norm": 10.633440687114604, | |
| "learning_rate": 1.6731661376487923e-06, | |
| "loss": 0.2509, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.6094420600858369, | |
| "grad_norm": 9.519241352604435, | |
| "learning_rate": 1.6572794846105919e-06, | |
| "loss": 0.3126, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.6115879828326181, | |
| "grad_norm": 6.236116887316222, | |
| "learning_rate": 1.6414311325477567e-06, | |
| "loss": 0.3433, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.6137339055793991, | |
| "grad_norm": 6.652929433249715, | |
| "learning_rate": 1.6256218017552484e-06, | |
| "loss": 0.3924, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.6158798283261803, | |
| "grad_norm": 4.230578383590287, | |
| "learning_rate": 1.6098522107545426e-06, | |
| "loss": 0.1969, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.6180257510729614, | |
| "grad_norm": 4.584849008876823, | |
| "learning_rate": 1.594123076260975e-06, | |
| "loss": 0.1952, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.6201716738197425, | |
| "grad_norm": 10.774923368712571, | |
| "learning_rate": 1.578435113151166e-06, | |
| "loss": 0.3867, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.6223175965665236, | |
| "grad_norm": 3.7956827550236603, | |
| "learning_rate": 1.5627890344305256e-06, | |
| "loss": 0.1921, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.6244635193133047, | |
| "grad_norm": 2.7587416399153897, | |
| "learning_rate": 1.547185551200856e-06, | |
| "loss": 0.1014, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.6266094420600858, | |
| "grad_norm": 5.27854274924106, | |
| "learning_rate": 1.531625372628025e-06, | |
| "loss": 0.2008, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.628755364806867, | |
| "grad_norm": 8.635037982539608, | |
| "learning_rate": 1.5161092059097399e-06, | |
| "loss": 0.5101, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.630901287553648, | |
| "grad_norm": 3.3848075411641076, | |
| "learning_rate": 1.500637756243405e-06, | |
| "loss": 0.1936, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.6330472103004292, | |
| "grad_norm": 10.323449388666509, | |
| "learning_rate": 1.485211726794068e-06, | |
| "loss": 0.3156, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.6351931330472103, | |
| "grad_norm": 3.3898443020378823, | |
| "learning_rate": 1.469831818662467e-06, | |
| "loss": 0.4251, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.6373390557939914, | |
| "grad_norm": 5.429823091284252, | |
| "learning_rate": 1.4544987308531594e-06, | |
| "loss": 0.2863, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.6394849785407726, | |
| "grad_norm": 6.489533944140564, | |
| "learning_rate": 1.439213160242759e-06, | |
| "loss": 0.2321, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.6416309012875536, | |
| "grad_norm": 5.491165521890711, | |
| "learning_rate": 1.4239758015482607e-06, | |
| "loss": 0.5229, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.6437768240343348, | |
| "grad_norm": 10.960007456788807, | |
| "learning_rate": 1.4087873472954638e-06, | |
| "loss": 0.3474, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6437768240343348, | |
| "eval_loss": 1.4280835390090942, | |
| "eval_runtime": 0.5582, | |
| "eval_samples_per_second": 3.583, | |
| "eval_steps_per_second": 1.792, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6459227467811158, | |
| "grad_norm": 4.097125958239756, | |
| "learning_rate": 1.393648487787504e-06, | |
| "loss": 0.175, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.648068669527897, | |
| "grad_norm": 9.06788832499044, | |
| "learning_rate": 1.37855991107347e-06, | |
| "loss": 0.3413, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.6502145922746781, | |
| "grad_norm": 4.058418163873224, | |
| "learning_rate": 1.3635223029171418e-06, | |
| "loss": 0.1829, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.6523605150214592, | |
| "grad_norm": 148.2865695887899, | |
| "learning_rate": 1.3485363467658186e-06, | |
| "loss": 6.7839, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.6545064377682404, | |
| "grad_norm": 5.592326984498724, | |
| "learning_rate": 1.3336027237192551e-06, | |
| "loss": 0.2715, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.6566523605150214, | |
| "grad_norm": 11.675146672194217, | |
| "learning_rate": 1.3187221124987107e-06, | |
| "loss": 0.3927, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.6587982832618026, | |
| "grad_norm": 6.303487956163143, | |
| "learning_rate": 1.3038951894160962e-06, | |
| "loss": 0.3058, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.6609442060085837, | |
| "grad_norm": 3.5714075946161583, | |
| "learning_rate": 1.289122628343244e-06, | |
| "loss": 0.1626, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.6630901287553648, | |
| "grad_norm": 2.8795739712912747, | |
| "learning_rate": 1.2744051006812712e-06, | |
| "loss": 0.1164, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.6652360515021459, | |
| "grad_norm": 3.784768504884893, | |
| "learning_rate": 1.2597432753300753e-06, | |
| "loss": 0.2305, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.6673819742489271, | |
| "grad_norm": 5.42666962884753, | |
| "learning_rate": 1.245137818657924e-06, | |
| "loss": 0.2721, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.6695278969957081, | |
| "grad_norm": 5.933064715860253, | |
| "learning_rate": 1.2305893944711773e-06, | |
| "loss": 0.2627, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.6716738197424893, | |
| "grad_norm": 4.451323659006045, | |
| "learning_rate": 1.21609866398411e-06, | |
| "loss": 0.2969, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.6738197424892703, | |
| "grad_norm": 4.851157313459204, | |
| "learning_rate": 1.201666285788869e-06, | |
| "loss": 0.2446, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.6759656652360515, | |
| "grad_norm": 4.25532850589281, | |
| "learning_rate": 1.187292915825531e-06, | |
| "loss": 0.2488, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.6781115879828327, | |
| "grad_norm": 4.400762031022185, | |
| "learning_rate": 1.1729792073523e-06, | |
| "loss": 0.2463, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.6802575107296137, | |
| "grad_norm": 4.1661170844892785, | |
| "learning_rate": 1.1587258109158114e-06, | |
| "loss": 0.2479, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.6824034334763949, | |
| "grad_norm": 4.315876128019499, | |
| "learning_rate": 1.1445333743215648e-06, | |
| "loss": 0.3109, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.6845493562231759, | |
| "grad_norm": 4.507385647592385, | |
| "learning_rate": 1.1304025426044869e-06, | |
| "loss": 0.2711, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.6866952789699571, | |
| "grad_norm": 4.922745847528059, | |
| "learning_rate": 1.116333957999608e-06, | |
| "loss": 0.2441, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.6888412017167382, | |
| "grad_norm": 4.076334007452991, | |
| "learning_rate": 1.1023282599128797e-06, | |
| "loss": 0.2134, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.6909871244635193, | |
| "grad_norm": 4.671754190487534, | |
| "learning_rate": 1.0883860848921122e-06, | |
| "loss": 0.2048, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.6931330472103004, | |
| "grad_norm": 9.92777165374971, | |
| "learning_rate": 1.07450806659804e-06, | |
| "loss": 0.4585, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.6952789699570815, | |
| "grad_norm": 5.249124657109493, | |
| "learning_rate": 1.060694835775527e-06, | |
| "loss": 0.2441, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.6974248927038627, | |
| "grad_norm": 3.576374367164296, | |
| "learning_rate": 1.0469470202248976e-06, | |
| "loss": 0.2092, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.6995708154506438, | |
| "grad_norm": 4.1766033624138315, | |
| "learning_rate": 1.0332652447734057e-06, | |
| "loss": 0.2437, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.7017167381974249, | |
| "grad_norm": 4.5911217940771865, | |
| "learning_rate": 1.019650131246835e-06, | |
| "loss": 0.3378, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.703862660944206, | |
| "grad_norm": 5.049668031588922, | |
| "learning_rate": 1.006102298441236e-06, | |
| "loss": 0.276, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.7060085836909872, | |
| "grad_norm": 3.2474445516283366, | |
| "learning_rate": 9.926223620948061e-07, | |
| "loss": 0.1107, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.7081545064377682, | |
| "grad_norm": 12.87037417509136, | |
| "learning_rate": 9.792109348599036e-07, | |
| "loss": 0.3857, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.7103004291845494, | |
| "grad_norm": 5.089010214872758, | |
| "learning_rate": 9.65868626275198e-07, | |
| "loss": 0.3078, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.7124463519313304, | |
| "grad_norm": 12.594828602227423, | |
| "learning_rate": 9.525960427379772e-07, | |
| "loss": 0.3576, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.7145922746781116, | |
| "grad_norm": 13.530682640951623, | |
| "learning_rate": 9.393937874765754e-07, | |
| "loss": 0.5429, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.7167381974248928, | |
| "grad_norm": 3.963378157061749, | |
| "learning_rate": 9.262624605229673e-07, | |
| "loss": 0.2409, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.7188841201716738, | |
| "grad_norm": 3.7920088399022287, | |
| "learning_rate": 9.132026586854909e-07, | |
| "loss": 0.249, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.721030042918455, | |
| "grad_norm": 13.220884914625854, | |
| "learning_rate": 9.002149755217246e-07, | |
| "loss": 0.654, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.723175965665236, | |
| "grad_norm": 3.585564797999746, | |
| "learning_rate": 8.873000013115099e-07, | |
| "loss": 0.1466, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.7253218884120172, | |
| "grad_norm": 4.837559263261418, | |
| "learning_rate": 8.744583230301248e-07, | |
| "loss": 0.2767, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.7274678111587983, | |
| "grad_norm": 5.252715623197888, | |
| "learning_rate": 8.61690524321607e-07, | |
| "loss": 0.4616, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.7296137339055794, | |
| "grad_norm": 4.394456008160045, | |
| "learning_rate": 8.48997185472226e-07, | |
| "loss": 0.3365, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.7317596566523605, | |
| "grad_norm": 4.731413577745255, | |
| "learning_rate": 8.363788833841083e-07, | |
| "loss": 0.2681, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.7339055793991416, | |
| "grad_norm": 4.23884351278645, | |
| "learning_rate": 8.238361915490226e-07, | |
| "loss": 0.2179, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.7360515021459227, | |
| "grad_norm": 4.991832411605434, | |
| "learning_rate": 8.113696800223084e-07, | |
| "loss": 0.3074, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.7381974248927039, | |
| "grad_norm": 11.78247542725481, | |
| "learning_rate": 7.989799153969735e-07, | |
| "loss": 0.7113, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.740343347639485, | |
| "grad_norm": 2.677869163204128, | |
| "learning_rate": 7.866674607779401e-07, | |
| "loss": 0.1147, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.7424892703862661, | |
| "grad_norm": 15.72663219186909, | |
| "learning_rate": 7.744328757564501e-07, | |
| "loss": 0.8535, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.7446351931330472, | |
| "grad_norm": 5.679767488995843, | |
| "learning_rate": 7.622767163846376e-07, | |
| "loss": 0.2275, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.7467811158798283, | |
| "grad_norm": 4.066844515227611, | |
| "learning_rate": 7.501995351502497e-07, | |
| "loss": 0.3943, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.7489270386266095, | |
| "grad_norm": 5.645157777694975, | |
| "learning_rate": 7.38201880951544e-07, | |
| "loss": 0.4427, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.7510729613733905, | |
| "grad_norm": 4.956779129225836, | |
| "learning_rate": 7.26284299072334e-07, | |
| "loss": 0.202, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.7532188841201717, | |
| "grad_norm": 2.5935658433153566, | |
| "learning_rate": 7.144473311572136e-07, | |
| "loss": 0.1596, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.7553648068669528, | |
| "grad_norm": 4.01470514282122, | |
| "learning_rate": 7.026915151869335e-07, | |
| "loss": 0.1781, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.7575107296137339, | |
| "grad_norm": 4.6123159242382075, | |
| "learning_rate": 6.910173854539551e-07, | |
| "loss": 0.25, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.759656652360515, | |
| "grad_norm": 3.0887466278904703, | |
| "learning_rate": 6.794254725381641e-07, | |
| "loss": 0.2184, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.7618025751072961, | |
| "grad_norm": 4.1284793392534995, | |
| "learning_rate": 6.679163032827593e-07, | |
| "loss": 0.2396, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.7639484978540773, | |
| "grad_norm": 4.382346488786086, | |
| "learning_rate": 6.564904007703032e-07, | |
| "loss": 0.2136, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.7660944206008584, | |
| "grad_norm": 10.798704798244964, | |
| "learning_rate": 6.45148284298954e-07, | |
| "loss": 0.6572, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.7682403433476395, | |
| "grad_norm": 4.878129288407048, | |
| "learning_rate": 6.33890469358861e-07, | |
| "loss": 0.3525, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.7703862660944206, | |
| "grad_norm": 5.030026237930303, | |
| "learning_rate": 6.227174676087333e-07, | |
| "loss": 0.2312, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.7725321888412017, | |
| "grad_norm": 17.098271284659155, | |
| "learning_rate": 6.11629786852592e-07, | |
| "loss": 0.6886, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.7746781115879828, | |
| "grad_norm": 4.264109038299607, | |
| "learning_rate": 6.006279310166835e-07, | |
| "loss": 0.2899, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.776824034334764, | |
| "grad_norm": 4.9567417333092605, | |
| "learning_rate": 5.897124001265822e-07, | |
| "loss": 0.2594, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.778969957081545, | |
| "grad_norm": 4.793597034829966, | |
| "learning_rate": 5.788836902844633e-07, | |
| "loss": 0.2406, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.7811158798283262, | |
| "grad_norm": 5.338193032183525, | |
| "learning_rate": 5.681422936465522e-07, | |
| "loss": 0.3718, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.7832618025751072, | |
| "grad_norm": 4.677431437894929, | |
| "learning_rate": 5.574886984007602e-07, | |
| "loss": 0.1582, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.7854077253218884, | |
| "grad_norm": 5.490552521240752, | |
| "learning_rate": 5.469233887444941e-07, | |
| "loss": 0.3061, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.7875536480686696, | |
| "grad_norm": 3.324520040683454, | |
| "learning_rate": 5.36446844862653e-07, | |
| "loss": 0.416, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.7896995708154506, | |
| "grad_norm": 4.170197573866644, | |
| "learning_rate": 5.260595429058021e-07, | |
| "loss": 0.182, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.7918454935622318, | |
| "grad_norm": 15.271343898793598, | |
| "learning_rate": 5.1576195496853e-07, | |
| "loss": 0.971, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.7939914163090128, | |
| "grad_norm": 5.773601400585028, | |
| "learning_rate": 5.055545490679981e-07, | |
| "loss": 0.373, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.796137339055794, | |
| "grad_norm": 5.969373195356161, | |
| "learning_rate": 4.954377891226623e-07, | |
| "loss": 0.1698, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.7982832618025751, | |
| "grad_norm": 4.932322793806067, | |
| "learning_rate": 4.854121349311949e-07, | |
| "loss": 0.3279, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.8004291845493562, | |
| "grad_norm": 4.577118395812275, | |
| "learning_rate": 4.7547804215158476e-07, | |
| "loss": 0.2336, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.8025751072961373, | |
| "grad_norm": 4.347265571148098, | |
| "learning_rate": 4.6563596228042433e-07, | |
| "loss": 0.2708, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.8047210300429185, | |
| "grad_norm": 5.494640250591727, | |
| "learning_rate": 4.558863426323962e-07, | |
| "loss": 0.1907, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.8068669527896996, | |
| "grad_norm": 3.3572786766867844, | |
| "learning_rate": 4.462296263199381e-07, | |
| "loss": 0.2496, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.8090128755364807, | |
| "grad_norm": 5.900778507191783, | |
| "learning_rate": 4.366662522331053e-07, | |
| "loss": 0.348, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.8111587982832618, | |
| "grad_norm": 9.530668791644544, | |
| "learning_rate": 4.27196655019623e-07, | |
| "loss": 0.31, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.8133047210300429, | |
| "grad_norm": 11.51101766547589, | |
| "learning_rate": 4.1782126506513196e-07, | |
| "loss": 0.4048, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.8154506437768241, | |
| "grad_norm": 4.265212244249617, | |
| "learning_rate": 4.0854050847362966e-07, | |
| "loss": 0.3379, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.8175965665236051, | |
| "grad_norm": 4.266290446089351, | |
| "learning_rate": 3.9935480704810237e-07, | |
| "loss": 0.2869, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.8197424892703863, | |
| "grad_norm": 10.069859696254095, | |
| "learning_rate": 3.9026457827135324e-07, | |
| "loss": 0.3259, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.8218884120171673, | |
| "grad_norm": 4.789626593387076, | |
| "learning_rate": 3.812702352870321e-07, | |
| "loss": 0.2795, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.8240343347639485, | |
| "grad_norm": 7.071090188512615, | |
| "learning_rate": 3.723721868808533e-07, | |
| "loss": 0.3904, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.8261802575107297, | |
| "grad_norm": 6.396013480233416, | |
| "learning_rate": 3.6357083746202173e-07, | |
| "loss": 0.3435, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.8283261802575107, | |
| "grad_norm": 6.912201776722396, | |
| "learning_rate": 3.5486658704484977e-07, | |
| "loss": 0.1841, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.8304721030042919, | |
| "grad_norm": 3.7884598131328793, | |
| "learning_rate": 3.4625983123057624e-07, | |
| "loss": 0.1977, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.8326180257510729, | |
| "grad_norm": 4.539195321014333, | |
| "learning_rate": 3.3775096118939033e-07, | |
| "loss": 0.2573, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.8347639484978541, | |
| "grad_norm": 13.401579924542975, | |
| "learning_rate": 3.2934036364264845e-07, | |
| "loss": 0.4396, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.8369098712446352, | |
| "grad_norm": 3.4147384980259816, | |
| "learning_rate": 3.2102842084530293e-07, | |
| "loss": 0.1677, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.8390557939914163, | |
| "grad_norm": 12.280073964483028, | |
| "learning_rate": 3.128155105685243e-07, | |
| "loss": 0.6472, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.8412017167381974, | |
| "grad_norm": 14.537818479540036, | |
| "learning_rate": 3.0470200608253594e-07, | |
| "loss": 0.6278, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.8433476394849786, | |
| "grad_norm": 4.602724381634226, | |
| "learning_rate": 2.96688276139645e-07, | |
| "loss": 0.2403, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.8454935622317596, | |
| "grad_norm": 5.352137114560932, | |
| "learning_rate": 2.887746849574877e-07, | |
| "loss": 0.3369, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.8476394849785408, | |
| "grad_norm": 6.023327004803226, | |
| "learning_rate": 2.809615922024711e-07, | |
| "loss": 0.2251, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.8497854077253219, | |
| "grad_norm": 5.227321763313939, | |
| "learning_rate": 2.7324935297343146e-07, | |
| "loss": 0.3855, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.851931330472103, | |
| "grad_norm": 3.301210094777184, | |
| "learning_rate": 2.6563831778549015e-07, | |
| "loss": 0.1753, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.8540772532188842, | |
| "grad_norm": 3.8623722681440125, | |
| "learning_rate": 2.5812883255412704e-07, | |
| "loss": 0.1972, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.8562231759656652, | |
| "grad_norm": 11.977134679582875, | |
| "learning_rate": 2.5072123857945773e-07, | |
| "loss": 0.4452, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.8583690987124464, | |
| "grad_norm": 11.01382449808334, | |
| "learning_rate": 2.4341587253072035e-07, | |
| "loss": 0.365, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.8583690987124464, | |
| "eval_loss": 1.3694226741790771, | |
| "eval_runtime": 0.5595, | |
| "eval_samples_per_second": 3.575, | |
| "eval_steps_per_second": 1.787, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.8605150214592274, | |
| "grad_norm": 5.5777813351540315, | |
| "learning_rate": 2.3621306643097613e-07, | |
| "loss": 0.2782, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.8626609442060086, | |
| "grad_norm": 4.6116367037678785, | |
| "learning_rate": 2.2911314764201775e-07, | |
| "loss": 0.3513, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.8648068669527897, | |
| "grad_norm": 4.483486905044895, | |
| "learning_rate": 2.221164388494923e-07, | |
| "loss": 0.4313, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.8669527896995708, | |
| "grad_norm": 4.272277428586251, | |
| "learning_rate": 2.1522325804823496e-07, | |
| "loss": 0.2204, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.869098712446352, | |
| "grad_norm": 3.896674815860478, | |
| "learning_rate": 2.0843391852781558e-07, | |
| "loss": 0.2286, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.871244635193133, | |
| "grad_norm": 11.404612671959441, | |
| "learning_rate": 2.0174872885830117e-07, | |
| "loss": 0.5745, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.8733905579399142, | |
| "grad_norm": 5.761128589810944, | |
| "learning_rate": 1.9516799287622984e-07, | |
| "loss": 0.2869, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.8755364806866953, | |
| "grad_norm": 3.268320125330955, | |
| "learning_rate": 1.8869200967080503e-07, | |
| "loss": 0.1103, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.8776824034334764, | |
| "grad_norm": 4.38842912004494, | |
| "learning_rate": 1.8232107357029877e-07, | |
| "loss": 0.2272, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.8798283261802575, | |
| "grad_norm": 4.643796217859636, | |
| "learning_rate": 1.7605547412867574e-07, | |
| "loss": 0.184, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.8819742489270386, | |
| "grad_norm": 4.90732488089375, | |
| "learning_rate": 1.6989549611243412e-07, | |
| "loss": 0.3627, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.8841201716738197, | |
| "grad_norm": 12.17579771573891, | |
| "learning_rate": 1.638414194876617e-07, | |
| "loss": 0.5054, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.8862660944206009, | |
| "grad_norm": 7.752186615318354, | |
| "learning_rate": 1.5789351940731334e-07, | |
| "loss": 0.363, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.8884120171673819, | |
| "grad_norm": 4.5450019220648, | |
| "learning_rate": 1.520520661987049e-07, | |
| "loss": 0.3259, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.8905579399141631, | |
| "grad_norm": 5.488305915443275, | |
| "learning_rate": 1.463173253512251e-07, | |
| "loss": 0.3395, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.8927038626609443, | |
| "grad_norm": 7.094268562494252, | |
| "learning_rate": 1.406895575042727e-07, | |
| "loss": 0.295, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.8948497854077253, | |
| "grad_norm": 4.7154597836682965, | |
| "learning_rate": 1.3516901843540876e-07, | |
| "loss": 0.2653, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.8969957081545065, | |
| "grad_norm": 11.321571989329978, | |
| "learning_rate": 1.2975595904873073e-07, | |
| "loss": 0.4694, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.8991416309012875, | |
| "grad_norm": 4.452790240381536, | |
| "learning_rate": 1.2445062536347057e-07, | |
| "loss": 0.3792, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.9012875536480687, | |
| "grad_norm": 4.044178243939312, | |
| "learning_rate": 1.1925325850281416e-07, | |
| "loss": 0.2857, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.9034334763948498, | |
| "grad_norm": 5.509954869782006, | |
| "learning_rate": 1.1416409468293977e-07, | |
| "loss": 0.3078, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.9055793991416309, | |
| "grad_norm": 4.69558598631356, | |
| "learning_rate": 1.0918336520228474e-07, | |
| "loss": 0.335, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.907725321888412, | |
| "grad_norm": 4.4538465310945385, | |
| "learning_rate": 1.0431129643103193e-07, | |
| "loss": 0.2351, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.9098712446351931, | |
| "grad_norm": 5.374815587193687, | |
| "learning_rate": 9.954810980082191e-08, | |
| "loss": 0.2546, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.9120171673819742, | |
| "grad_norm": 11.52228630231847, | |
| "learning_rate": 9.489402179468754e-08, | |
| "loss": 0.7185, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.9141630901287554, | |
| "grad_norm": 3.884298143154326, | |
| "learning_rate": 9.034924393721778e-08, | |
| "loss": 0.2261, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.9163090128755365, | |
| "grad_norm": 12.582223724723782, | |
| "learning_rate": 8.5913982784942e-08, | |
| "loss": 0.5048, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.9184549356223176, | |
| "grad_norm": 4.909917760044249, | |
| "learning_rate": 8.15884399169417e-08, | |
| "loss": 0.1981, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.9206008583690987, | |
| "grad_norm": 4.336475429929087, | |
| "learning_rate": 7.737281192569169e-08, | |
| "loss": 0.2551, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.9227467811158798, | |
| "grad_norm": 3.489835127648909, | |
| "learning_rate": 7.326729040812136e-08, | |
| "loss": 0.1921, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.924892703862661, | |
| "grad_norm": 4.877570803828318, | |
| "learning_rate": 6.927206195691039e-08, | |
| "loss": 0.2177, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.927038626609442, | |
| "grad_norm": 4.717548019415604, | |
| "learning_rate": 6.538730815200483e-08, | |
| "loss": 0.147, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.9291845493562232, | |
| "grad_norm": 14.494510709152369, | |
| "learning_rate": 6.1613205552368e-08, | |
| "loss": 0.6247, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.9313304721030042, | |
| "grad_norm": 3.5697709692346677, | |
| "learning_rate": 5.79499256879526e-08, | |
| "loss": 0.1588, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.9334763948497854, | |
| "grad_norm": 5.135679478772209, | |
| "learning_rate": 5.4397635051907093e-08, | |
| "loss": 0.4196, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.9356223175965666, | |
| "grad_norm": 5.329997658405359, | |
| "learning_rate": 5.095649509300804e-08, | |
| "loss": 0.2701, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.9377682403433476, | |
| "grad_norm": 3.760523480520758, | |
| "learning_rate": 4.7626662208322405e-08, | |
| "loss": 0.0939, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.9399141630901288, | |
| "grad_norm": 5.291669505677406, | |
| "learning_rate": 4.4408287736099344e-08, | |
| "loss": 0.3128, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.9420600858369099, | |
| "grad_norm": 11.680854749305373, | |
| "learning_rate": 4.130151794889181e-08, | |
| "loss": 0.3982, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.944206008583691, | |
| "grad_norm": 12.506267427980399, | |
| "learning_rate": 3.830649404690939e-08, | |
| "loss": 0.6572, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.9463519313304721, | |
| "grad_norm": 4.533062512569462, | |
| "learning_rate": 3.5423352151599534e-08, | |
| "loss": 0.2257, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.9484978540772532, | |
| "grad_norm": 3.8785495849634173, | |
| "learning_rate": 3.2652223299462214e-08, | |
| "loss": 0.217, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.9506437768240343, | |
| "grad_norm": 6.192440279403632, | |
| "learning_rate": 2.9993233436093895e-08, | |
| "loss": 0.362, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.9527896995708155, | |
| "grad_norm": 4.267433754596404, | |
| "learning_rate": 2.7446503410463178e-08, | |
| "loss": 0.2044, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.9549356223175965, | |
| "grad_norm": 5.592239506957767, | |
| "learning_rate": 2.5012148969419113e-08, | |
| "loss": 0.3607, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.9570815450643777, | |
| "grad_norm": 12.403767416603346, | |
| "learning_rate": 2.2690280752429293e-08, | |
| "loss": 0.5623, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.9592274678111588, | |
| "grad_norm": 4.281323531138287, | |
| "learning_rate": 2.0481004286552753e-08, | |
| "loss": 0.2902, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.9613733905579399, | |
| "grad_norm": 2.903934981764517, | |
| "learning_rate": 1.8384419981642698e-08, | |
| "loss": 0.1355, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.9635193133047211, | |
| "grad_norm": 8.89203528557412, | |
| "learning_rate": 1.6400623125784053e-08, | |
| "loss": 0.3686, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.9656652360515021, | |
| "grad_norm": 4.562592593974174, | |
| "learning_rate": 1.452970388096192e-08, | |
| "loss": 0.1926, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.9678111587982833, | |
| "grad_norm": 4.534339307707502, | |
| "learning_rate": 1.2771747278963464e-08, | |
| "loss": 0.3514, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.9699570815450643, | |
| "grad_norm": 4.556632192151679, | |
| "learning_rate": 1.1126833217514898e-08, | |
| "loss": 0.2418, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.9721030042918455, | |
| "grad_norm": 9.909433789309341, | |
| "learning_rate": 9.595036456648277e-09, | |
| "loss": 0.3205, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.9742489270386266, | |
| "grad_norm": 3.9689055222382374, | |
| "learning_rate": 8.176426615304767e-09, | |
| "loss": 0.1813, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.9763948497854077, | |
| "grad_norm": 5.3691676448201155, | |
| "learning_rate": 6.871068168170237e-09, | |
| "loss": 0.2227, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.9785407725321889, | |
| "grad_norm": 3.1063248198160824, | |
| "learning_rate": 5.679020442745098e-09, | |
| "loss": 0.1993, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.98068669527897, | |
| "grad_norm": 3.8439436507436415, | |
| "learning_rate": 4.600337616648131e-09, | |
| "loss": 0.2025, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.9828326180257511, | |
| "grad_norm": 3.4608789139316882, | |
| "learning_rate": 3.6350687151531782e-09, | |
| "loss": 0.1106, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.9849785407725322, | |
| "grad_norm": 4.432907889303436, | |
| "learning_rate": 2.7832576089623086e-09, | |
| "loss": 0.181, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.9871244635193133, | |
| "grad_norm": 5.815988851277571, | |
| "learning_rate": 2.044943012210754e-09, | |
| "loss": 0.3236, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.9892703862660944, | |
| "grad_norm": 4.622989507343908, | |
| "learning_rate": 1.4201584807083113e-09, | |
| "loss": 0.2848, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.9914163090128756, | |
| "grad_norm": 4.812640379426997, | |
| "learning_rate": 9.08932410413621e-10, | |
| "loss": 0.2791, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.9935622317596566, | |
| "grad_norm": 11.627724270489814, | |
| "learning_rate": 5.112880361438088e-10, | |
| "loss": 0.7668, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.9957081545064378, | |
| "grad_norm": 3.8724681206038953, | |
| "learning_rate": 2.2724343051866438e-10, | |
| "loss": 0.1976, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.9978540772532188, | |
| "grad_norm": 6.521739917424491, | |
| "learning_rate": 5.681150313907591e-11, | |
| "loss": 0.2869, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 3.883799871571989, | |
| "learning_rate": 0.0, | |
| "loss": 0.1588, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 466, | |
| "total_flos": 4281709953024.0, | |
| "train_loss": 0.34951822290220996, | |
| "train_runtime": 959.7413, | |
| "train_samples_per_second": 1.942, | |
| "train_steps_per_second": 0.486 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 466, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 2000000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4281709953024.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |