LeTue09 committed on Feb 12

Commit

0d26333

verified ·

1 Parent(s): 1e68b37

Upload folder using huggingface_hub

Browse files

Files changed (20) hide show

.gitattributes +1 -0
added_tokens.json +24 -0
chat_template.jinja +54 -0
config.json +58 -0
generation_config.json +13 -0
merges.txt +0 -0
model-00001-of-00007.safetensors +3 -0
model-00002-of-00007.safetensors +3 -0
model-00003-of-00007.safetensors +3 -0
model-00004-of-00007.safetensors +3 -0
model-00005-of-00007.safetensors +3 -0
model-00006-of-00007.safetensors +3 -0
model-00007-of-00007.safetensors +3 -0
model.safetensors.index.json +347 -0
special_tokens_map.json +31 -0
tokenizer.json +3 -0
tokenizer_config.json +207 -0
trainer_state.json +1404 -0
training_args.bin +3 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "dtype": "float32",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.57.1",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "4.57.1"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model-00001-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dcd99ecbbec307e344ed9413cea2a7ebaf3774f48c5627828de518727cd6a240
+size 4976687216

model-00002-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7cd2b8cbbe1bb9d5852b5206b048e46a601be2558433b9f77996d180c5f85604
+size 4778622352

model-00003-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b20be39887ce0a7e2eae9bb997953c9ace0bac3e1258a789d12ac573d2dae6f7
+size 4932743960

model-00004-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d9a34111297daed13a14d565856a28e4b9e1df3eeea154a66f8fafafa2236059
+size 4932743992

model-00005-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ceb2341410afd3c8b9330e59587468854a50f996f7a8ca4d6c0708c53d7e68ea
+size 4998852296

model-00006-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37bb68cef8737131ac7c86a184ed2fff0b66d76f555ae81510f3e564978adc7e
+size 3662865184

model-00007-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b54a403826df6c856ee5b59c9fc5246a25b4b97795f0c7b2bb007cc2566f669f
+size 2179989632

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,347 @@

+{
+  "metadata": {
+    "total_parameters": 951952064,
+    "total_size": 30462466048
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00007-of-00007.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.13.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.18.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.19.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.19.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.19.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.24.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.24.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.norm.weight": "model-00006-of-00007.safetensors"
+  }
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1404 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 1370,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 0.8819891855120658,
+      "epoch": 0.014603870025556773,
+      "grad_norm": 0.8598580956459045,
+      "learning_rate": 1.7475728155339808e-06,
+      "loss": 0.8448,
+      "mean_token_accuracy": 0.7740211486816406,
+      "num_tokens": 770936.0,
+      "step": 10
+    },
+    {
+      "entropy": 0.9709278956055641,
+      "epoch": 0.029207740051113547,
+      "grad_norm": 0.615368127822876,
+      "learning_rate": 3.689320388349515e-06,
+      "loss": 0.8205,
+      "mean_token_accuracy": 0.7773708969354629,
+      "num_tokens": 1541367.0,
+      "step": 20
+    },
+    {
+      "entropy": 1.0815545290708541,
+      "epoch": 0.04381161007667032,
+      "grad_norm": 0.6463781595230103,
+      "learning_rate": 5.631067961165049e-06,
+      "loss": 0.7713,
+      "mean_token_accuracy": 0.7880979612469673,
+      "num_tokens": 2318852.0,
+      "step": 30
+    },
+    {
+      "entropy": 1.0128332734107972,
+      "epoch": 0.058415480102227094,
+      "grad_norm": 0.5948992371559143,
+      "learning_rate": 7.572815533980583e-06,
+      "loss": 0.7764,
+      "mean_token_accuracy": 0.7841672986745835,
+      "num_tokens": 3084308.0,
+      "step": 40
+    },
+    {
+      "entropy": 1.012282955646515,
+      "epoch": 0.07301935012778386,
+      "grad_norm": 0.6310043931007385,
+      "learning_rate": 9.514563106796117e-06,
+      "loss": 0.7564,
+      "mean_token_accuracy": 0.7881451666355133,
+      "num_tokens": 3834841.0,
+      "step": 50
+    },
+    {
+      "entropy": 1.0291071116924286,
+      "epoch": 0.08762322015334063,
+      "grad_norm": 0.6180324554443359,
+      "learning_rate": 1.145631067961165e-05,
+      "loss": 0.7502,
+      "mean_token_accuracy": 0.7902129292488098,
+      "num_tokens": 4596954.0,
+      "step": 60
+    },
+    {
+      "entropy": 1.0330789506435394,
+      "epoch": 0.1022270901788974,
+      "grad_norm": 0.6203945279121399,
+      "learning_rate": 1.3398058252427187e-05,
+      "loss": 0.7596,
+      "mean_token_accuracy": 0.7884851574897767,
+      "num_tokens": 5367587.0,
+      "step": 70
+    },
+    {
+      "entropy": 1.0233020395040513,
+      "epoch": 0.11683096020445419,
+      "grad_norm": 0.5948684811592102,
+      "learning_rate": 1.533980582524272e-05,
+      "loss": 0.7538,
+      "mean_token_accuracy": 0.788135002553463,
+      "num_tokens": 6132539.0,
+      "step": 80
+    },
+    {
+      "entropy": 1.022057643532753,
+      "epoch": 0.13143483023001096,
+      "grad_norm": 0.6073455810546875,
+      "learning_rate": 1.7281553398058253e-05,
+      "loss": 0.7418,
+      "mean_token_accuracy": 0.7916493251919746,
+      "num_tokens": 6910021.0,
+      "step": 90
+    },
+    {
+      "entropy": 1.0418485432863236,
+      "epoch": 0.14603870025556773,
+      "grad_norm": 0.56880784034729,
+      "learning_rate": 1.922330097087379e-05,
+      "loss": 0.7686,
+      "mean_token_accuracy": 0.7854298010468483,
+      "num_tokens": 7672070.0,
+      "step": 100
+    },
+    {
+      "entropy": 1.0249923586845398,
+      "epoch": 0.1606425702811245,
+      "grad_norm": 0.6060709357261658,
+      "learning_rate": 1.9999839020034848e-05,
+      "loss": 0.7376,
+      "mean_token_accuracy": 0.7922860443592071,
+      "num_tokens": 8437851.0,
+      "step": 110
+    },
+    {
+      "entropy": 1.0423746079206466,
+      "epoch": 0.17524644030668127,
+      "grad_norm": 0.5950225591659546,
+      "learning_rate": 1.9998855272350457e-05,
+      "loss": 0.75,
+      "mean_token_accuracy": 0.7883219286799431,
+      "num_tokens": 9220959.0,
+      "step": 120
+    },
+    {
+      "entropy": 1.058434711396694,
+      "epoch": 0.18985031033223804,
+      "grad_norm": 0.5805737972259521,
+      "learning_rate": 1.99969772981684e-05,
+      "loss": 0.754,
+      "mean_token_accuracy": 0.7883959293365479,
+      "num_tokens": 9998137.0,
+      "step": 130
+    },
+    {
+      "entropy": 1.055925799906254,
+      "epoch": 0.2044541803577948,
+      "grad_norm": 0.5844792723655701,
+      "learning_rate": 1.9994205265441328e-05,
+      "loss": 0.7443,
+      "mean_token_accuracy": 0.7903072774410248,
+      "num_tokens": 10776402.0,
+      "step": 140
+    },
+    {
+      "entropy": 1.0465368673205375,
+      "epoch": 0.21905805038335158,
+      "grad_norm": 0.6346914172172546,
+      "learning_rate": 1.9990539422080134e-05,
+      "loss": 0.7423,
+      "mean_token_accuracy": 0.7909073531627655,
+      "num_tokens": 11537876.0,
+      "step": 150
+    },
+    {
+      "entropy": 1.0467117950320244,
+      "epoch": 0.23366192040890837,
+      "grad_norm": 0.555743396282196,
+      "learning_rate": 1.9985980095931774e-05,
+      "loss": 0.7426,
+      "mean_token_accuracy": 0.7902399882674217,
+      "num_tokens": 12313854.0,
+      "step": 160
+    },
+    {
+      "entropy": 1.0474098443984985,
+      "epoch": 0.24826579043446514,
+      "grad_norm": 0.6585262417793274,
+      "learning_rate": 1.9980527694749952e-05,
+      "loss": 0.7502,
+      "mean_token_accuracy": 0.7889217540621758,
+      "num_tokens": 13059556.0,
+      "step": 170
+    },
+    {
+      "entropy": 1.046202352643013,
+      "epoch": 0.2628696604600219,
+      "grad_norm": 0.5887473225593567,
+      "learning_rate": 1.9974182706158646e-05,
+      "loss": 0.7287,
+      "mean_token_accuracy": 0.7938704118132591,
+      "num_tokens": 13829911.0,
+      "step": 180
+    },
+    {
+      "entropy": 1.0543677926063537,
+      "epoch": 0.27747353048557866,
+      "grad_norm": 0.5907500982284546,
+      "learning_rate": 1.996694569760851e-05,
+      "loss": 0.7465,
+      "mean_token_accuracy": 0.790279072523117,
+      "num_tokens": 14595687.0,
+      "step": 190
+    },
+    {
+      "entropy": 1.059071497619152,
+      "epoch": 0.29207740051113545,
+      "grad_norm": 0.5858122706413269,
+      "learning_rate": 1.995881731632611e-05,
+      "loss": 0.7376,
+      "mean_token_accuracy": 0.7931964993476868,
+      "num_tokens": 15362920.0,
+      "step": 200
+    },
+    {
+      "entropy": 1.0532679110765457,
+      "epoch": 0.3066812705366922,
+      "grad_norm": 0.5991299152374268,
+      "learning_rate": 1.9949798289256054e-05,
+      "loss": 0.7578,
+      "mean_token_accuracy": 0.786410291492939,
+      "num_tokens": 16128067.0,
+      "step": 210
+    },
+    {
+      "entropy": 1.0485742643475533,
+      "epoch": 0.321285140562249,
+      "grad_norm": 0.5720910429954529,
+      "learning_rate": 1.993988942299598e-05,
+      "loss": 0.7404,
+      "mean_token_accuracy": 0.7914015546441078,
+      "num_tokens": 16911462.0,
+      "step": 220
+    },
+    {
+      "entropy": 1.0678794473409652,
+      "epoch": 0.3358890105878058,
+      "grad_norm": 0.6100424528121948,
+      "learning_rate": 1.9929091603724404e-05,
+      "loss": 0.7479,
+      "mean_token_accuracy": 0.7890788897871971,
+      "num_tokens": 17697397.0,
+      "step": 230
+    },
+    {
+      "entropy": 1.0550393044948578,
+      "epoch": 0.35049288061336253,
+      "grad_norm": 0.5960067510604858,
+      "learning_rate": 1.9917405797121484e-05,
+      "loss": 0.7518,
+      "mean_token_accuracy": 0.7884634032845497,
+      "num_tokens": 18465317.0,
+      "step": 240
+    },
+    {
+      "entropy": 1.0647842451930045,
+      "epoch": 0.36509675063891933,
+      "grad_norm": 0.61286461353302,
+      "learning_rate": 1.990483304828264e-05,
+      "loss": 0.7568,
+      "mean_token_accuracy": 0.7871849820017814,
+      "num_tokens": 19218071.0,
+      "step": 250
+    },
+    {
+      "entropy": 1.02879488915205,
+      "epoch": 0.3797006206644761,
+      "grad_norm": 0.5549123883247375,
+      "learning_rate": 1.9891374481625112e-05,
+      "loss": 0.7221,
+      "mean_token_accuracy": 0.7956910878419876,
+      "num_tokens": 19985551.0,
+      "step": 260
+    },
+    {
+      "entropy": 1.0511873975396155,
+      "epoch": 0.39430449069003287,
+      "grad_norm": 0.542924702167511,
+      "learning_rate": 1.987703130078737e-05,
+      "loss": 0.7303,
+      "mean_token_accuracy": 0.7930067017674446,
+      "num_tokens": 20759854.0,
+      "step": 270
+    },
+    {
+      "entropy": 1.046246202290058,
+      "epoch": 0.4089083607155896,
+      "grad_norm": 0.5551313161849976,
+      "learning_rate": 1.986180478852149e-05,
+      "loss": 0.7252,
+      "mean_token_accuracy": 0.7956920295953751,
+      "num_tokens": 21543604.0,
+      "step": 280
+    },
+    {
+      "entropy": 1.0610669255256653,
+      "epoch": 0.4235122307411464,
+      "grad_norm": 0.629205048084259,
+      "learning_rate": 1.9845696306578433e-05,
+      "loss": 0.7377,
+      "mean_token_accuracy": 0.7924546420574188,
+      "num_tokens": 22310802.0,
+      "step": 290
+    },
+    {
+      "entropy": 1.0590673327445983,
+      "epoch": 0.43811610076670315,
+      "grad_norm": 0.6042408347129822,
+      "learning_rate": 1.9828707295586253e-05,
+      "loss": 0.7356,
+      "mean_token_accuracy": 0.7922392532229423,
+      "num_tokens": 23088849.0,
+      "step": 300
+    },
+    {
+      "entropy": 1.059952473640442,
+      "epoch": 0.45271997079225995,
+      "grad_norm": 0.6013913750648499,
+      "learning_rate": 1.981083927492125e-05,
+      "loss": 0.7379,
+      "mean_token_accuracy": 0.7928509280085564,
+      "num_tokens": 23858610.0,
+      "step": 310
+    },
+    {
+      "entropy": 1.0460969746112823,
+      "epoch": 0.46732384081781675,
+      "grad_norm": 0.5834254622459412,
+      "learning_rate": 1.9792093842572106e-05,
+      "loss": 0.7335,
+      "mean_token_accuracy": 0.7924273937940598,
+      "num_tokens": 24590316.0,
+      "step": 320
+    },
+    {
+      "entropy": 1.0532741829752923,
+      "epoch": 0.4819277108433735,
+      "grad_norm": 0.5749837160110474,
+      "learning_rate": 1.9772472674996962e-05,
+      "loss": 0.746,
+      "mean_token_accuracy": 0.789176419377327,
+      "num_tokens": 25353089.0,
+      "step": 330
+    },
+    {
+      "entropy": 1.0500716269016266,
+      "epoch": 0.4965315808689303,
+      "grad_norm": 0.5746060609817505,
+      "learning_rate": 1.975197752697349e-05,
+      "loss": 0.7414,
+      "mean_token_accuracy": 0.7907495066523552,
+      "num_tokens": 26117777.0,
+      "step": 340
+    },
+    {
+      "entropy": 1.051976852118969,
+      "epoch": 0.5111354508944871,
+      "grad_norm": 0.5868931412696838,
+      "learning_rate": 1.973061023144196e-05,
+      "loss": 0.7256,
+      "mean_token_accuracy": 0.7959051370620728,
+      "num_tokens": 26885934.0,
+      "step": 350
+    },
+    {
+      "entropy": 1.078227588534355,
+      "epoch": 0.5257393209200438,
+      "grad_norm": 0.5909886360168457,
+      "learning_rate": 1.9708372699341297e-05,
+      "loss": 0.7568,
+      "mean_token_accuracy": 0.7874275401234627,
+      "num_tokens": 27648234.0,
+      "step": 360
+    },
+    {
+      "entropy": 1.0532238066196442,
+      "epoch": 0.5403431909456006,
+      "grad_norm": 0.5472894310951233,
+      "learning_rate": 1.9685266919438208e-05,
+      "loss": 0.7351,
+      "mean_token_accuracy": 0.7938356637954712,
+      "num_tokens": 28411776.0,
+      "step": 370
+    },
+    {
+      "entropy": 1.0588267534971236,
+      "epoch": 0.5549470609711573,
+      "grad_norm": 0.5929501056671143,
+      "learning_rate": 1.9661294958149312e-05,
+      "loss": 0.745,
+      "mean_token_accuracy": 0.7911377623677254,
+      "num_tokens": 29169804.0,
+      "step": 380
+    },
+    {
+      "entropy": 1.0762585639953612,
+      "epoch": 0.5695509309967142,
+      "grad_norm": 0.6019272804260254,
+      "learning_rate": 1.963645895935632e-05,
+      "loss": 0.7458,
+      "mean_token_accuracy": 0.7898744881153107,
+      "num_tokens": 29942397.0,
+      "step": 390
+    },
+    {
+      "entropy": 1.0671012222766876,
+      "epoch": 0.5841548010222709,
+      "grad_norm": 0.6131295561790466,
+      "learning_rate": 1.9610761144214307e-05,
+      "loss": 0.7547,
+      "mean_token_accuracy": 0.7888392016291619,
+      "num_tokens": 30737478.0,
+      "step": 400
+    },
+    {
+      "entropy": 1.0662592992186546,
+      "epoch": 0.5987586710478277,
+      "grad_norm": 0.579192578792572,
+      "learning_rate": 1.9584203810953094e-05,
+      "loss": 0.731,
+      "mean_token_accuracy": 0.7930623203516006,
+      "num_tokens": 31487239.0,
+      "step": 410
+    },
+    {
+      "entropy": 1.0672000512480735,
+      "epoch": 0.6133625410733844,
+      "grad_norm": 0.5550879836082458,
+      "learning_rate": 1.9556789334671668e-05,
+      "loss": 0.7395,
+      "mean_token_accuracy": 0.792091254889965,
+      "num_tokens": 32248396.0,
+      "step": 420
+    },
+    {
+      "entropy": 1.0656457453966142,
+      "epoch": 0.6279664110989412,
+      "grad_norm": 1.1255723237991333,
+      "learning_rate": 1.9528520167125803e-05,
+      "loss": 0.7395,
+      "mean_token_accuracy": 0.7922166392207146,
+      "num_tokens": 33011134.0,
+      "step": 430
+    },
+    {
+      "entropy": 1.0731883138418197,
+      "epoch": 0.642570281124498,
+      "grad_norm": 0.5882017016410828,
+      "learning_rate": 1.9499398836508776e-05,
+      "loss": 0.7493,
+      "mean_token_accuracy": 0.7877946421504021,
+      "num_tokens": 33787966.0,
+      "step": 440
+    },
+    {
+      "entropy": 1.0552370175719261,
+      "epoch": 0.6571741511500547,
+      "grad_norm": 0.6130959987640381,
+      "learning_rate": 1.9469427947225267e-05,
+      "loss": 0.7111,
+      "mean_token_accuracy": 0.7995760783553123,
+      "num_tokens": 34559268.0,
+      "step": 450
+    },
+    {
+      "entropy": 1.069392178952694,
+      "epoch": 0.6717780211756116,
+      "grad_norm": 0.5893939733505249,
+      "learning_rate": 1.9438610179658447e-05,
+      "loss": 0.7364,
+      "mean_token_accuracy": 0.7937344864010811,
+      "num_tokens": 35306495.0,
+      "step": 460
+    },
+    {
+      "entropy": 1.0607215002179147,
+      "epoch": 0.6863818912011683,
+      "grad_norm": 0.5958935022354126,
+      "learning_rate": 1.9406948289930247e-05,
+      "loss": 0.7358,
+      "mean_token_accuracy": 0.7919678211212158,
+      "num_tokens": 36064111.0,
+      "step": 470
+    },
+    {
+      "entropy": 1.079184153676033,
+      "epoch": 0.7009857612267251,
+      "grad_norm": 0.5986253619194031,
+      "learning_rate": 1.9374445109654888e-05,
+      "loss": 0.7421,
+      "mean_token_accuracy": 0.7907416477799416,
+      "num_tokens": 36832765.0,
+      "step": 480
+    },
+    {
+      "entropy": 1.0671533614397049,
+      "epoch": 0.7155896312522818,
+      "grad_norm": 0.5463415384292603,
+      "learning_rate": 1.9341103545685637e-05,
+      "loss": 0.7304,
+      "mean_token_accuracy": 0.7941543251276016,
+      "num_tokens": 37609612.0,
+      "step": 490
+    },
+    {
+      "entropy": 1.0800618380308151,
+      "epoch": 0.7301935012778387,
+      "grad_norm": 0.7118626832962036,
+      "learning_rate": 1.930692657985482e-05,
+      "loss": 0.7289,
+      "mean_token_accuracy": 0.7952305421233177,
+      "num_tokens": 38378623.0,
+      "step": 500
+    },
+    {
+      "entropy": 1.051339966058731,
+      "epoch": 0.7447973713033954,
+      "grad_norm": 0.5645574331283569,
+      "learning_rate": 1.927191726870718e-05,
+      "loss": 0.7091,
+      "mean_token_accuracy": 0.7985311895608902,
+      "num_tokens": 39146551.0,
+      "step": 510
+    },
+    {
+      "entropy": 1.0769214510917664,
+      "epoch": 0.7594012413289521,
+      "grad_norm": 0.6002981066703796,
+      "learning_rate": 1.9236078743226502e-05,
+      "loss": 0.7288,
+      "mean_token_accuracy": 0.7942462310194969,
+      "num_tokens": 39908848.0,
+      "step": 520
+    },
+    {
+      "entropy": 1.0461754769086837,
+      "epoch": 0.7740051113545089,
+      "grad_norm": 0.5525550246238708,
+      "learning_rate": 1.919941420855559e-05,
+      "loss": 0.7058,
+      "mean_token_accuracy": 0.8000339075922966,
+      "num_tokens": 40694755.0,
+      "step": 530
+    },
+    {
+      "entropy": 1.0753471747040748,
+      "epoch": 0.7886089813800657,
+      "grad_norm": 0.5900936722755432,
+      "learning_rate": 1.916192694370965e-05,
+      "loss": 0.7535,
+      "mean_token_accuracy": 0.7890919044613838,
+      "num_tokens": 41468815.0,
+      "step": 540
+    },
+    {
+      "entropy": 1.0729323342442512,
+      "epoch": 0.8032128514056225,
+      "grad_norm": 0.6021299958229065,
+      "learning_rate": 1.912362030128302e-05,
+      "loss": 0.7149,
+      "mean_token_accuracy": 0.7978271067142486,
+      "num_tokens": 42243180.0,
+      "step": 550
+    },
+    {
+      "entropy": 1.0811306938529015,
+      "epoch": 0.8178167214311792,
+      "grad_norm": 0.5898880958557129,
+      "learning_rate": 1.9084497707149337e-05,
+      "loss": 0.7383,
+      "mean_token_accuracy": 0.7915405228734016,
+      "num_tokens": 43002684.0,
+      "step": 560
+    },
+    {
+      "entropy": 1.0548559844493866,
+      "epoch": 0.8324205914567361,
+      "grad_norm": 0.6246165633201599,
+      "learning_rate": 1.9044562660155158e-05,
+      "loss": 0.7287,
+      "mean_token_accuracy": 0.7933633595705032,
+      "num_tokens": 43756816.0,
+      "step": 570
+    },
+    {
+      "entropy": 1.0436548948287965,
+      "epoch": 0.8470244614822928,
+      "grad_norm": 0.5454269051551819,
+      "learning_rate": 1.900381873180704e-05,
+      "loss": 0.7019,
+      "mean_token_accuracy": 0.8000532567501069,
+      "num_tokens": 44502793.0,
+      "step": 580
+    },
+    {
+      "entropy": 1.061595305800438,
+      "epoch": 0.8616283315078496,
+      "grad_norm": 0.5577073693275452,
+      "learning_rate": 1.896226956595214e-05,
+      "loss": 0.7347,
+      "mean_token_accuracy": 0.7934001550078392,
+      "num_tokens": 45260661.0,
+      "step": 590
+    },
+    {
+      "entropy": 1.0671459570527078,
+      "epoch": 0.8762322015334063,
+      "grad_norm": 0.5666953325271606,
+      "learning_rate": 1.891991887845233e-05,
+      "loss": 0.7157,
+      "mean_token_accuracy": 0.7979460313916207,
+      "num_tokens": 46040873.0,
+      "step": 600
+    },
+    {
+      "entropy": 1.0621700644493104,
+      "epoch": 0.8908360715589632,
+      "grad_norm": 0.5529576539993286,
+      "learning_rate": 1.887677045685188e-05,
+      "loss": 0.7252,
+      "mean_token_accuracy": 0.7951329663395882,
+      "num_tokens": 46818910.0,
+      "step": 610
+    },
+    {
+      "entropy": 1.0564261555671692,
+      "epoch": 0.9054399415845199,
+      "grad_norm": 0.6081776022911072,
+      "learning_rate": 1.8832828160038717e-05,
+      "loss": 0.7224,
+      "mean_token_accuracy": 0.7945129945874214,
+      "num_tokens": 47581801.0,
+      "step": 620
+    },
+    {
+      "entropy": 1.0867107123136521,
+      "epoch": 0.9200438116100766,
+      "grad_norm": 0.5485532283782959,
+      "learning_rate": 1.8788095917899322e-05,
+      "loss": 0.7397,
+      "mean_token_accuracy": 0.7922059059143066,
+      "num_tokens": 48353228.0,
+      "step": 630
+    },
+    {
+      "entropy": 1.0725250199437142,
+      "epoch": 0.9346476816356335,
+      "grad_norm": 0.5478349924087524,
+      "learning_rate": 1.8742577730967275e-05,
+      "loss": 0.7282,
+      "mean_token_accuracy": 0.7937524914741516,
+      "num_tokens": 49128771.0,
+      "step": 640
+    },
+    {
+      "entropy": 1.065740318596363,
+      "epoch": 0.9492515516611902,
+      "grad_norm": 0.5200572609901428,
+      "learning_rate": 1.8696277670065453e-05,
+      "loss": 0.7158,
+      "mean_token_accuracy": 0.7966950073838234,
+      "num_tokens": 49898864.0,
+      "step": 650
+    },
+    {
+      "entropy": 1.0765223398804664,
+      "epoch": 0.963855421686747,
+      "grad_norm": 0.5623005032539368,
+      "learning_rate": 1.8649199875942e-05,
+      "loss": 0.7316,
+      "mean_token_accuracy": 0.79328583329916,
+      "num_tokens": 50667692.0,
+      "step": 660
+    },
+    {
+      "entropy": 1.0664428249001503,
+      "epoch": 0.9784592917123037,
+      "grad_norm": 0.5457090735435486,
+      "learning_rate": 1.860134855889997e-05,
+      "loss": 0.7192,
+      "mean_token_accuracy": 0.7951617255806923,
+      "num_tokens": 51439436.0,
+      "step": 670
+    },
+    {
+      "entropy": 1.054172757267952,
+      "epoch": 0.9930631617378606,
+      "grad_norm": 0.558342456817627,
+      "learning_rate": 1.8552727998420815e-05,
+      "loss": 0.7284,
+      "mean_token_accuracy": 0.795482975244522,
+      "num_tokens": 52216680.0,
+      "step": 680
+    },
+    {
+      "entropy": 1.0329305483744695,
+      "epoch": 1.0073019350127783,
+      "grad_norm": 0.519607424736023,
+      "learning_rate": 1.850334254278164e-05,
+      "loss": 0.6312,
+      "mean_token_accuracy": 0.8187721814864721,
+      "num_tokens": 52953330.0,
+      "step": 690
+    },
+    {
+      "entropy": 0.8502921864390374,
+      "epoch": 1.0219058050383352,
+      "grad_norm": 0.6058902740478516,
+      "learning_rate": 1.845319660866635e-05,
+      "loss": 0.5259,
+      "mean_token_accuracy": 0.8443149983882904,
+      "num_tokens": 53713129.0,
+      "step": 700
+    },
+    {
+      "entropy": 0.8117894425988197,
+      "epoch": 1.036509675063892,
+      "grad_norm": 0.5925081968307495,
+      "learning_rate": 1.8402294680770607e-05,
+      "loss": 0.5039,
+      "mean_token_accuracy": 0.8498010948300362,
+      "num_tokens": 54480149.0,
+      "step": 710
+    },
+    {
+      "entropy": 0.7792180389165878,
+      "epoch": 1.0511135450894487,
+      "grad_norm": 0.637313961982727,
+      "learning_rate": 1.8350641311400813e-05,
+      "loss": 0.4964,
+      "mean_token_accuracy": 0.851611290872097,
+      "num_tokens": 55232208.0,
+      "step": 720
+    },
+    {
+      "entropy": 0.8110285863280297,
+      "epoch": 1.0657174151150055,
+      "grad_norm": 0.6269355416297913,
+      "learning_rate": 1.8298241120066923e-05,
+      "loss": 0.5102,
+      "mean_token_accuracy": 0.8473430201411247,
+      "num_tokens": 56017632.0,
+      "step": 730
+    },
+    {
+      "entropy": 0.805873404443264,
+      "epoch": 1.0803212851405624,
+      "grad_norm": 0.6263405084609985,
+      "learning_rate": 1.8245098793069353e-05,
+      "loss": 0.4989,
+      "mean_token_accuracy": 0.8503431648015976,
+      "num_tokens": 56794094.0,
+      "step": 740
+    },
+    {
+      "entropy": 0.7984029710292816,
+      "epoch": 1.094925155166119,
+      "grad_norm": 0.6596930623054504,
+      "learning_rate": 1.819121908307985e-05,
+      "loss": 0.5029,
+      "mean_token_accuracy": 0.8493917599320412,
+      "num_tokens": 57569065.0,
+      "step": 750
+    },
+    {
+      "entropy": 0.8163655236363411,
+      "epoch": 1.1095290251916758,
+      "grad_norm": 0.6013880372047424,
+      "learning_rate": 1.813660680871645e-05,
+      "loss": 0.508,
+      "mean_token_accuracy": 0.8476730227470398,
+      "num_tokens": 58342947.0,
+      "step": 760
+    },
+    {
+      "entropy": 0.8022693067789077,
+      "epoch": 1.1241328952172325,
+      "grad_norm": 0.6172059774398804,
+      "learning_rate": 1.8081266854112536e-05,
+      "loss": 0.5066,
+      "mean_token_accuracy": 0.847652480006218,
+      "num_tokens": 59095076.0,
+      "step": 770
+    },
+    {
+      "entropy": 0.8082443997263908,
+      "epoch": 1.1387367652427893,
+      "grad_norm": 0.5798039436340332,
+      "learning_rate": 1.8025204168480036e-05,
+      "loss": 0.5084,
+      "mean_token_accuracy": 0.8473975166678429,
+      "num_tokens": 59855251.0,
+      "step": 780
+    },
+    {
+      "entropy": 0.8160747662186623,
+      "epoch": 1.1533406352683462,
+      "grad_norm": 0.6323524713516235,
+      "learning_rate": 1.7968423765666805e-05,
+      "loss": 0.5181,
+      "mean_token_accuracy": 0.8460124984383584,
+      "num_tokens": 60616729.0,
+      "step": 790
+    },
+    {
+      "entropy": 0.8220678076148034,
+      "epoch": 1.1679445052939028,
+      "grad_norm": 0.5704360008239746,
+      "learning_rate": 1.7910930723708206e-05,
+      "loss": 0.4989,
+      "mean_token_accuracy": 0.8500168919563293,
+      "num_tokens": 61391607.0,
+      "step": 800
+    },
+    {
+      "entropy": 0.8171453341841698,
+      "epoch": 1.1825483753194597,
+      "grad_norm": 0.6301026940345764,
+      "learning_rate": 1.7852730184372996e-05,
+      "loss": 0.5094,
+      "mean_token_accuracy": 0.847735871374607,
+      "num_tokens": 62176326.0,
+      "step": 810
+    },
+    {
+      "entropy": 0.8387730494141579,
+      "epoch": 1.1971522453450165,
+      "grad_norm": 0.6920040845870972,
+      "learning_rate": 1.779382735270345e-05,
+      "loss": 0.528,
+      "mean_token_accuracy": 0.8430045962333679,
+      "num_tokens": 62936900.0,
+      "step": 820
+    },
+    {
+      "entropy": 0.8173261970281601,
+      "epoch": 1.2117561153705732,
+      "grad_norm": 0.6174663305282593,
+      "learning_rate": 1.773422749654988e-05,
+      "loss": 0.505,
+      "mean_token_accuracy": 0.849335603415966,
+      "num_tokens": 63714937.0,
+      "step": 830
+    },
+    {
+      "entropy": 0.8201940849423408,
+      "epoch": 1.22635998539613,
+      "grad_norm": 0.6521441340446472,
+      "learning_rate": 1.7673935946099515e-05,
+      "loss": 0.5169,
+      "mean_token_accuracy": 0.8452720895409584,
+      "num_tokens": 64484592.0,
+      "step": 840
+    },
+    {
+      "entropy": 0.8129187062382698,
+      "epoch": 1.2409638554216866,
+      "grad_norm": 0.6480047106742859,
+      "learning_rate": 1.7612958093399793e-05,
+      "loss": 0.5101,
+      "mean_token_accuracy": 0.847150382399559,
+      "num_tokens": 65242523.0,
+      "step": 850
+    },
+    {
+      "entropy": 0.797595490515232,
+      "epoch": 1.2555677254472435,
+      "grad_norm": 0.6072301268577576,
+      "learning_rate": 1.7551299391876147e-05,
+      "loss": 0.4954,
+      "mean_token_accuracy": 0.8505616560578346,
+      "num_tokens": 66010969.0,
+      "step": 860
+    },
+    {
+      "entropy": 0.8110659316182136,
+      "epoch": 1.2701715954728003,
+      "grad_norm": 0.6669343709945679,
+      "learning_rate": 1.7488965355844293e-05,
+      "loss": 0.5132,
+      "mean_token_accuracy": 0.8465436458587646,
+      "num_tokens": 66767531.0,
+      "step": 870
+    },
+    {
+      "entropy": 0.8244558870792389,
+      "epoch": 1.284775465498357,
+      "grad_norm": 0.6384320855140686,
+      "learning_rate": 1.742596156001705e-05,
+      "loss": 0.5276,
+      "mean_token_accuracy": 0.8408863857388497,
+      "num_tokens": 67498692.0,
+      "step": 880
+    },
+    {
+      "entropy": 0.8114714965224266,
+      "epoch": 1.2993793355239138,
+      "grad_norm": 0.5711145401000977,
+      "learning_rate": 1.73622936390058e-05,
+      "loss": 0.5139,
+      "mean_token_accuracy": 0.8463606715202332,
+      "num_tokens": 68285940.0,
+      "step": 890
+    },
+    {
+      "entropy": 0.8195077747106552,
+      "epoch": 1.3139832055494707,
+      "grad_norm": 0.6267218589782715,
+      "learning_rate": 1.7297967286816553e-05,
+      "loss": 0.5154,
+      "mean_token_accuracy": 0.8449677467346192,
+      "num_tokens": 69039571.0,
+      "step": 900
+    },
+    {
+      "entropy": 0.8230465367436409,
+      "epoch": 1.3285870755750273,
+      "grad_norm": 0.6753976345062256,
+      "learning_rate": 1.723298825634072e-05,
+      "loss": 0.5357,
+      "mean_token_accuracy": 0.8401415064930916,
+      "num_tokens": 69793108.0,
+      "step": 910
+    },
+    {
+      "entropy": 0.8096449464559555,
+      "epoch": 1.3431909456005842,
+      "grad_norm": 0.6720008850097656,
+      "learning_rate": 1.716736235884062e-05,
+      "loss": 0.5098,
+      "mean_token_accuracy": 0.8470739260315895,
+      "num_tokens": 70568130.0,
+      "step": 920
+    },
+    {
+      "entropy": 0.8186160072684288,
+      "epoch": 1.357794815626141,
+      "grad_norm": 0.5903974771499634,
+      "learning_rate": 1.7101095463429748e-05,
+      "loss": 0.5198,
+      "mean_token_accuracy": 0.8454992339015007,
+      "num_tokens": 71349507.0,
+      "step": 930
+    },
+    {
+      "entropy": 0.8150006383657455,
+      "epoch": 1.3723986856516976,
+      "grad_norm": 0.5898988842964172,
+      "learning_rate": 1.7034193496547903e-05,
+      "loss": 0.5071,
+      "mean_token_accuracy": 0.8491646945476532,
+      "num_tokens": 72135471.0,
+      "step": 940
+    },
+    {
+      "entropy": 0.808189807832241,
+      "epoch": 1.3870025556772545,
+      "grad_norm": 0.5973511934280396,
+      "learning_rate": 1.6966662441431157e-05,
+      "loss": 0.5321,
+      "mean_token_accuracy": 0.8418431639671325,
+      "num_tokens": 72886499.0,
+      "step": 950
+    },
+    {
+      "entropy": 0.8094048380851746,
+      "epoch": 1.4016064257028114,
+      "grad_norm": 0.6130071878433228,
+      "learning_rate": 1.6898508337576754e-05,
+      "loss": 0.519,
+      "mean_token_accuracy": 0.8449132606387139,
+      "num_tokens": 73639876.0,
+      "step": 960
+    },
+    {
+      "entropy": 0.8094017982482911,
+      "epoch": 1.416210295728368,
+      "grad_norm": 0.6462528705596924,
+      "learning_rate": 1.6829737280203e-05,
+      "loss": 0.5158,
+      "mean_token_accuracy": 0.8453432083129883,
+      "num_tokens": 74401459.0,
+      "step": 970
+    },
+    {
+      "entropy": 0.8205527007579804,
+      "epoch": 1.4308141657539248,
+      "grad_norm": 0.6410425901412964,
+      "learning_rate": 1.676035541970411e-05,
+      "loss": 0.5366,
+      "mean_token_accuracy": 0.8401491552591324,
+      "num_tokens": 75180262.0,
+      "step": 980
+    },
+    {
+      "entropy": 0.8101997837424278,
+      "epoch": 1.4454180357794817,
+      "grad_norm": 0.6770428419113159,
+      "learning_rate": 1.669036896110021e-05,
+      "loss": 0.5278,
+      "mean_token_accuracy": 0.8413688018918037,
+      "num_tokens": 75935757.0,
+      "step": 990
+    },
+    {
+      "entropy": 0.8236281529068947,
+      "epoch": 1.4600219058050383,
+      "grad_norm": 0.5985122323036194,
+      "learning_rate": 1.6619784163482372e-05,
+      "loss": 0.5169,
+      "mean_token_accuracy": 0.8454629138112069,
+      "num_tokens": 76677214.0,
+      "step": 1000
+    },
+    {
+      "entropy": 0.8149201571941376,
+      "epoch": 1.4746257758305952,
+      "grad_norm": 0.6033229231834412,
+      "learning_rate": 1.6548607339452853e-05,
+      "loss": 0.5157,
+      "mean_token_accuracy": 0.8454660639166832,
+      "num_tokens": 77429839.0,
+      "step": 1010
+    },
+    {
+      "entropy": 0.8358469530940056,
+      "epoch": 1.4892296458561518,
+      "grad_norm": 0.6250749826431274,
+      "learning_rate": 1.6476844854560537e-05,
+      "loss": 0.5274,
+      "mean_token_accuracy": 0.8430205255746841,
+      "num_tokens": 78198665.0,
+      "step": 1020
+    },
+    {
+      "entropy": 0.8159400016069412,
+      "epoch": 1.5038335158817087,
+      "grad_norm": 0.6094375848770142,
+      "learning_rate": 1.640450312673166e-05,
+      "loss": 0.5237,
+      "mean_token_accuracy": 0.8430356681346893,
+      "num_tokens": 78963745.0,
+      "step": 1030
+    },
+    {
+      "entropy": 0.7948550447821617,
+      "epoch": 1.5184373859072653,
+      "grad_norm": 0.6190559267997742,
+      "learning_rate": 1.6331588625695823e-05,
+      "loss": 0.4984,
+      "mean_token_accuracy": 0.850845368206501,
+      "num_tokens": 79742482.0,
+      "step": 1040
+    },
+    {
+      "entropy": 0.8203089535236359,
+      "epoch": 1.5330412559328221,
+      "grad_norm": 0.627266526222229,
+      "learning_rate": 1.6258107872407376e-05,
+      "loss": 0.5222,
+      "mean_token_accuracy": 0.8443042784929276,
+      "num_tokens": 80513082.0,
+      "step": 1050
+    },
+    {
+      "entropy": 0.8034705385565758,
+      "epoch": 1.547645125958379,
+      "grad_norm": 0.5822563767433167,
+      "learning_rate": 1.6184067438462268e-05,
+      "loss": 0.5108,
+      "mean_token_accuracy": 0.8467739015817642,
+      "num_tokens": 81287042.0,
+      "step": 1060
+    },
+    {
+      "entropy": 0.7996939823031426,
+      "epoch": 1.5622489959839356,
+      "grad_norm": 0.8322076201438904,
+      "learning_rate": 1.6109473945510277e-05,
+      "loss": 0.5059,
+      "mean_token_accuracy": 0.8484288737177849,
+      "num_tokens": 82062918.0,
+      "step": 1070
+    },
+    {
+      "entropy": 0.8242404267191887,
+      "epoch": 1.5768528660094925,
+      "grad_norm": 0.6267576217651367,
+      "learning_rate": 1.6034334064662868e-05,
+      "loss": 0.5236,
+      "mean_token_accuracy": 0.8428681313991546,
+      "num_tokens": 82835236.0,
+      "step": 1080
+    },
+    {
+      "entropy": 0.8226122260093689,
+      "epoch": 1.5914567360350493,
+      "grad_norm": 0.578869104385376,
+      "learning_rate": 1.595865451589654e-05,
+      "loss": 0.522,
+      "mean_token_accuracy": 0.8434418380260468,
+      "num_tokens": 83610617.0,
+      "step": 1090
+    },
+    {
+      "entropy": 0.8142560958862305,
+      "epoch": 1.606060606060606,
+      "grad_norm": 0.652740478515625,
+      "learning_rate": 1.5882442067451856e-05,
+      "loss": 0.52,
+      "mean_token_accuracy": 0.8449100449681282,
+      "num_tokens": 84379036.0,
+      "step": 1100
+    },
+    {
+      "entropy": 0.8149377256631851,
+      "epoch": 1.6206644760861628,
+      "grad_norm": 0.5886592864990234,
+      "learning_rate": 1.5805703535228137e-05,
+      "loss": 0.5318,
+      "mean_token_accuracy": 0.8409554123878479,
+      "num_tokens": 85126712.0,
+      "step": 1110
+    },
+    {
+      "entropy": 0.8216987118124962,
+      "epoch": 1.6352683461117197,
+      "grad_norm": 0.625464916229248,
+      "learning_rate": 1.5728445782173896e-05,
+      "loss": 0.5213,
+      "mean_token_accuracy": 0.8435990065336227,
+      "num_tokens": 85888279.0,
+      "step": 1120
+    },
+    {
+      "entropy": 0.8107628434896469,
+      "epoch": 1.6498722161372763,
+      "grad_norm": 0.6127883195877075,
+      "learning_rate": 1.565067571767306e-05,
+      "loss": 0.5307,
+      "mean_token_accuracy": 0.8404728651046753,
+      "num_tokens": 86651718.0,
+      "step": 1130
+    },
+    {
+      "entropy": 0.8122030794620514,
+      "epoch": 1.6644760861628332,
+      "grad_norm": 0.6447589993476868,
+      "learning_rate": 1.557240029692705e-05,
+      "loss": 0.5288,
+      "mean_token_accuracy": 0.8413975268602372,
+      "num_tokens": 87418899.0,
+      "step": 1140
+    },
+    {
+      "entropy": 0.8063643991947174,
+      "epoch": 1.67907995618839,
+      "grad_norm": 0.6214607357978821,
+      "learning_rate": 1.5493626520332758e-05,
+      "loss": 0.5046,
+      "mean_token_accuracy": 0.8487621054053307,
+      "num_tokens": 88197755.0,
+      "step": 1150
+    },
+    {
+      "entropy": 0.811661048233509,
+      "epoch": 1.6936838262139466,
+      "grad_norm": 0.587363600730896,
+      "learning_rate": 1.5414361432856475e-05,
+      "loss": 0.5232,
+      "mean_token_accuracy": 0.8434179335832596,
+      "num_tokens": 88960882.0,
+      "step": 1160
+    },
+    {
+      "entropy": 0.824562780559063,
+      "epoch": 1.7082876962395035,
+      "grad_norm": 0.5840896964073181,
+      "learning_rate": 1.533461212340384e-05,
+      "loss": 0.5295,
+      "mean_token_accuracy": 0.8425872087478637,
+      "num_tokens": 89730659.0,
+      "step": 1170
+    },
+    {
+      "entropy": 0.8084111362695694,
+      "epoch": 1.7228915662650603,
+      "grad_norm": 0.6445785760879517,
+      "learning_rate": 1.5254385724185872e-05,
+      "loss": 0.5189,
+      "mean_token_accuracy": 0.8443948805332184,
+      "num_tokens": 90519374.0,
+      "step": 1180
+    },
+    {
+      "entropy": 0.802851003408432,
+      "epoch": 1.737495436290617,
+      "grad_norm": 0.5873962044715881,
+      "learning_rate": 1.5173689410081091e-05,
+      "loss": 0.5215,
+      "mean_token_accuracy": 0.8436427339911461,
+      "num_tokens": 91280437.0,
+      "step": 1190
+    },
+    {
+      "entropy": 0.8080698132514954,
+      "epoch": 1.7520993063161738,
+      "grad_norm": 0.594432532787323,
+      "learning_rate": 1.5092530397993877e-05,
+      "loss": 0.519,
+      "mean_token_accuracy": 0.8449363052845001,
+      "num_tokens": 92051212.0,
+      "step": 1200
+    },
+    {
+      "entropy": 0.8046671271324157,
+      "epoch": 1.7667031763417307,
+      "grad_norm": 0.6073573231697083,
+      "learning_rate": 1.5010915946209013e-05,
+      "loss": 0.5207,
+      "mean_token_accuracy": 0.8437707021832466,
+      "num_tokens": 92824656.0,
+      "step": 1210
+    },
+    {
+      "entropy": 0.799671696126461,
+      "epoch": 1.7813070463672873,
+      "grad_norm": 0.5851039886474609,
+      "learning_rate": 1.492885335374258e-05,
+      "loss": 0.5171,
+      "mean_token_accuracy": 0.845556665956974,
+      "num_tokens": 93601248.0,
+      "step": 1220
+    },
+    {
+      "entropy": 0.7995420083403587,
+      "epoch": 1.795910916392844,
+      "grad_norm": 0.6009591221809387,
+      "learning_rate": 1.4846349959689166e-05,
+      "loss": 0.5064,
+      "mean_token_accuracy": 0.8478008210659027,
+      "num_tokens": 94367653.0,
+      "step": 1230
+    },
+    {
+      "entropy": 0.8032964497804642,
+      "epoch": 1.810514786418401,
+      "grad_norm": 0.6265828013420105,
+      "learning_rate": 1.4763413142565524e-05,
+      "loss": 0.5126,
+      "mean_token_accuracy": 0.8462978065013885,
+      "num_tokens": 95134532.0,
+      "step": 1240
+    },
+    {
+      "entropy": 0.8157682687044143,
+      "epoch": 1.8251186564439577,
+      "grad_norm": 0.6303804516792297,
+      "learning_rate": 1.468005031965068e-05,
+      "loss": 0.5285,
+      "mean_token_accuracy": 0.8411597847938538,
+      "num_tokens": 95888124.0,
+      "step": 1250
+    },
+    {
+      "entropy": 0.8242416352033615,
+      "epoch": 1.8397225264695143,
+      "grad_norm": 0.6679196953773499,
+      "learning_rate": 1.4596268946322587e-05,
+      "loss": 0.5277,
+      "mean_token_accuracy": 0.8423770368099213,
+      "num_tokens": 96640919.0,
+      "step": 1260
+    },
+    {
+      "entropy": 0.8027110368013382,
+      "epoch": 1.8543263964950711,
+      "grad_norm": 0.6514571309089661,
+      "learning_rate": 1.4512076515391375e-05,
+      "loss": 0.5144,
+      "mean_token_accuracy": 0.845685575902462,
+      "num_tokens": 97402349.0,
+      "step": 1270
+    },
+    {
+      "entropy": 0.8184534996747971,
+      "epoch": 1.868930266520628,
+      "grad_norm": 0.6248390078544617,
+      "learning_rate": 1.4427480556429237e-05,
+      "loss": 0.5232,
+      "mean_token_accuracy": 0.8441895946860314,
+      "num_tokens": 98195965.0,
+      "step": 1280
+    },
+    {
+      "entropy": 0.8160968586802483,
+      "epoch": 1.8835341365461846,
+      "grad_norm": 0.6021704077720642,
+      "learning_rate": 1.4342488635097044e-05,
+      "loss": 0.5149,
+      "mean_token_accuracy": 0.8458479061722756,
+      "num_tokens": 98968374.0,
+      "step": 1290
+    },
+    {
+      "entropy": 0.8257978558540344,
+      "epoch": 1.8981380065717415,
+      "grad_norm": 0.5873125195503235,
+      "learning_rate": 1.425710835246773e-05,
+      "loss": 0.5265,
+      "mean_token_accuracy": 0.8433559283614158,
+      "num_tokens": 99729725.0,
+      "step": 1300
+    },
+    {
+      "entropy": 0.8362466841936111,
+      "epoch": 1.9127418765972983,
+      "grad_norm": 0.6376807689666748,
+      "learning_rate": 1.4171347344346494e-05,
+      "loss": 0.5343,
+      "mean_token_accuracy": 0.8405352383852005,
+      "num_tokens": 100511457.0,
+      "step": 1310
+    },
+    {
+      "entropy": 0.806542806327343,
+      "epoch": 1.927345746622855,
+      "grad_norm": 0.6307621002197266,
+      "learning_rate": 1.4085213280587916e-05,
+      "loss": 0.508,
+      "mean_token_accuracy": 0.8474292501807212,
+      "num_tokens": 101294648.0,
+      "step": 1320
+    },
+    {
+      "entropy": 0.828097864985466,
+      "epoch": 1.9419496166484118,
+      "grad_norm": 0.6401568055152893,
+      "learning_rate": 1.3998713864410029e-05,
+      "loss": 0.523,
+      "mean_token_accuracy": 0.8436363622546196,
+      "num_tokens": 102057660.0,
+      "step": 1330
+    },
+    {
+      "entropy": 0.8092041403055191,
+      "epoch": 1.9565534866739687,
+      "grad_norm": 0.5988907217979431,
+      "learning_rate": 1.3911856831705372e-05,
+      "loss": 0.5199,
+      "mean_token_accuracy": 0.8438414841890335,
+      "num_tokens": 102826372.0,
+      "step": 1340
+    },
+    {
+      "entropy": 0.8254723891615867,
+      "epoch": 1.9711573566995253,
+      "grad_norm": 0.6260574460029602,
+      "learning_rate": 1.3824649950349173e-05,
+      "loss": 0.5307,
+      "mean_token_accuracy": 0.8420073762536049,
+      "num_tokens": 103585161.0,
+      "step": 1350
+    },
+    {
+      "entropy": 0.8164636805653572,
+      "epoch": 1.9857612267250822,
+      "grad_norm": 0.6160163879394531,
+      "learning_rate": 1.373710101950464e-05,
+      "loss": 0.5213,
+      "mean_token_accuracy": 0.8451445356011391,
+      "num_tokens": 104380569.0,
+      "step": 1360
+    },
+    {
+      "entropy": 0.8088174370618967,
+      "epoch": 2.0,
+      "grad_norm": 0.6965954303741455,
+      "learning_rate": 1.3649217868925435e-05,
+      "loss": 0.5143,
+      "mean_token_accuracy": 0.8466584361516513,
+      "num_tokens": 105148277.0,
+      "step": 1370
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3425,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.54747959995138e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dcae2930bf0a52637d2a62b591647b4719b80a1a27e03c4d883bb330b7178dd5
+size 6737

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff