sirui6011 commited on 10 days ago

Commit

dc871d8

verified ·

1 Parent(s): 24d9a63

add checkpoints/codi3b_a0.5_b1.0_g0.5_ls2

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +6 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/added_tokens.json +35 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/chat_template.jinja +54 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/config.json +66 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/merges.txt +0 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/pytorch_model.bin +3 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/special_tokens_map.json +31 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/thought_projector.pt +3 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/tokenizer.json +3 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/tokenizer_config.json +295 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/trainer_state.json +2046 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/vocab.json +0 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/added_tokens.json +35 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/chat_template.jinja +54 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/config.json +66 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/merges.txt +0 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/pytorch_model.bin +3 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/special_tokens_map.json +31 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/thought_projector.pt +3 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/tokenizer.json +3 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/tokenizer_config.json +295 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/trainer_state.json +434 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/vocab.json +0 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/added_tokens.json +35 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/chat_template.jinja +54 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/config.json +66 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/merges.txt +0 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/pytorch_model.bin +3 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/special_tokens_map.json +31 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/thought_projector.pt +3 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/tokenizer.json +3 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/tokenizer_config.json +295 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/trainer_state.json +834 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/vocab.json +0 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/added_tokens.json +35 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/chat_template.jinja +54 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/config.json +66 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/merges.txt +0 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/pytorch_model.bin +3 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/special_tokens_map.json +31 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/thought_projector.pt +3 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/tokenizer.json +3 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/tokenizer_config.json +295 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/trainer_state.json +1234 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/vocab.json +0 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-800/added_tokens.json +35 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-800/chat_template.jinja +54 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-800/config.json +66 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-800/merges.txt +0 -0
checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-800/pytorch_model.bin +3 -0

.gitattributes CHANGED Viewed

@@ -62,3 +62,9 @@ checkpoints/codi3b_a0.5_b1.0_g0.5_ls1/checkpoint-400/tokenizer.json filter=lfs d
 checkpoints/codi3b_a0.5_b1.0_g0.5_ls1/checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/codi3b_a0.5_b1.0_g0.5_ls1/checkpoint-800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/codi3b_a0.5_b1.0_g0.5_ls1/wandb/offline-run-20260620_190723-v54ju1cd/run-v54ju1cd.wandb filter=lfs diff=lfs merge=lfs -text

 checkpoints/codi3b_a0.5_b1.0_g0.5_ls1/checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/codi3b_a0.5_b1.0_g0.5_ls1/checkpoint-800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/codi3b_a0.5_b1.0_g0.5_ls1/wandb/offline-run-20260620_190723-v54ju1cd/run-v54ju1cd.wandb filter=lfs diff=lfs merge=lfs -text
+checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/wandb/offline-run-20260620_205602-mgxj3qvh/run-mgxj3qvh.wandb filter=lfs diff=lfs merge=lfs -text

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/added_tokens.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|action_sep|>": 151670,
+  "<|arg_sep|>": 151671,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|call_sep|>": 151666,
+  "<|end_of_text|>": 151673,
+  "<|endoftext|>": 151643,
+  "<|exception_sep|>": 151669,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|frame_sep|>": 151672,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|latent_end|>": 151675,
+  "<|latent_start|>": 151674,
+  "<|line_sep|>": 151667,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|return_sep|>": 151668,
+  "<|trace_context_start|>": 151665,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 36,
+  "model_type": "qwen2",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.6",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151676
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7fab2179217b40bd1d9699c0150feea4a61964b67f6e47b1faf4137925dee730
+size 6187858991

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/thought_projector.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:80bed173a3b415528e742bdd7eb367f051e29649d3850f528c258ccd893ef46b
+size 16788033

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:83a790d654474f5dfe225f889afd0210313eb1083f942671f2c4b8e95a1c922b
+size 11424004

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,295 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<|trace_context_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151666": {
+      "content": "<|call_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151667": {
+      "content": "<|line_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151668": {
+      "content": "<|return_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151669": {
+      "content": "<|exception_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<|action_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<|arg_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<|frame_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<|latent_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151675": {
+      "content": "<|latent_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2046 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.28835063437139563,
+  "eval_steps": 500,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0014417531718569781,
+      "grad_norm": 64.5,
+      "kd_loss": 0.2920137941837311,
+      "learning_rate": 1.3333333333333334e-06,
+      "loss": 0.4591,
+      "step": 5,
+      "student_loss": 0.18811793625354767,
+      "teacher_loss": 0.0020189557690173388
+    },
+    {
+      "epoch": 0.0028835063437139563,
+      "grad_norm": 37.25,
+      "kd_loss": 0.27189651131629944,
+      "learning_rate": 3e-06,
+      "loss": 0.3445,
+      "step": 10,
+      "student_loss": 0.15471063554286957,
+      "teacher_loss": 0.00530141731724143
+    },
+    {
+      "epoch": 0.004325259515570935,
+      "grad_norm": 7.1875,
+      "kd_loss": 0.22213532030582428,
+      "learning_rate": 4.666666666666667e-06,
+      "loss": 0.2034,
+      "step": 15,
+      "student_loss": 0.007600904442369938,
+      "teacher_loss": 0.0011894693598151207
+    },
+    {
+      "epoch": 0.0057670126874279125,
+      "grad_norm": 7.21875,
+      "kd_loss": 0.14968645572662354,
+      "learning_rate": 6.333333333333333e-06,
+      "loss": 0.1464,
+      "step": 20,
+      "student_loss": 0.026719728484749794,
+      "teacher_loss": 0.0005942200659774244
+    },
+    {
+      "epoch": 0.00720876585928489,
+      "grad_norm": 2.703125,
+      "kd_loss": 0.11552157998085022,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 0.1171,
+      "step": 25,
+      "student_loss": 0.002865401329472661,
+      "teacher_loss": 0.0010151522001251578
+    },
+    {
+      "epoch": 0.00865051903114187,
+      "grad_norm": 1.484375,
+      "kd_loss": 0.07486728578805923,
+      "learning_rate": 9.666666666666667e-06,
+      "loss": 0.0901,
+      "step": 30,
+      "student_loss": 0.0011324587976559997,
+      "teacher_loss": 0.0005539655685424805
+    },
+    {
+      "epoch": 0.010092272202998846,
+      "grad_norm": 2.140625,
+      "kd_loss": 0.06898625195026398,
+      "learning_rate": 9.99958042442916e-06,
+      "loss": 0.0757,
+      "step": 35,
+      "student_loss": 0.0009480358567088842,
+      "teacher_loss": 0.0005732738063670695
+    },
+    {
+      "epoch": 0.011534025374855825,
+      "grad_norm": 1.296875,
+      "kd_loss": 0.0642521008849144,
+      "learning_rate": 9.997876019358083e-06,
+      "loss": 0.0685,
+      "step": 40,
+      "student_loss": 0.031368859112262726,
+      "teacher_loss": 0.027499590069055557
+    },
+    {
+      "epoch": 0.012975778546712802,
+      "grad_norm": 1.3359375,
+      "kd_loss": 0.11454806476831436,
+      "learning_rate": 9.99486100792044e-06,
+      "loss": 0.0541,
+      "step": 45,
+      "student_loss": 0.06097811087965965,
+      "teacher_loss": 0.0027094183024019003
+    },
+    {
+      "epoch": 0.01441753171856978,
+      "grad_norm": 1.5859375,
+      "kd_loss": 0.03876315429806709,
+      "learning_rate": 9.990536180750724e-06,
+      "loss": 0.0563,
+      "step": 50,
+      "student_loss": 0.02282995544373989,
+      "teacher_loss": 0.008940276689827442
+    },
+    {
+      "epoch": 0.015859284890426758,
+      "grad_norm": 1.125,
+      "kd_loss": 0.033022914081811905,
+      "learning_rate": 9.984902671959911e-06,
+      "loss": 0.0481,
+      "step": 55,
+      "student_loss": 0.0011594295501708984,
+      "teacher_loss": 0.0007985035772435367
+    },
+    {
+      "epoch": 0.01730103806228374,
+      "grad_norm": 0.8828125,
+      "kd_loss": 0.030243180692195892,
+      "learning_rate": 9.97796195883804e-06,
+      "loss": 0.0429,
+      "step": 60,
+      "student_loss": 0.0034187885466963053,
+      "teacher_loss": 0.0017242392059415579
+    },
+    {
+      "epoch": 0.018742791234140715,
+      "grad_norm": 1.4453125,
+      "kd_loss": 0.032282304018735886,
+      "learning_rate": 9.969715861466839e-06,
+      "loss": 0.0446,
+      "step": 65,
+      "student_loss": 0.029871758073568344,
+      "teacher_loss": 0.039595428854227066
+    },
+    {
+      "epoch": 0.020184544405997693,
+      "grad_norm": 1.1953125,
+      "kd_loss": 0.06090804189443588,
+      "learning_rate": 9.96016654224243e-06,
+      "loss": 0.0477,
+      "step": 70,
+      "student_loss": 0.017709577456116676,
+      "teacher_loss": 0.0022253147326409817
+    },
+    {
+      "epoch": 0.02162629757785467,
+      "grad_norm": 0.84765625,
+      "kd_loss": 0.024834414944052696,
+      "learning_rate": 9.94931650530827e-06,
+      "loss": 0.0418,
+      "step": 75,
+      "student_loss": 0.0012920524459332228,
+      "teacher_loss": 0.0008862126851454377
+    },
+    {
+      "epoch": 0.02306805074971165,
+      "grad_norm": 0.60546875,
+      "kd_loss": 0.025287121534347534,
+      "learning_rate": 9.93716859589851e-06,
+      "loss": 0.0365,
+      "step": 80,
+      "student_loss": 0.0013494148151949048,
+      "teacher_loss": 0.0008630359079688787
+    },
+    {
+      "epoch": 0.024509803921568627,
+      "grad_norm": 0.86328125,
+      "kd_loss": 0.024378223344683647,
+      "learning_rate": 9.923725999591846e-06,
+      "loss": 0.0395,
+      "step": 85,
+      "student_loss": 0.0005228935624472797,
+      "teacher_loss": 0.0004561410460155457
+    },
+    {
+      "epoch": 0.025951557093425604,
+      "grad_norm": 0.70703125,
+      "kd_loss": 0.07289917767047882,
+      "learning_rate": 9.908992241476189e-06,
+      "loss": 0.0394,
+      "step": 90,
+      "student_loss": 0.020422162488102913,
+      "teacher_loss": 0.0035499960649758577
+    },
+    {
+      "epoch": 0.027393310265282585,
+      "grad_norm": 1.125,
+      "kd_loss": 0.044955912977457047,
+      "learning_rate": 9.892971185224244e-06,
+      "loss": 0.0351,
+      "step": 95,
+      "student_loss": 0.008261171169579029,
+      "teacher_loss": 0.005078152287751436
+    },
+    {
+      "epoch": 0.02883506343713956,
+      "grad_norm": 0.83203125,
+      "kd_loss": 0.02227398194372654,
+      "learning_rate": 9.875667032080354e-06,
+      "loss": 0.0326,
+      "step": 100,
+      "student_loss": 0.0006025677430443466,
+      "teacher_loss": 0.00046476206625811756
+    },
+    {
+      "epoch": 0.03027681660899654,
+      "grad_norm": 1.2734375,
+      "kd_loss": 0.03532887250185013,
+      "learning_rate": 9.857084319758772e-06,
+      "loss": 0.036,
+      "step": 105,
+      "student_loss": 0.0034369053319096565,
+      "teacher_loss": 0.00029834595625288785
+    },
+    {
+      "epoch": 0.031718569780853516,
+      "grad_norm": 1.1640625,
+      "kd_loss": 0.04033924266695976,
+      "learning_rate": 9.837227921253747e-06,
+      "loss": 0.0375,
+      "step": 110,
+      "student_loss": 0.03080393560230732,
+      "teacher_loss": 0.018437745049595833
+    },
+    {
+      "epoch": 0.03316032295271049,
+      "grad_norm": 0.703125,
+      "kd_loss": 0.04197424277663231,
+      "learning_rate": 9.816103043561648e-06,
+      "loss": 0.0347,
+      "step": 115,
+      "student_loss": 0.0021668823901563883,
+      "teacher_loss": 0.00045062918798066676
+    },
+    {
+      "epoch": 0.03460207612456748,
+      "grad_norm": 0.98828125,
+      "kd_loss": 0.027563175186514854,
+      "learning_rate": 9.79371522631553e-06,
+      "loss": 0.032,
+      "step": 120,
+      "student_loss": 0.0016319500282406807,
+      "teacher_loss": 0.0008567498298361897
+    },
+    {
+      "epoch": 0.036043829296424454,
+      "grad_norm": 0.92578125,
+      "kd_loss": 0.06173818185925484,
+      "learning_rate": 9.770070340332457e-06,
+      "loss": 0.0364,
+      "step": 125,
+      "student_loss": 0.02385914884507656,
+      "teacher_loss": 0.00027849775506183505
+    },
+    {
+      "epoch": 0.03748558246828143,
+      "grad_norm": 1.0859375,
+      "kd_loss": 0.029248492792248726,
+      "learning_rate": 9.745174586073982e-06,
+      "loss": 0.0346,
+      "step": 130,
+      "student_loss": 0.0005455865757539868,
+      "teacher_loss": 0.0004959598300047219
+    },
+    {
+      "epoch": 0.03892733564013841,
+      "grad_norm": 0.953125,
+      "kd_loss": 0.0406946986913681,
+      "learning_rate": 9.719034492020183e-06,
+      "loss": 0.0377,
+      "step": 135,
+      "student_loss": 0.0013323032762855291,
+      "teacher_loss": 0.0005948346224613488
+    },
+    {
+      "epoch": 0.040369088811995385,
+      "grad_norm": 1.0390625,
+      "kd_loss": 0.023682042956352234,
+      "learning_rate": 9.691656912957686e-06,
+      "loss": 0.036,
+      "step": 140,
+      "student_loss": 0.0005881476681679487,
+      "teacher_loss": 0.0003679130459204316
+    },
+    {
+      "epoch": 0.04181084198385236,
+      "grad_norm": 0.5859375,
+      "kd_loss": 0.07271980494260788,
+      "learning_rate": 9.663049028182112e-06,
+      "loss": 0.0325,
+      "step": 145,
+      "student_loss": 0.028793470934033394,
+      "teacher_loss": 0.005983584560453892
+    },
+    {
+      "epoch": 0.04325259515570934,
+      "grad_norm": 0.609375,
+      "kd_loss": 0.01843745820224285,
+      "learning_rate": 9.633218339615433e-06,
+      "loss": 0.0316,
+      "step": 150,
+      "student_loss": 0.001051027444191277,
+      "teacher_loss": 0.000913174357265234
+    },
+    {
+      "epoch": 0.04469434832756632,
+      "grad_norm": 1.1171875,
+      "kd_loss": 0.023381218314170837,
+      "learning_rate": 9.602172669838721e-06,
+      "loss": 0.0381,
+      "step": 155,
+      "student_loss": 0.002775231609120965,
+      "teacher_loss": 0.0007182428380474448
+    },
+    {
+      "epoch": 0.0461361014994233,
+      "grad_norm": 1.6875,
+      "kd_loss": 0.03144950047135353,
+      "learning_rate": 9.569920160040815e-06,
+      "loss": 0.0344,
+      "step": 160,
+      "student_loss": 0.0778423622250557,
+      "teacher_loss": 0.044956743717193604
+    },
+    {
+      "epoch": 0.04757785467128028,
+      "grad_norm": 0.75,
+      "kd_loss": 0.03335012122988701,
+      "learning_rate": 9.536469267883432e-06,
+      "loss": 0.0311,
+      "step": 165,
+      "student_loss": 0.0012603362556546926,
+      "teacher_loss": 0.00417186226695776
+    },
+    {
+      "epoch": 0.049019607843137254,
+      "grad_norm": 1.8828125,
+      "kd_loss": 0.018793689087033272,
+      "learning_rate": 9.501828765283295e-06,
+      "loss": 0.0355,
+      "step": 170,
+      "student_loss": 0.0005546013708226383,
+      "teacher_loss": 0.0003494401171337813
+    },
+    {
+      "epoch": 0.05046136101499423,
+      "grad_norm": 1.09375,
+      "kd_loss": 0.027096513658761978,
+      "learning_rate": 9.466007736111846e-06,
+      "loss": 0.0322,
+      "step": 175,
+      "student_loss": 0.0030481775756925344,
+      "teacher_loss": 0.0003126203373540193
+    },
+    {
+      "epoch": 0.05190311418685121,
+      "grad_norm": 0.69140625,
+      "kd_loss": 0.03634097799658775,
+      "learning_rate": 9.429015573813163e-06,
+      "loss": 0.0302,
+      "step": 180,
+      "student_loss": 0.0012228424893692136,
+      "teacher_loss": 0.00034309024340473115
+    },
+    {
+      "epoch": 0.05334486735870819,
+      "grad_norm": 1.34375,
+      "kd_loss": 0.03704367205500603,
+      "learning_rate": 9.390861978940687e-06,
+      "loss": 0.0363,
+      "step": 185,
+      "student_loss": 0.035696618258953094,
+      "teacher_loss": 0.00034108557156287134
+    },
+    {
+      "epoch": 0.05478662053056517,
+      "grad_norm": 1.3046875,
+      "kd_loss": 0.052902791649103165,
+      "learning_rate": 9.351556956613423e-06,
+      "loss": 0.0321,
+      "step": 190,
+      "student_loss": 0.009327664040029049,
+      "teacher_loss": 0.0015260990476235747
+    },
+    {
+      "epoch": 0.056228373702422146,
+      "grad_norm": 2.828125,
+      "kd_loss": 0.05315268039703369,
+      "learning_rate": 9.31111081389227e-06,
+      "loss": 0.0327,
+      "step": 195,
+      "student_loss": 0.011893535032868385,
+      "teacher_loss": 0.0007797812577337027
+    },
+    {
+      "epoch": 0.05767012687427912,
+      "grad_norm": 0.8125,
+      "kd_loss": 0.03527738153934479,
+      "learning_rate": 9.269534157077177e-06,
+      "loss": 0.0394,
+      "step": 200,
+      "student_loss": 0.0005083663854748011,
+      "teacher_loss": 0.0002656931465025991
+    },
+    {
+      "epoch": 0.0591118800461361,
+      "grad_norm": 0.9921875,
+      "kd_loss": 0.02736036665737629,
+      "learning_rate": 9.226837888925813e-06,
+      "loss": 0.0341,
+      "step": 205,
+      "student_loss": 0.03568984195590019,
+      "teacher_loss": 0.027640890330076218
+    },
+    {
+      "epoch": 0.06055363321799308,
+      "grad_norm": 1.78125,
+      "kd_loss": 0.030339844524860382,
+      "learning_rate": 9.183033205794525e-06,
+      "loss": 0.0302,
+      "step": 210,
+      "student_loss": 0.000763049116358161,
+      "teacher_loss": 0.0003548153617884964
+    },
+    {
+      "epoch": 0.061995386389850055,
+      "grad_norm": 1.1484375,
+      "kd_loss": 0.03865697979927063,
+      "learning_rate": 9.13813159470227e-06,
+      "loss": 0.0326,
+      "step": 215,
+      "student_loss": 0.0005148217896930873,
+      "teacher_loss": 0.00020245747873559594
+    },
+    {
+      "epoch": 0.06343713956170703,
+      "grad_norm": 1.3671875,
+      "kd_loss": 0.04260854423046112,
+      "learning_rate": 9.092144830318357e-06,
+      "loss": 0.0316,
+      "step": 220,
+      "student_loss": 0.030991079285740852,
+      "teacher_loss": 0.00993641559034586
+    },
+    {
+      "epoch": 0.06487889273356401,
+      "grad_norm": 1.203125,
+      "kd_loss": 0.023958567529916763,
+      "learning_rate": 9.045084971874738e-06,
+      "loss": 0.0376,
+      "step": 225,
+      "student_loss": 0.043197184801101685,
+      "teacher_loss": 0.00216495874337852
+    },
+    {
+      "epoch": 0.06632064590542099,
+      "grad_norm": 1.7578125,
+      "kd_loss": 0.04370651766657829,
+      "learning_rate": 8.99696436000368e-06,
+      "loss": 0.0299,
+      "step": 230,
+      "student_loss": 0.005392159800976515,
+      "teacher_loss": 0.003661371534690261
+    },
+    {
+      "epoch": 0.06776239907727798,
+      "grad_norm": 0.8671875,
+      "kd_loss": 0.07619086652994156,
+      "learning_rate": 8.947795613501658e-06,
+      "loss": 0.0314,
+      "step": 235,
+      "student_loss": 0.008586333133280277,
+      "teacher_loss": 0.0005631999811157584
+    },
+    {
+      "epoch": 0.06920415224913495,
+      "grad_norm": 1.0625,
+      "kd_loss": 0.03111647628247738,
+      "learning_rate": 8.897591626020284e-06,
+      "loss": 0.034,
+      "step": 240,
+      "student_loss": 0.003906027879565954,
+      "teacher_loss": 0.0004787310608662665
+    },
+    {
+      "epoch": 0.07064590542099193,
+      "grad_norm": 1.453125,
+      "kd_loss": 0.017389042302966118,
+      "learning_rate": 8.846365562685178e-06,
+      "loss": 0.0279,
+      "step": 245,
+      "student_loss": 0.013969292864203453,
+      "teacher_loss": 0.006873726844787598
+    },
+    {
+      "epoch": 0.07208765859284891,
+      "grad_norm": 1.6015625,
+      "kd_loss": 0.0229345690459013,
+      "learning_rate": 8.794130856643635e-06,
+      "loss": 0.0311,
+      "step": 250,
+      "student_loss": 0.0008334179292432964,
+      "teacher_loss": 0.0003169570700265467
+    },
+    {
+      "epoch": 0.07352941176470588,
+      "grad_norm": 0.83203125,
+      "kd_loss": 0.01808979921042919,
+      "learning_rate": 8.74090120554202e-06,
+      "loss": 0.0312,
+      "step": 255,
+      "student_loss": 0.0003083710907958448,
+      "teacher_loss": 0.00030115401023067534
+    },
+    {
+      "epoch": 0.07497116493656286,
+      "grad_norm": 0.88671875,
+      "kd_loss": 0.026108454912900925,
+      "learning_rate": 8.686690567933803e-06,
+      "loss": 0.0333,
+      "step": 260,
+      "student_loss": 0.042571116238832474,
+      "teacher_loss": 0.03388316184282303
+    },
+    {
+      "epoch": 0.07641291810841984,
+      "grad_norm": 0.80078125,
+      "kd_loss": 0.016656002029776573,
+      "learning_rate": 8.63151315961915e-06,
+      "loss": 0.0317,
+      "step": 265,
+      "student_loss": 0.0003702428948599845,
+      "teacher_loss": 0.0002555136161390692
+    },
+    {
+      "epoch": 0.07785467128027682,
+      "grad_norm": 1.046875,
+      "kd_loss": 0.019304102286696434,
+      "learning_rate": 8.575383449917103e-06,
+      "loss": 0.0342,
+      "step": 270,
+      "student_loss": 0.001813149661757052,
+      "teacher_loss": 0.0011580288410186768
+    },
+    {
+      "epoch": 0.07929642445213379,
+      "grad_norm": 1.1796875,
+      "kd_loss": 0.023480774834752083,
+      "learning_rate": 8.518316157871232e-06,
+      "loss": 0.029,
+      "step": 275,
+      "student_loss": 0.04764978215098381,
+      "teacher_loss": 0.03439468517899513
+    },
+    {
+      "epoch": 0.08073817762399077,
+      "grad_norm": 1.015625,
+      "kd_loss": 0.03275206685066223,
+      "learning_rate": 8.460326248389825e-06,
+      "loss": 0.0289,
+      "step": 280,
+      "student_loss": 0.0005029537715017796,
+      "teacher_loss": 0.00019533037266228348
+    },
+    {
+      "epoch": 0.08217993079584775,
+      "grad_norm": 1.140625,
+      "kd_loss": 0.019457675516605377,
+      "learning_rate": 8.401428928321607e-06,
+      "loss": 0.0322,
+      "step": 285,
+      "student_loss": 0.0007758038118481636,
+      "teacher_loss": 0.000900130660738796
+    },
+    {
+      "epoch": 0.08362168396770472,
+      "grad_norm": 1.3125,
+      "kd_loss": 0.017802242189645767,
+      "learning_rate": 8.341639642468002e-06,
+      "loss": 0.0348,
+      "step": 290,
+      "student_loss": 0.023405462503433228,
+      "teacher_loss": 0.021540865302085876
+    },
+    {
+      "epoch": 0.0850634371395617,
+      "grad_norm": 1.03125,
+      "kd_loss": 0.01647804118692875,
+      "learning_rate": 8.280974069532999e-06,
+      "loss": 0.0328,
+      "step": 295,
+      "student_loss": 0.0006198033224791288,
+      "teacher_loss": 0.000554366793949157
+    },
+    {
+      "epoch": 0.08650519031141868,
+      "grad_norm": 0.9296875,
+      "kd_loss": 0.04784730449318886,
+      "learning_rate": 8.219448118011687e-06,
+      "loss": 0.0308,
+      "step": 300,
+      "student_loss": 0.015613794326782227,
+      "teacher_loss": 0.001725711626932025
+    },
+    {
+      "epoch": 0.08794694348327567,
+      "grad_norm": 1.1953125,
+      "kd_loss": 0.019965235143899918,
+      "learning_rate": 8.157077922018537e-06,
+      "loss": 0.0289,
+      "step": 305,
+      "student_loss": 0.006098510231822729,
+      "teacher_loss": 0.002777156652882695
+    },
+    {
+      "epoch": 0.08938869665513265,
+      "grad_norm": 1.0390625,
+      "kd_loss": 0.020748196169734,
+      "learning_rate": 8.093879837056486e-06,
+      "loss": 0.0309,
+      "step": 310,
+      "student_loss": 0.000493990199174732,
+      "teacher_loss": 0.00039537265547551215
+    },
+    {
+      "epoch": 0.09083044982698962,
+      "grad_norm": 1.4765625,
+      "kd_loss": 0.03892743960022926,
+      "learning_rate": 8.029870435728018e-06,
+      "loss": 0.0294,
+      "step": 315,
+      "student_loss": 0.0073717073537409306,
+      "teacher_loss": 0.00021180440671741962
+    },
+    {
+      "epoch": 0.0922722029988466,
+      "grad_norm": 1.3828125,
+      "kd_loss": 0.02363528683781624,
+      "learning_rate": 7.965066503389264e-06,
+      "loss": 0.0313,
+      "step": 320,
+      "student_loss": 0.0004119524674024433,
+      "teacher_loss": 0.00022927633835934103
+    },
+    {
+      "epoch": 0.09371395617070358,
+      "grad_norm": 1.7421875,
+      "kd_loss": 0.06230268254876137,
+      "learning_rate": 7.89948503374835e-06,
+      "loss": 0.0284,
+      "step": 325,
+      "student_loss": 0.009024329483509064,
+      "teacher_loss": 0.01963256485760212
+    },
+    {
+      "epoch": 0.09515570934256055,
+      "grad_norm": 0.90625,
+      "kd_loss": 0.04712303727865219,
+      "learning_rate": 7.833143224409076e-06,
+      "loss": 0.0302,
+      "step": 330,
+      "student_loss": 0.008720812387764454,
+      "teacher_loss": 0.0013501221546903253
+    },
+    {
+      "epoch": 0.09659746251441753,
+      "grad_norm": 0.9140625,
+      "kd_loss": 0.018766457214951515,
+      "learning_rate": 7.766058472361154e-06,
+      "loss": 0.0283,
+      "step": 335,
+      "student_loss": 0.001242243917658925,
+      "teacher_loss": 0.0023425817489624023
+    },
+    {
+      "epoch": 0.09803921568627451,
+      "grad_norm": 2.078125,
+      "kd_loss": 0.02911657840013504,
+      "learning_rate": 7.698248369418146e-06,
+      "loss": 0.0326,
+      "step": 340,
+      "student_loss": 0.009861858561635017,
+      "teacher_loss": 0.006796200294047594
+    },
+    {
+      "epoch": 0.09948096885813149,
+      "grad_norm": 1.2265625,
+      "kd_loss": 0.032899159938097,
+      "learning_rate": 7.629730697604314e-06,
+      "loss": 0.0335,
+      "step": 345,
+      "student_loss": 0.00739182299003005,
+      "teacher_loss": 0.0002512071805540472
+    },
+    {
+      "epoch": 0.10092272202998846,
+      "grad_norm": 1.0859375,
+      "kd_loss": 0.06120900437235832,
+      "learning_rate": 7.560523424491595e-06,
+      "loss": 0.0282,
+      "step": 350,
+      "student_loss": 0.003599822986871004,
+      "teacher_loss": 0.0007063632365316153
+    },
+    {
+      "epoch": 0.10236447520184544,
+      "grad_norm": 2.296875,
+      "kd_loss": 0.02087453007698059,
+      "learning_rate": 7.490644698487909e-06,
+      "loss": 0.035,
+      "step": 355,
+      "student_loss": 0.001389339566230774,
+      "teacher_loss": 0.0017551770433783531
+    },
+    {
+      "epoch": 0.10380622837370242,
+      "grad_norm": 0.90234375,
+      "kd_loss": 0.019557664170861244,
+      "learning_rate": 7.420112844078066e-06,
+      "loss": 0.0321,
+      "step": 360,
+      "student_loss": 0.0006360138650052249,
+      "teacher_loss": 0.0005147532792761922
+    },
+    {
+      "epoch": 0.1052479815455594,
+      "grad_norm": 1.3203125,
+      "kd_loss": 0.030097220093011856,
+      "learning_rate": 7.348946357018479e-06,
+      "loss": 0.0333,
+      "step": 365,
+      "student_loss": 0.04143820330500603,
+      "teacher_loss": 0.027219083160161972
+    },
+    {
+      "epoch": 0.10668973471741638,
+      "grad_norm": 0.9453125,
+      "kd_loss": 0.05344080179929733,
+      "learning_rate": 7.277163899486975e-06,
+      "loss": 0.0351,
+      "step": 370,
+      "student_loss": 0.021593965590000153,
+      "teacher_loss": 0.0005673202103935182
+    },
+    {
+      "epoch": 0.10813148788927336,
+      "grad_norm": 0.7265625,
+      "kd_loss": 0.03127056360244751,
+      "learning_rate": 7.204784295188959e-06,
+      "loss": 0.0287,
+      "step": 375,
+      "student_loss": 0.03163963928818703,
+      "teacher_loss": 0.020587248727679253
+    },
+    {
+      "epoch": 0.10957324106113034,
+      "grad_norm": 0.84375,
+      "kd_loss": 0.01984175480902195,
+      "learning_rate": 7.1318265244212305e-06,
+      "loss": 0.0311,
+      "step": 380,
+      "student_loss": 0.0005506337620317936,
+      "teacher_loss": 0.00038444914389401674
+    },
+    {
+      "epoch": 0.11101499423298732,
+      "grad_norm": 0.59765625,
+      "kd_loss": 0.02916550263762474,
+      "learning_rate": 7.05830971909472e-06,
+      "loss": 0.0294,
+      "step": 385,
+      "student_loss": 0.00045756620238535106,
+      "teacher_loss": 0.0003828653716482222
+    },
+    {
+      "epoch": 0.11245674740484429,
+      "grad_norm": 1.171875,
+      "kd_loss": 0.021802764385938644,
+      "learning_rate": 6.9842531577174865e-06,
+      "loss": 0.0271,
+      "step": 390,
+      "student_loss": 0.00026892355526797473,
+      "teacher_loss": 0.0002502185816410929
+    },
+    {
+      "epoch": 0.11389850057670127,
+      "grad_norm": 0.94140625,
+      "kd_loss": 0.020051907747983932,
+      "learning_rate": 6.9096762603392595e-06,
+      "loss": 0.0307,
+      "step": 395,
+      "student_loss": 0.0004568768781609833,
+      "teacher_loss": 0.00039133013342507184
+    },
+    {
+      "epoch": 0.11534025374855825,
+      "grad_norm": 0.78125,
+      "kd_loss": 0.03054620325565338,
+      "learning_rate": 6.834598583458862e-06,
+      "loss": 0.0275,
+      "step": 400,
+      "student_loss": 0.0005328103434294462,
+      "teacher_loss": 0.00015425821766257286
+    },
+    {
+      "epoch": 0.11678200692041522,
+      "grad_norm": 0.640625,
+      "kd_loss": 0.02604236640036106,
+      "learning_rate": 6.7590398148958625e-06,
+      "loss": 0.0335,
+      "step": 405,
+      "student_loss": 0.011330639012157917,
+      "teacher_loss": 0.00021900788124185055
+    },
+    {
+      "epoch": 0.1182237600922722,
+      "grad_norm": 0.8046875,
+      "kd_loss": 0.03826223686337471,
+      "learning_rate": 6.6830197686277945e-06,
+      "loss": 0.0366,
+      "step": 410,
+      "student_loss": 0.014453927055001259,
+      "teacher_loss": 0.00844341516494751
+    },
+    {
+      "epoch": 0.11966551326412918,
+      "grad_norm": 1.203125,
+      "kd_loss": 0.021850954741239548,
+      "learning_rate": 6.6065583795942625e-06,
+      "loss": 0.0347,
+      "step": 415,
+      "student_loss": 0.004759110510349274,
+      "teacher_loss": 0.0034685542341321707
+    },
+    {
+      "epoch": 0.12110726643598616,
+      "grad_norm": 0.93359375,
+      "kd_loss": 0.045137468725442886,
+      "learning_rate": 6.52967569846937e-06,
+      "loss": 0.0331,
+      "step": 420,
+      "student_loss": 0.04586087912321091,
+      "teacher_loss": 0.021448055282235146
+    },
+    {
+      "epoch": 0.12254901960784313,
+      "grad_norm": 1.4453125,
+      "kd_loss": 0.014929288066923618,
+      "learning_rate": 6.452391886403767e-06,
+      "loss": 0.0299,
+      "step": 425,
+      "student_loss": 0.002178685739636421,
+      "teacher_loss": 0.0021052202209830284
+    },
+    {
+      "epoch": 0.12399077277970011,
+      "grad_norm": 1.3671875,
+      "kd_loss": 0.0547032505273819,
+      "learning_rate": 6.374727209737743e-06,
+      "loss": 0.0368,
+      "step": 430,
+      "student_loss": 0.05219801887869835,
+      "teacher_loss": 0.060560259968042374
+    },
+    {
+      "epoch": 0.1254325259515571,
+      "grad_norm": 1.453125,
+      "kd_loss": 0.01706705428659916,
+      "learning_rate": 6.296702034686726e-06,
+      "loss": 0.0301,
+      "step": 435,
+      "student_loss": 0.0008379022474400699,
+      "teacher_loss": 0.0006224109092727304
+    },
+    {
+      "epoch": 0.12687427912341406,
+      "grad_norm": 1.0,
+      "kd_loss": 0.03890637308359146,
+      "learning_rate": 6.218336822000598e-06,
+      "loss": 0.0314,
+      "step": 440,
+      "student_loss": 0.012142423540353775,
+      "teacher_loss": 0.007434291299432516
+    },
+    {
+      "epoch": 0.12831603229527105,
+      "grad_norm": 0.890625,
+      "kd_loss": 0.03179040178656578,
+      "learning_rate": 6.139652121598219e-06,
+      "loss": 0.0313,
+      "step": 445,
+      "student_loss": 0.03341586887836456,
+      "teacher_loss": 0.018860887736082077
+    },
+    {
+      "epoch": 0.12975778546712802,
+      "grad_norm": 0.69140625,
+      "kd_loss": 0.03629063814878464,
+      "learning_rate": 6.060668567178561e-06,
+      "loss": 0.0329,
+      "step": 450,
+      "student_loss": 0.0010012147249653935,
+      "teacher_loss": 0.0014839700888842344
+    },
+    {
+      "epoch": 0.131199538638985,
+      "grad_norm": 0.8828125,
+      "kd_loss": 0.02263510413467884,
+      "learning_rate": 5.981406870809889e-06,
+      "loss": 0.0326,
+      "step": 455,
+      "student_loss": 0.0007265008171088994,
+      "teacher_loss": 0.0003751025360543281
+    },
+    {
+      "epoch": 0.13264129181084197,
+      "grad_norm": 1.2109375,
+      "kd_loss": 0.0492476262152195,
+      "learning_rate": 5.9018878174983674e-06,
+      "loss": 0.0295,
+      "step": 460,
+      "student_loss": 0.005101657006889582,
+      "teacher_loss": 0.0002927044697571546
+    },
+    {
+      "epoch": 0.13408304498269896,
+      "grad_norm": 1.2578125,
+      "kd_loss": 0.031007954850792885,
+      "learning_rate": 5.822132259737565e-06,
+      "loss": 0.034,
+      "step": 465,
+      "student_loss": 0.00047535731573589146,
+      "teacher_loss": 0.00024468303308822215
+    },
+    {
+      "epoch": 0.13552479815455595,
+      "grad_norm": 1.7265625,
+      "kd_loss": 0.02650834433734417,
+      "learning_rate": 5.742161112040237e-06,
+      "loss": 0.0313,
+      "step": 470,
+      "student_loss": 0.000378905504476279,
+      "teacher_loss": 0.0002708406245801598
+    },
+    {
+      "epoch": 0.13696655132641292,
+      "grad_norm": 1.171875,
+      "kd_loss": 0.03383399918675423,
+      "learning_rate": 5.661995345453867e-06,
+      "loss": 0.0289,
+      "step": 475,
+      "student_loss": 0.0004789994563907385,
+      "teacher_loss": 0.0002899342798627913
+    },
+    {
+      "epoch": 0.1384083044982699,
+      "grad_norm": 1.1953125,
+      "kd_loss": 0.03584924340248108,
+      "learning_rate": 5.581655982061367e-06,
+      "loss": 0.0422,
+      "step": 480,
+      "student_loss": 0.03862505778670311,
+      "teacher_loss": 0.036061737686395645
+    },
+    {
+      "epoch": 0.13985005767012687,
+      "grad_norm": 1.203125,
+      "kd_loss": 0.020711667835712433,
+      "learning_rate": 5.501164089468406e-06,
+      "loss": 0.0313,
+      "step": 485,
+      "student_loss": 0.0004981390666216612,
+      "teacher_loss": 0.0005692155100405216
+    },
+    {
+      "epoch": 0.14129181084198386,
+      "grad_norm": 1.3984375,
+      "kd_loss": 0.027012551203370094,
+      "learning_rate": 5.4205407752787884e-06,
+      "loss": 0.0367,
+      "step": 490,
+      "student_loss": 0.0008920110412873328,
+      "teacher_loss": 0.0007303191814571619
+    },
+    {
+      "epoch": 0.14273356401384082,
+      "grad_norm": 1.3671875,
+      "kd_loss": 0.040799129754304886,
+      "learning_rate": 5.339807181559359e-06,
+      "loss": 0.0399,
+      "step": 495,
+      "student_loss": 0.0009312813053838909,
+      "teacher_loss": 0.0005571586079895496
+    },
+    {
+      "epoch": 0.14417531718569782,
+      "grad_norm": 1.6875,
+      "kd_loss": 0.03058801032602787,
+      "learning_rate": 5.258984479295853e-06,
+      "loss": 0.0316,
+      "step": 500,
+      "student_loss": 0.00041255459655076265,
+      "teacher_loss": 0.00031951998244039714
+    },
+    {
+      "epoch": 0.14561707035755478,
+      "grad_norm": 1.734375,
+      "kd_loss": 0.02832438424229622,
+      "learning_rate": 5.1780938628411795e-06,
+      "loss": 0.0402,
+      "step": 505,
+      "student_loss": 0.0005547971813939512,
+      "teacher_loss": 0.0004580595705192536
+    },
+    {
+      "epoch": 0.14705882352941177,
+      "grad_norm": 2.28125,
+      "kd_loss": 0.019691068679094315,
+      "learning_rate": 5.097156544357567e-06,
+      "loss": 0.032,
+      "step": 510,
+      "student_loss": 0.00035549805033952,
+      "teacher_loss": 0.00029006152180954814
+    },
+    {
+      "epoch": 0.14850057670126873,
+      "grad_norm": 1.4609375,
+      "kd_loss": 0.028027402237057686,
+      "learning_rate": 5.016193748254045e-06,
+      "loss": 0.0299,
+      "step": 515,
+      "student_loss": 0.016550345346331596,
+      "teacher_loss": 0.00025012606056407094
+    },
+    {
+      "epoch": 0.14994232987312572,
+      "grad_norm": 1.7734375,
+      "kd_loss": 0.06320768594741821,
+      "learning_rate": 4.935226705620699e-06,
+      "loss": 0.0344,
+      "step": 520,
+      "student_loss": 0.04100845754146576,
+      "teacher_loss": 0.011567573063075542
+    },
+    {
+      "epoch": 0.1513840830449827,
+      "grad_norm": 0.859375,
+      "kd_loss": 0.032060496509075165,
+      "learning_rate": 4.8542766486612035e-06,
+      "loss": 0.033,
+      "step": 525,
+      "student_loss": 0.0004020243068225682,
+      "teacher_loss": 0.00020839735225308686
+    },
+    {
+      "epoch": 0.15282583621683968,
+      "grad_norm": 1.203125,
+      "kd_loss": 0.024977991357445717,
+      "learning_rate": 4.773364805125025e-06,
+      "loss": 0.0309,
+      "step": 530,
+      "student_loss": 0.0005168431089259684,
+      "teacher_loss": 0.00039579602889716625
+    },
+    {
+      "epoch": 0.15426758938869667,
+      "grad_norm": 2.046875,
+      "kd_loss": 0.03925255313515663,
+      "learning_rate": 4.6925123927408265e-06,
+      "loss": 0.0308,
+      "step": 535,
+      "student_loss": 0.0007517871563322842,
+      "teacher_loss": 0.00039392782491631806
+    },
+    {
+      "epoch": 0.15570934256055363,
+      "grad_norm": 1.8203125,
+      "kd_loss": 0.05933701992034912,
+      "learning_rate": 4.611740613652485e-06,
+      "loss": 0.0286,
+      "step": 540,
+      "student_loss": 0.0131508968770504,
+      "teacher_loss": 0.0005503469728864729
+    },
+    {
+      "epoch": 0.15715109573241062,
+      "grad_norm": 1.0390625,
+      "kd_loss": 0.023665662854909897,
+      "learning_rate": 4.531070648859186e-06,
+      "loss": 0.0282,
+      "step": 545,
+      "student_loss": 0.0027271448634564877,
+      "teacher_loss": 0.0027394567150622606
+    },
+    {
+      "epoch": 0.15859284890426759,
+      "grad_norm": 2.046875,
+      "kd_loss": 0.05410351976752281,
+      "learning_rate": 4.450523652661086e-06,
+      "loss": 0.0276,
+      "step": 550,
+      "student_loss": 0.00023584096925333142,
+      "teacher_loss": 0.0036266453098505735
+    },
+    {
+      "epoch": 0.16003460207612458,
+      "grad_norm": 1.1796875,
+      "kd_loss": 0.08308840543031693,
+      "learning_rate": 4.370120747111956e-06,
+      "loss": 0.0327,
+      "step": 555,
+      "student_loss": 0.046959906816482544,
+      "teacher_loss": 0.0017155115492641926
+    },
+    {
+      "epoch": 0.16147635524798154,
+      "grad_norm": 2.4375,
+      "kd_loss": 0.0380968302488327,
+      "learning_rate": 4.289883016480291e-06,
+      "loss": 0.0349,
+      "step": 560,
+      "student_loss": 0.001130731194280088,
+      "teacher_loss": 0.00036855213693343103
+    },
+    {
+      "epoch": 0.16291810841983853,
+      "grad_norm": 1.1328125,
+      "kd_loss": 0.03884750232100487,
+      "learning_rate": 4.209831501720328e-06,
+      "loss": 0.0325,
+      "step": 565,
+      "student_loss": 0.02741910330951214,
+      "teacher_loss": 0.019537584856152534
+    },
+    {
+      "epoch": 0.1643598615916955,
+      "grad_norm": 5.1875,
+      "kd_loss": 0.0330473892390728,
+      "learning_rate": 4.129987194954421e-06,
+      "loss": 0.0356,
+      "step": 570,
+      "student_loss": 0.010371256619691849,
+      "teacher_loss": 0.0070776245556771755
+    },
+    {
+      "epoch": 0.16580161476355249,
+      "grad_norm": 1.1015625,
+      "kd_loss": 0.028591087087988853,
+      "learning_rate": 4.050371033968216e-06,
+      "loss": 0.0328,
+      "step": 575,
+      "student_loss": 0.000467249978100881,
+      "teacher_loss": 0.0002494181098882109
+    },
+    {
+      "epoch": 0.16724336793540945,
+      "grad_norm": 0.80078125,
+      "kd_loss": 0.02640584297478199,
+      "learning_rate": 3.9710038967200825e-06,
+      "loss": 0.0284,
+      "step": 580,
+      "student_loss": 0.0017712963744997978,
+      "teacher_loss": 0.0019773358944803476
+    },
+    {
+      "epoch": 0.16868512110726644,
+      "grad_norm": 0.85546875,
+      "kd_loss": 0.030399348586797714,
+      "learning_rate": 3.89190659586623e-06,
+      "loss": 0.0281,
+      "step": 585,
+      "student_loss": 0.0004607080190908164,
+      "teacher_loss": 0.0002647584769874811
+    },
+    {
+      "epoch": 0.1701268742791234,
+      "grad_norm": 1.1328125,
+      "kd_loss": 0.03226058557629585,
+      "learning_rate": 3.8130998733029517e-06,
+      "loss": 0.0302,
+      "step": 590,
+      "student_loss": 0.0005413145408965647,
+      "teacher_loss": 0.0011012081522494555
+    },
+    {
+      "epoch": 0.1715686274509804,
+      "grad_norm": 1.8203125,
+      "kd_loss": 0.023172084242105484,
+      "learning_rate": 3.734604394727419e-06,
+      "loss": 0.0325,
+      "step": 595,
+      "student_loss": 0.00059064372908324,
+      "teacher_loss": 0.0005090903723612428
+    },
+    {
+      "epoch": 0.17301038062283736,
+      "grad_norm": 1.1640625,
+      "kd_loss": 0.038484539836645126,
+      "learning_rate": 3.656440744218464e-06,
+      "loss": 0.0346,
+      "step": 600,
+      "student_loss": 0.041177455335855484,
+      "teacher_loss": 0.004930450581014156
+    },
+    {
+      "epoch": 0.17445213379469435,
+      "grad_norm": 2.515625,
+      "kd_loss": 0.04977564886212349,
+      "learning_rate": 3.578629418838757e-06,
+      "loss": 0.0334,
+      "step": 605,
+      "student_loss": 0.07967430353164673,
+      "teacher_loss": 0.03648905083537102
+    },
+    {
+      "epoch": 0.17589388696655134,
+      "grad_norm": 1.34375,
+      "kd_loss": 0.04043799638748169,
+      "learning_rate": 3.5011908232598124e-06,
+      "loss": 0.0305,
+      "step": 610,
+      "student_loss": 0.00654568150639534,
+      "teacher_loss": 0.00039838344673626125
+    },
+    {
+      "epoch": 0.1773356401384083,
+      "grad_norm": 1.109375,
+      "kd_loss": 0.023836029693484306,
+      "learning_rate": 3.4241452644112085e-06,
+      "loss": 0.0266,
+      "step": 615,
+      "student_loss": 0.0002936046803370118,
+      "teacher_loss": 0.0003476935962680727
+    },
+    {
+      "epoch": 0.1787773933102653,
+      "grad_norm": 1.1640625,
+      "kd_loss": 0.033266592770814896,
+      "learning_rate": 3.3475129461554567e-06,
+      "loss": 0.0375,
+      "step": 620,
+      "student_loss": 0.01093846745789051,
+      "teacher_loss": 0.007155448663979769
+    },
+    {
+      "epoch": 0.18021914648212226,
+      "grad_norm": 1.4609375,
+      "kd_loss": 0.02975156530737877,
+      "learning_rate": 3.271313963989886e-06,
+      "loss": 0.0286,
+      "step": 625,
+      "student_loss": 0.014491051435470581,
+      "teacher_loss": 0.004247736185789108
+    },
+    {
+      "epoch": 0.18166089965397925,
+      "grad_norm": 0.98046875,
+      "kd_loss": 0.04514408856630325,
+      "learning_rate": 3.195568299776945e-06,
+      "loss": 0.0303,
+      "step": 630,
+      "student_loss": 0.0005301684141159058,
+      "teacher_loss": 0.0009141535847447813
+    },
+    {
+      "epoch": 0.1831026528258362,
+      "grad_norm": 0.82421875,
+      "kd_loss": 0.03230506554245949,
+      "learning_rate": 3.1202958165043053e-06,
+      "loss": 0.0417,
+      "step": 635,
+      "student_loss": 0.0006954488926567137,
+      "teacher_loss": 0.0002712230198085308
+    },
+    {
+      "epoch": 0.1845444059976932,
+      "grad_norm": 1.46875,
+      "kd_loss": 0.02938377857208252,
+      "learning_rate": 3.045516253076137e-06,
+      "loss": 0.0276,
+      "step": 640,
+      "student_loss": 0.0007398583693429828,
+      "teacher_loss": 0.00036641399492509663
+    },
+    {
+      "epoch": 0.18598615916955016,
+      "grad_norm": 0.90234375,
+      "kd_loss": 0.03232554718852043,
+      "learning_rate": 2.9712492191369245e-06,
+      "loss": 0.0279,
+      "step": 645,
+      "student_loss": 0.0006595580489374697,
+      "teacher_loss": 0.0004537871282082051
+    },
+    {
+      "epoch": 0.18742791234140715,
+      "grad_norm": 0.72265625,
+      "kd_loss": 0.02090008556842804,
+      "learning_rate": 2.8975141899291777e-06,
+      "loss": 0.0271,
+      "step": 650,
+      "student_loss": 0.000823685375507921,
+      "teacher_loss": 0.0008547761826775968
+    },
+    {
+      "epoch": 0.18886966551326412,
+      "grad_norm": 1.0546875,
+      "kd_loss": 0.04636139050126076,
+      "learning_rate": 2.8243305011863843e-06,
+      "loss": 0.0281,
+      "step": 655,
+      "student_loss": 0.0005695814033970237,
+      "teacher_loss": 0.00031179824145510793
+    },
+    {
+      "epoch": 0.1903114186851211,
+      "grad_norm": 1.6875,
+      "kd_loss": 0.024737229570746422,
+      "learning_rate": 2.751717344062552e-06,
+      "loss": 0.0316,
+      "step": 660,
+      "student_loss": 0.0006874947575852275,
+      "teacher_loss": 0.002697322051972151
+    },
+    {
+      "epoch": 0.19175317185697807,
+      "grad_norm": 0.953125,
+      "kd_loss": 0.06534231454133987,
+      "learning_rate": 2.6796937600996587e-06,
+      "loss": 0.0336,
+      "step": 665,
+      "student_loss": 0.0013298896374180913,
+      "teacher_loss": 0.0003429361677262932
+    },
+    {
+      "epoch": 0.19319492502883506,
+      "grad_norm": 1.3046875,
+      "kd_loss": 0.03078162856400013,
+      "learning_rate": 2.6082786362343377e-06,
+      "loss": 0.035,
+      "step": 670,
+      "student_loss": 0.0029936288483440876,
+      "teacher_loss": 0.0007150344317778945
+    },
+    {
+      "epoch": 0.19463667820069205,
+      "grad_norm": 0.94140625,
+      "kd_loss": 0.019795667380094528,
+      "learning_rate": 2.5374906998451094e-06,
+      "loss": 0.0333,
+      "step": 675,
+      "student_loss": 0.0008592633530497551,
+      "teacher_loss": 0.0008608284406363964
+    },
+    {
+      "epoch": 0.19607843137254902,
+      "grad_norm": 1.34375,
+      "kd_loss": 0.061516523361206055,
+      "learning_rate": 2.467348513841447e-06,
+      "loss": 0.0279,
+      "step": 680,
+      "student_loss": 0.11378592997789383,
+      "teacher_loss": 0.04401650279760361
+    },
+    {
+      "epoch": 0.197520184544406,
+      "grad_norm": 1.1953125,
+      "kd_loss": 0.02606314793229103,
+      "learning_rate": 2.3978704717959777e-06,
+      "loss": 0.0311,
+      "step": 685,
+      "student_loss": 0.037850309163331985,
+      "teacher_loss": 0.0306412260979414
+    },
+    {
+      "epoch": 0.19896193771626297,
+      "grad_norm": 2.078125,
+      "kd_loss": 0.05581173300743103,
+      "learning_rate": 2.329074793121085e-06,
+      "loss": 0.0321,
+      "step": 690,
+      "student_loss": 0.007378917653113604,
+      "teacher_loss": 0.01118262205272913
+    },
+    {
+      "epoch": 0.20040369088811996,
+      "grad_norm": 0.984375,
+      "kd_loss": 0.035412125289440155,
+      "learning_rate": 2.260979518291186e-06,
+      "loss": 0.0349,
+      "step": 695,
+      "student_loss": 0.0029192951042205095,
+      "teacher_loss": 0.007233661133795977
+    },
+    {
+      "epoch": 0.20184544405997693,
+      "grad_norm": 1.3359375,
+      "kd_loss": 0.024152226746082306,
+      "learning_rate": 2.1936025041119268e-06,
+      "loss": 0.031,
+      "step": 700,
+      "student_loss": 0.0007802582113072276,
+      "teacher_loss": 0.0007531556766480207
+    },
+    {
+      "epoch": 0.20328719723183392,
+      "grad_norm": 1.796875,
+      "kd_loss": 0.02112882025539875,
+      "learning_rate": 2.1269614190375477e-06,
+      "loss": 0.0307,
+      "step": 705,
+      "student_loss": 0.001995673170313239,
+      "teacher_loss": 0.0013640234246850014
+    },
+    {
+      "epoch": 0.20472895040369088,
+      "grad_norm": 0.76171875,
+      "kd_loss": 0.060643333941698074,
+      "learning_rate": 2.061073738537635e-06,
+      "loss": 0.0335,
+      "step": 710,
+      "student_loss": 0.005931171588599682,
+      "teacher_loss": 0.00436427490785718
+    },
+    {
+      "epoch": 0.20617070357554787,
+      "grad_norm": 0.9453125,
+      "kd_loss": 0.025081951171159744,
+      "learning_rate": 1.9959567405144825e-06,
+      "loss": 0.0378,
+      "step": 715,
+      "student_loss": 0.0024764742702245712,
+      "teacher_loss": 0.005250995047390461
+    },
+    {
+      "epoch": 0.20761245674740483,
+      "grad_norm": 0.9765625,
+      "kd_loss": 0.023644844070076942,
+      "learning_rate": 1.931627500772263e-06,
+      "loss": 0.0328,
+      "step": 720,
+      "student_loss": 0.0015447784680873156,
+      "teacher_loss": 0.0032466305419802666
+    },
+    {
+      "epoch": 0.20905420991926182,
+      "grad_norm": 0.828125,
+      "kd_loss": 0.02127092145383358,
+      "learning_rate": 1.8681028885391905e-06,
+      "loss": 0.0261,
+      "step": 725,
+      "student_loss": 0.00040122531936503947,
+      "teacher_loss": 0.00032715691486373544
+    },
+    {
+      "epoch": 0.2104959630911188,
+      "grad_norm": 1.21875,
+      "kd_loss": 0.03333493322134018,
+      "learning_rate": 1.8053995620438625e-06,
+      "loss": 0.0308,
+      "step": 730,
+      "student_loss": 0.0026169014163315296,
+      "teacher_loss": 0.0015284213004633784
+    },
+    {
+      "epoch": 0.21193771626297578,
+      "grad_norm": 1.2578125,
+      "kd_loss": 0.03426002338528633,
+      "learning_rate": 1.743533964146924e-06,
+      "loss": 0.0366,
+      "step": 735,
+      "student_loss": 0.0005074241198599339,
+      "teacher_loss": 0.0002774895401671529
+    },
+    {
+      "epoch": 0.21337946943483277,
+      "grad_norm": 1.0703125,
+      "kd_loss": 0.03785933926701546,
+      "learning_rate": 1.6825223180292138e-06,
+      "loss": 0.0288,
+      "step": 740,
+      "student_loss": 0.0019946948159486055,
+      "teacher_loss": 0.00017226416093762964
+    },
+    {
+      "epoch": 0.21482122260668973,
+      "grad_norm": 0.98828125,
+      "kd_loss": 0.024361569434404373,
+      "learning_rate": 1.6223806229375182e-06,
+      "loss": 0.0301,
+      "step": 745,
+      "student_loss": 0.04029940441250801,
+      "teacher_loss": 0.018720334395766258
+    },
+    {
+      "epoch": 0.21626297577854672,
+      "grad_norm": 1.671875,
+      "kd_loss": 0.06201966851949692,
+      "learning_rate": 1.563124649989043e-06,
+      "loss": 0.0294,
+      "step": 750,
+      "student_loss": 0.01399194449186325,
+      "teacher_loss": 0.004409614019095898
+    },
+    {
+      "epoch": 0.2177047289504037,
+      "grad_norm": 1.7421875,
+      "kd_loss": 0.024494940415024757,
+      "learning_rate": 1.5047699380357134e-06,
+      "loss": 0.035,
+      "step": 755,
+      "student_loss": 0.0298053789883852,
+      "teacher_loss": 0.006942529231309891
+    },
+    {
+      "epoch": 0.21914648212226068,
+      "grad_norm": 1.5234375,
+      "kd_loss": 0.06555049866437912,
+      "learning_rate": 1.4473317895893773e-06,
+      "loss": 0.0292,
+      "step": 760,
+      "student_loss": 0.054234448820352554,
+      "teacher_loss": 0.0010094160679727793
+    },
+    {
+      "epoch": 0.22058823529411764,
+      "grad_norm": 1.1015625,
+      "kd_loss": 0.050752874463796616,
+      "learning_rate": 1.39082526680899e-06,
+      "loss": 0.0358,
+      "step": 765,
+      "student_loss": 0.0037405432667583227,
+      "teacher_loss": 0.0012883292511105537
+    },
+    {
+      "epoch": 0.22202998846597463,
+      "grad_norm": 1.4609375,
+      "kd_loss": 0.05563059076666832,
+      "learning_rate": 1.3352651875508204e-06,
+      "loss": 0.0312,
+      "step": 770,
+      "student_loss": 0.0009953544940799475,
+      "teacher_loss": 0.0020596326794475317
+    },
+    {
+      "epoch": 0.2234717416378316,
+      "grad_norm": 1.625,
+      "kd_loss": 0.03560180589556694,
+      "learning_rate": 1.2806661214827286e-06,
+      "loss": 0.0281,
+      "step": 775,
+      "student_loss": 0.002246091840788722,
+      "teacher_loss": 0.0005796651821583509
+    },
+    {
+      "epoch": 0.22491349480968859,
+      "grad_norm": 1.2421875,
+      "kd_loss": 0.029239855706691742,
+      "learning_rate": 1.2270423862635188e-06,
+      "loss": 0.0322,
+      "step": 780,
+      "student_loss": 0.0003545498475432396,
+      "teacher_loss": 0.00024268838751595467
+    },
+    {
+      "epoch": 0.22635524798154555,
+      "grad_norm": 1.53125,
+      "kd_loss": 0.019190076738595963,
+      "learning_rate": 1.1744080437883859e-06,
+      "loss": 0.0288,
+      "step": 785,
+      "student_loss": 0.0004118382930755615,
+      "teacher_loss": 0.0013559224316850305
+    },
+    {
+      "epoch": 0.22779700115340254,
+      "grad_norm": 1.71875,
+      "kd_loss": 0.01850035786628723,
+      "learning_rate": 1.1227768965014246e-06,
+      "loss": 0.0274,
+      "step": 790,
+      "student_loss": 0.006309983320534229,
+      "teacher_loss": 0.004283738788217306
+    },
+    {
+      "epoch": 0.2292387543252595,
+      "grad_norm": 1.1796875,
+      "kd_loss": 0.029416479170322418,
+      "learning_rate": 1.0721624837761768e-06,
+      "loss": 0.0311,
+      "step": 795,
+      "student_loss": 0.0004396810254547745,
+      "teacher_loss": 0.0006457470590248704
+    },
+    {
+      "epoch": 0.2306805074971165,
+      "grad_norm": 1.203125,
+      "kd_loss": 0.04670260474085808,
+      "learning_rate": 1.0225780783651689e-06,
+      "loss": 0.0362,
+      "step": 800,
+      "student_loss": 0.0542101114988327,
+      "teacher_loss": 0.0266594085842371
+    },
+    {
+      "epoch": 0.23212226066897348,
+      "grad_norm": 1.25,
+      "kd_loss": 0.019798053428530693,
+      "learning_rate": 9.740366829193587e-07,
+      "loss": 0.0339,
+      "step": 805,
+      "student_loss": 0.00048328418051823974,
+      "teacher_loss": 0.00044713987153954804
+    },
+    {
+      "epoch": 0.23356401384083045,
+      "grad_norm": 6.125,
+      "kd_loss": 0.03847261890769005,
+      "learning_rate": 9.265510265784189e-07,
+      "loss": 0.0292,
+      "step": 810,
+      "student_loss": 0.0004956105258315802,
+      "teacher_loss": 0.00024124003539327532
+    },
+    {
+      "epoch": 0.23500576701268744,
+      "grad_norm": 1.8671875,
+      "kd_loss": 0.02178817056119442,
+      "learning_rate": 8.801335616327378e-07,
+      "loss": 0.0305,
+      "step": 815,
+      "student_loss": 0.0008903697016648948,
+      "teacher_loss": 0.01014482881873846
+    },
+    {
+      "epoch": 0.2364475201845444,
+      "grad_norm": 1.6328125,
+      "kd_loss": 0.036839839071035385,
+      "learning_rate": 8.347964602580245e-07,
+      "loss": 0.0308,
+      "step": 820,
+      "student_loss": 0.008828239515423775,
+      "teacher_loss": 0.0019727114122360945
+    },
+    {
+      "epoch": 0.2378892733564014,
+      "grad_norm": 1.828125,
+      "kd_loss": 0.03977164626121521,
+      "learning_rate": 7.905516113233652e-07,
+      "loss": 0.0289,
+      "step": 825,
+      "student_loss": 0.0008772791479714215,
+      "teacher_loss": 0.016588712111115456
+    },
+    {
+      "epoch": 0.23933102652825836,
+      "grad_norm": 0.8359375,
+      "kd_loss": 0.03716457635164261,
+      "learning_rate": 7.474106172735746e-07,
+      "loss": 0.0306,
+      "step": 830,
+      "student_loss": 0.007541711442172527,
+      "teacher_loss": 0.004410990513861179
+    },
+    {
+      "epoch": 0.24077277970011535,
+      "grad_norm": 1.2890625,
+      "kd_loss": 0.028623711317777634,
+      "learning_rate": 7.053847910866513e-07,
+      "loss": 0.0283,
+      "step": 835,
+      "student_loss": 0.01320332009345293,
+      "teacher_loss": 0.004977709148079157
+    },
+    {
+      "epoch": 0.2422145328719723,
+      "grad_norm": 1.2265625,
+      "kd_loss": 0.038853954523801804,
+      "learning_rate": 6.644851533071556e-07,
+      "loss": 0.0303,
+      "step": 840,
+      "student_loss": 0.0004099447396583855,
+      "teacher_loss": 0.00023886460985522717
+    },
+    {
+      "epoch": 0.2436562860438293,
+      "grad_norm": 1.59375,
+      "kd_loss": 0.026872437447309494,
+      "learning_rate": 6.24722429156251e-07,
+      "loss": 0.0455,
+      "step": 845,
+      "student_loss": 0.0005929277976974845,
+      "teacher_loss": 0.0004925825633108616
+    },
+    {
+      "epoch": 0.24509803921568626,
+      "grad_norm": 1.953125,
+      "kd_loss": 0.045193299651145935,
+      "learning_rate": 5.861070457192081e-07,
+      "loss": 0.0324,
+      "step": 850,
+      "student_loss": 0.01363430917263031,
+      "teacher_loss": 0.00036361668026074767
+    },
+    {
+      "epoch": 0.24653979238754326,
+      "grad_norm": 0.83203125,
+      "kd_loss": 0.029308617115020752,
+      "learning_rate": 5.486491292110796e-07,
+      "loss": 0.0307,
+      "step": 855,
+      "student_loss": 0.00042216500150971115,
+      "teacher_loss": 0.00046156966709531844
+    },
+    {
+      "epoch": 0.24798154555940022,
+      "grad_norm": 1.5625,
+      "kd_loss": 0.028431078419089317,
+      "learning_rate": 5.123585023212785e-07,
+      "loss": 0.0361,
+      "step": 860,
+      "student_loss": 0.0017358119366690516,
+      "teacher_loss": 0.0008302436908707023
+    },
+    {
+      "epoch": 0.2494232987312572,
+      "grad_norm": 1.859375,
+      "kd_loss": 0.03732982277870178,
+      "learning_rate": 4.772446816377408e-07,
+      "loss": 0.0322,
+      "step": 865,
+      "student_loss": 0.0003742146072909236,
+      "teacher_loss": 0.0004992606700398028
+    },
+    {
+      "epoch": 0.2508650519031142,
+      "grad_norm": 1.2109375,
+      "kd_loss": 0.045105963945388794,
+      "learning_rate": 4.4331687515137614e-07,
+      "loss": 0.032,
+      "step": 870,
+      "student_loss": 0.0020948813762515783,
+      "teacher_loss": 0.0004395451978780329
+    },
+    {
+      "epoch": 0.25230680507497116,
+      "grad_norm": 1.1484375,
+      "kd_loss": 0.028816962614655495,
+      "learning_rate": 4.1058397984142405e-07,
+      "loss": 0.0309,
+      "step": 875,
+      "student_loss": 0.00026272557443007827,
+      "teacher_loss": 0.00027639055042527616
+    },
+    {
+      "epoch": 0.2537485582468281,
+      "grad_norm": 1.7265625,
+      "kd_loss": 0.036270152777433395,
+      "learning_rate": 3.790545793423761e-07,
+      "loss": 0.0331,
+      "step": 880,
+      "student_loss": 0.0007760376902297139,
+      "teacher_loss": 0.0004516106564551592
+    },
+    {
+      "epoch": 0.25519031141868515,
+      "grad_norm": 1.6875,
+      "kd_loss": 0.07943305373191833,
+      "learning_rate": 3.4873694169306915e-07,
+      "loss": 0.0268,
+      "step": 885,
+      "student_loss": 0.014952230267226696,
+      "teacher_loss": 0.05265064164996147
+    },
+    {
+      "epoch": 0.2566320645905421,
+      "grad_norm": 1.296875,
+      "kd_loss": 0.022127894684672356,
+      "learning_rate": 3.196390171685343e-07,
+      "loss": 0.0328,
+      "step": 890,
+      "student_loss": 0.000466528203105554,
+      "teacher_loss": 0.00047626433661207557
+    },
+    {
+      "epoch": 0.25807381776239907,
+      "grad_norm": 1.359375,
+      "kd_loss": 0.044547051191329956,
+      "learning_rate": 2.917684361951728e-07,
+      "loss": 0.0313,
+      "step": 895,
+      "student_loss": 0.012537115253508091,
+      "teacher_loss": 0.004908856004476547
+    },
+    {
+      "epoch": 0.25951557093425603,
+      "grad_norm": 1.2265625,
+      "kd_loss": 0.030574947595596313,
+      "learning_rate": 2.65132507349814e-07,
+      "loss": 0.0349,
+      "step": 900,
+      "student_loss": 0.02907959371805191,
+      "teacher_loss": 0.021374521777033806
+    },
+    {
+      "epoch": 0.26095732410611305,
+      "grad_norm": 0.87890625,
+      "kd_loss": 0.02914801612496376,
+      "learning_rate": 2.397382154431621e-07,
+      "loss": 0.0301,
+      "step": 905,
+      "student_loss": 0.0034685099963098764,
+      "teacher_loss": 0.002060187980532646
+    },
+    {
+      "epoch": 0.26239907727797,
+      "grad_norm": 1.984375,
+      "kd_loss": 0.0164653267711401,
+      "learning_rate": 2.1559221968815547e-07,
+      "loss": 0.0285,
+      "step": 910,
+      "student_loss": 0.0004796285356860608,
+      "teacher_loss": 0.0005832899478264153
+    },
+    {
+      "epoch": 0.263840830449827,
+      "grad_norm": 1.3046875,
+      "kd_loss": 0.049093734472990036,
+      "learning_rate": 1.9270085195370048e-07,
+      "loss": 0.0291,
+      "step": 915,
+      "student_loss": 0.0019156670896336436,
+      "teacher_loss": 0.0006669871509075165
+    },
+    {
+      "epoch": 0.26528258362168394,
+      "grad_norm": 0.77734375,
+      "kd_loss": 0.024058707058429718,
+      "learning_rate": 1.7107011510424766e-07,
+      "loss": 0.0306,
+      "step": 920,
+      "student_loss": 0.014235829003155231,
+      "teacher_loss": 0.012093200348317623
+    },
+    {
+      "epoch": 0.26672433679354096,
+      "grad_norm": 0.78515625,
+      "kd_loss": 0.020182941108942032,
+      "learning_rate": 1.5070568142564912e-07,
+      "loss": 0.0286,
+      "step": 925,
+      "student_loss": 0.0004554464831016958,
+      "teacher_loss": 0.00047793317935429513
+    },
+    {
+      "epoch": 0.2681660899653979,
+      "grad_norm": 1.1640625,
+      "kd_loss": 0.030689207836985588,
+      "learning_rate": 1.3161289113769405e-07,
+      "loss": 0.0296,
+      "step": 930,
+      "student_loss": 0.015923185274004936,
+      "teacher_loss": 0.0007836474105715752
+    },
+    {
+      "epoch": 0.2696078431372549,
+      "grad_norm": 1.1015625,
+      "kd_loss": 0.026608861982822418,
+      "learning_rate": 1.1379675099373489e-07,
+      "loss": 0.0283,
+      "step": 935,
+      "student_loss": 0.0022123996168375015,
+      "teacher_loss": 0.0022658726666122675
+    },
+    {
+      "epoch": 0.2710495963091119,
+      "grad_norm": 1.0625,
+      "kd_loss": 0.04891452193260193,
+      "learning_rate": 9.726193296774767e-08,
+      "loss": 0.032,
+      "step": 940,
+      "student_loss": 0.007106420584022999,
+      "teacher_loss": 0.001212292816489935
+    },
+    {
+      "epoch": 0.27249134948096887,
+      "grad_norm": 1.03125,
+      "kd_loss": 0.02279752679169178,
+      "learning_rate": 8.201277302919086e-08,
+      "loss": 0.0335,
+      "step": 945,
+      "student_loss": 0.009904825128614902,
+      "teacher_loss": 0.0015650882851332426
+    },
+    {
+      "epoch": 0.27393310265282583,
+      "grad_norm": 0.875,
+      "kd_loss": 0.031176593154668808,
+      "learning_rate": 6.805327000596995e-08,
+      "loss": 0.0286,
+      "step": 950,
+      "student_loss": 0.0004509532009251416,
+      "teacher_loss": 0.00026176305254921317
+    },
+    {
+      "epoch": 0.2753748558246828,
+      "grad_norm": 1.1875,
+      "kd_loss": 0.029909612610936165,
+      "learning_rate": 5.538708453581787e-08,
+      "loss": 0.0302,
+      "step": 955,
+      "student_loss": 0.0009037847630679607,
+      "teacher_loss": 0.0007386825745925307
+    },
+    {
+      "epoch": 0.2768166089965398,
+      "grad_norm": 1.8515625,
+      "kd_loss": 0.03285350278019905,
+      "learning_rate": 4.40175381063529e-08,
+      "loss": 0.0321,
+      "step": 960,
+      "student_loss": 0.0016368315555155277,
+      "teacher_loss": 0.0012363821733742952
+    },
+    {
+      "epoch": 0.2782583621683968,
+      "grad_norm": 1.3515625,
+      "kd_loss": 0.038273654878139496,
+      "learning_rate": 3.394761218407705e-08,
+      "loss": 0.0286,
+      "step": 965,
+      "student_loss": 0.010774951428174973,
+      "teacher_loss": 0.006840071175247431
+    },
+    {
+      "epoch": 0.27970011534025374,
+      "grad_norm": 1.1796875,
+      "kd_loss": 0.033728428184986115,
+      "learning_rate": 2.5179947432540376e-08,
+      "loss": 0.0303,
+      "step": 970,
+      "student_loss": 0.0003708581207320094,
+      "teacher_loss": 0.00017761511844582856
+    },
+    {
+      "epoch": 0.2811418685121107,
+      "grad_norm": 2.46875,
+      "kd_loss": 0.025371067225933075,
+      "learning_rate": 1.7716843019867646e-08,
+      "loss": 0.0281,
+      "step": 975,
+      "student_loss": 0.013592376373708248,
+      "teacher_loss": 0.0011630237568169832
+    },
+    {
+      "epoch": 0.2825836216839677,
+      "grad_norm": 0.90625,
+      "kd_loss": 0.03571967035531998,
+      "learning_rate": 1.156025601584676e-08,
+      "loss": 0.0317,
+      "step": 980,
+      "student_loss": 0.00045868984307162464,
+      "teacher_loss": 0.0002273312275065109
+    },
+    {
+      "epoch": 0.2840253748558247,
+      "grad_norm": 1.3046875,
+      "kd_loss": 0.030186688527464867,
+      "learning_rate": 6.711800878718144e-09,
+      "loss": 0.0318,
+      "step": 985,
+      "student_loss": 0.0002502851712051779,
+      "teacher_loss": 0.0002345747489016503
+    },
+    {
+      "epoch": 0.28546712802768165,
+      "grad_norm": 1.671875,
+      "kd_loss": 0.0296288151293993,
+      "learning_rate": 3.1727490318111953e-09,
+      "loss": 0.0292,
+      "step": 990,
+      "student_loss": 0.0013771216617897153,
+      "teacher_loss": 0.00027968850918114185
+    },
+    {
+      "epoch": 0.2869088811995386,
+      "grad_norm": 0.953125,
+      "kd_loss": 0.04763549193739891,
+      "learning_rate": 9.440285301370865e-10,
+      "loss": 0.0333,
+      "step": 995,
+      "student_loss": 0.0019419729942455888,
+      "teacher_loss": 0.0009000069694593549
+    },
+    {
+      "epoch": 0.28835063437139563,
+      "grad_norm": 1.1328125,
+      "kd_loss": 0.03819262236356735,
+      "learning_rate": 2.622381702066523e-11,
+      "loss": 0.0293,
+      "step": 1000,
+      "student_loss": 0.006347914692014456,
+      "teacher_loss": 0.00022976804757490754
+    },
+    {
+      "epoch": 0.28835063437139563,
+      "kd_loss": 0.03819262236356735,
+      "step": 1000,
+      "student_loss": 0.006347914692014456,
+      "teacher_loss": 0.00022976804757490754,
+      "total_flos": 0.0,
+      "train_loss": 0.038887605339288714,
+      "train_runtime": 42585.1685,
+      "train_samples_per_second": 0.376,
+      "train_steps_per_second": 0.023
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-1000/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/added_tokens.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|action_sep|>": 151670,
+  "<|arg_sep|>": 151671,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|call_sep|>": 151666,
+  "<|end_of_text|>": 151673,
+  "<|endoftext|>": 151643,
+  "<|exception_sep|>": 151669,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|frame_sep|>": 151672,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|latent_end|>": 151675,
+  "<|latent_start|>": 151674,
+  "<|line_sep|>": 151667,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|return_sep|>": 151668,
+  "<|trace_context_start|>": 151665,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 36,
+  "model_type": "qwen2",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.6",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151676
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e5a0fb2eb54ab86efa50b7fbd0472cd1a04e921f87f21a6abcd29b5f358429ed
+size 6187858991

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/thought_projector.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:523885c5ce55a07b08632c61501c45ac0817d35409b075d4e080b994be75d8ce
+size 16788033

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:83a790d654474f5dfe225f889afd0210313eb1083f942671f2c4b8e95a1c922b
+size 11424004

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,295 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<|trace_context_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151666": {
+      "content": "<|call_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151667": {
+      "content": "<|line_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151668": {
+      "content": "<|return_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151669": {
+      "content": "<|exception_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<|action_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<|arg_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<|frame_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<|latent_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151675": {
+      "content": "<|latent_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/trainer_state.json ADDED Viewed

	@@ -0,0 +1,434 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.05767012687427912,
+  "eval_steps": 500,
+  "global_step": 200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0014417531718569781,
+      "grad_norm": 64.5,
+      "kd_loss": 0.2920137941837311,
+      "learning_rate": 1.3333333333333334e-06,
+      "loss": 0.4591,
+      "step": 5,
+      "student_loss": 0.18811793625354767,
+      "teacher_loss": 0.0020189557690173388
+    },
+    {
+      "epoch": 0.0028835063437139563,
+      "grad_norm": 37.25,
+      "kd_loss": 0.27189651131629944,
+      "learning_rate": 3e-06,
+      "loss": 0.3445,
+      "step": 10,
+      "student_loss": 0.15471063554286957,
+      "teacher_loss": 0.00530141731724143
+    },
+    {
+      "epoch": 0.004325259515570935,
+      "grad_norm": 7.1875,
+      "kd_loss": 0.22213532030582428,
+      "learning_rate": 4.666666666666667e-06,
+      "loss": 0.2034,
+      "step": 15,
+      "student_loss": 0.007600904442369938,
+      "teacher_loss": 0.0011894693598151207
+    },
+    {
+      "epoch": 0.0057670126874279125,
+      "grad_norm": 7.21875,
+      "kd_loss": 0.14968645572662354,
+      "learning_rate": 6.333333333333333e-06,
+      "loss": 0.1464,
+      "step": 20,
+      "student_loss": 0.026719728484749794,
+      "teacher_loss": 0.0005942200659774244
+    },
+    {
+      "epoch": 0.00720876585928489,
+      "grad_norm": 2.703125,
+      "kd_loss": 0.11552157998085022,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 0.1171,
+      "step": 25,
+      "student_loss": 0.002865401329472661,
+      "teacher_loss": 0.0010151522001251578
+    },
+    {
+      "epoch": 0.00865051903114187,
+      "grad_norm": 1.484375,
+      "kd_loss": 0.07486728578805923,
+      "learning_rate": 9.666666666666667e-06,
+      "loss": 0.0901,
+      "step": 30,
+      "student_loss": 0.0011324587976559997,
+      "teacher_loss": 0.0005539655685424805
+    },
+    {
+      "epoch": 0.010092272202998846,
+      "grad_norm": 2.140625,
+      "kd_loss": 0.06898625195026398,
+      "learning_rate": 9.99958042442916e-06,
+      "loss": 0.0757,
+      "step": 35,
+      "student_loss": 0.0009480358567088842,
+      "teacher_loss": 0.0005732738063670695
+    },
+    {
+      "epoch": 0.011534025374855825,
+      "grad_norm": 1.296875,
+      "kd_loss": 0.0642521008849144,
+      "learning_rate": 9.997876019358083e-06,
+      "loss": 0.0685,
+      "step": 40,
+      "student_loss": 0.031368859112262726,
+      "teacher_loss": 0.027499590069055557
+    },
+    {
+      "epoch": 0.012975778546712802,
+      "grad_norm": 1.3359375,
+      "kd_loss": 0.11454806476831436,
+      "learning_rate": 9.99486100792044e-06,
+      "loss": 0.0541,
+      "step": 45,
+      "student_loss": 0.06097811087965965,
+      "teacher_loss": 0.0027094183024019003
+    },
+    {
+      "epoch": 0.01441753171856978,
+      "grad_norm": 1.5859375,
+      "kd_loss": 0.03876315429806709,
+      "learning_rate": 9.990536180750724e-06,
+      "loss": 0.0563,
+      "step": 50,
+      "student_loss": 0.02282995544373989,
+      "teacher_loss": 0.008940276689827442
+    },
+    {
+      "epoch": 0.015859284890426758,
+      "grad_norm": 1.125,
+      "kd_loss": 0.033022914081811905,
+      "learning_rate": 9.984902671959911e-06,
+      "loss": 0.0481,
+      "step": 55,
+      "student_loss": 0.0011594295501708984,
+      "teacher_loss": 0.0007985035772435367
+    },
+    {
+      "epoch": 0.01730103806228374,
+      "grad_norm": 0.8828125,
+      "kd_loss": 0.030243180692195892,
+      "learning_rate": 9.97796195883804e-06,
+      "loss": 0.0429,
+      "step": 60,
+      "student_loss": 0.0034187885466963053,
+      "teacher_loss": 0.0017242392059415579
+    },
+    {
+      "epoch": 0.018742791234140715,
+      "grad_norm": 1.4453125,
+      "kd_loss": 0.032282304018735886,
+      "learning_rate": 9.969715861466839e-06,
+      "loss": 0.0446,
+      "step": 65,
+      "student_loss": 0.029871758073568344,
+      "teacher_loss": 0.039595428854227066
+    },
+    {
+      "epoch": 0.020184544405997693,
+      "grad_norm": 1.1953125,
+      "kd_loss": 0.06090804189443588,
+      "learning_rate": 9.96016654224243e-06,
+      "loss": 0.0477,
+      "step": 70,
+      "student_loss": 0.017709577456116676,
+      "teacher_loss": 0.0022253147326409817
+    },
+    {
+      "epoch": 0.02162629757785467,
+      "grad_norm": 0.84765625,
+      "kd_loss": 0.024834414944052696,
+      "learning_rate": 9.94931650530827e-06,
+      "loss": 0.0418,
+      "step": 75,
+      "student_loss": 0.0012920524459332228,
+      "teacher_loss": 0.0008862126851454377
+    },
+    {
+      "epoch": 0.02306805074971165,
+      "grad_norm": 0.60546875,
+      "kd_loss": 0.025287121534347534,
+      "learning_rate": 9.93716859589851e-06,
+      "loss": 0.0365,
+      "step": 80,
+      "student_loss": 0.0013494148151949048,
+      "teacher_loss": 0.0008630359079688787
+    },
+    {
+      "epoch": 0.024509803921568627,
+      "grad_norm": 0.86328125,
+      "kd_loss": 0.024378223344683647,
+      "learning_rate": 9.923725999591846e-06,
+      "loss": 0.0395,
+      "step": 85,
+      "student_loss": 0.0005228935624472797,
+      "teacher_loss": 0.0004561410460155457
+    },
+    {
+      "epoch": 0.025951557093425604,
+      "grad_norm": 0.70703125,
+      "kd_loss": 0.07289917767047882,
+      "learning_rate": 9.908992241476189e-06,
+      "loss": 0.0394,
+      "step": 90,
+      "student_loss": 0.020422162488102913,
+      "teacher_loss": 0.0035499960649758577
+    },
+    {
+      "epoch": 0.027393310265282585,
+      "grad_norm": 1.125,
+      "kd_loss": 0.044955912977457047,
+      "learning_rate": 9.892971185224244e-06,
+      "loss": 0.0351,
+      "step": 95,
+      "student_loss": 0.008261171169579029,
+      "teacher_loss": 0.005078152287751436
+    },
+    {
+      "epoch": 0.02883506343713956,
+      "grad_norm": 0.83203125,
+      "kd_loss": 0.02227398194372654,
+      "learning_rate": 9.875667032080354e-06,
+      "loss": 0.0326,
+      "step": 100,
+      "student_loss": 0.0006025677430443466,
+      "teacher_loss": 0.00046476206625811756
+    },
+    {
+      "epoch": 0.03027681660899654,
+      "grad_norm": 1.2734375,
+      "kd_loss": 0.03532887250185013,
+      "learning_rate": 9.857084319758772e-06,
+      "loss": 0.036,
+      "step": 105,
+      "student_loss": 0.0034369053319096565,
+      "teacher_loss": 0.00029834595625288785
+    },
+    {
+      "epoch": 0.031718569780853516,
+      "grad_norm": 1.1640625,
+      "kd_loss": 0.04033924266695976,
+      "learning_rate": 9.837227921253747e-06,
+      "loss": 0.0375,
+      "step": 110,
+      "student_loss": 0.03080393560230732,
+      "teacher_loss": 0.018437745049595833
+    },
+    {
+      "epoch": 0.03316032295271049,
+      "grad_norm": 0.703125,
+      "kd_loss": 0.04197424277663231,
+      "learning_rate": 9.816103043561648e-06,
+      "loss": 0.0347,
+      "step": 115,
+      "student_loss": 0.0021668823901563883,
+      "teacher_loss": 0.00045062918798066676
+    },
+    {
+      "epoch": 0.03460207612456748,
+      "grad_norm": 0.98828125,
+      "kd_loss": 0.027563175186514854,
+      "learning_rate": 9.79371522631553e-06,
+      "loss": 0.032,
+      "step": 120,
+      "student_loss": 0.0016319500282406807,
+      "teacher_loss": 0.0008567498298361897
+    },
+    {
+      "epoch": 0.036043829296424454,
+      "grad_norm": 0.92578125,
+      "kd_loss": 0.06173818185925484,
+      "learning_rate": 9.770070340332457e-06,
+      "loss": 0.0364,
+      "step": 125,
+      "student_loss": 0.02385914884507656,
+      "teacher_loss": 0.00027849775506183505
+    },
+    {
+      "epoch": 0.03748558246828143,
+      "grad_norm": 1.0859375,
+      "kd_loss": 0.029248492792248726,
+      "learning_rate": 9.745174586073982e-06,
+      "loss": 0.0346,
+      "step": 130,
+      "student_loss": 0.0005455865757539868,
+      "teacher_loss": 0.0004959598300047219
+    },
+    {
+      "epoch": 0.03892733564013841,
+      "grad_norm": 0.953125,
+      "kd_loss": 0.0406946986913681,
+      "learning_rate": 9.719034492020183e-06,
+      "loss": 0.0377,
+      "step": 135,
+      "student_loss": 0.0013323032762855291,
+      "teacher_loss": 0.0005948346224613488
+    },
+    {
+      "epoch": 0.040369088811995385,
+      "grad_norm": 1.0390625,
+      "kd_loss": 0.023682042956352234,
+      "learning_rate": 9.691656912957686e-06,
+      "loss": 0.036,
+      "step": 140,
+      "student_loss": 0.0005881476681679487,
+      "teacher_loss": 0.0003679130459204316
+    },
+    {
+      "epoch": 0.04181084198385236,
+      "grad_norm": 0.5859375,
+      "kd_loss": 0.07271980494260788,
+      "learning_rate": 9.663049028182112e-06,
+      "loss": 0.0325,
+      "step": 145,
+      "student_loss": 0.028793470934033394,
+      "teacher_loss": 0.005983584560453892
+    },
+    {
+      "epoch": 0.04325259515570934,
+      "grad_norm": 0.609375,
+      "kd_loss": 0.01843745820224285,
+      "learning_rate": 9.633218339615433e-06,
+      "loss": 0.0316,
+      "step": 150,
+      "student_loss": 0.001051027444191277,
+      "teacher_loss": 0.000913174357265234
+    },
+    {
+      "epoch": 0.04469434832756632,
+      "grad_norm": 1.1171875,
+      "kd_loss": 0.023381218314170837,
+      "learning_rate": 9.602172669838721e-06,
+      "loss": 0.0381,
+      "step": 155,
+      "student_loss": 0.002775231609120965,
+      "teacher_loss": 0.0007182428380474448
+    },
+    {
+      "epoch": 0.0461361014994233,
+      "grad_norm": 1.6875,
+      "kd_loss": 0.03144950047135353,
+      "learning_rate": 9.569920160040815e-06,
+      "loss": 0.0344,
+      "step": 160,
+      "student_loss": 0.0778423622250557,
+      "teacher_loss": 0.044956743717193604
+    },
+    {
+      "epoch": 0.04757785467128028,
+      "grad_norm": 0.75,
+      "kd_loss": 0.03335012122988701,
+      "learning_rate": 9.536469267883432e-06,
+      "loss": 0.0311,
+      "step": 165,
+      "student_loss": 0.0012603362556546926,
+      "teacher_loss": 0.00417186226695776
+    },
+    {
+      "epoch": 0.049019607843137254,
+      "grad_norm": 1.8828125,
+      "kd_loss": 0.018793689087033272,
+      "learning_rate": 9.501828765283295e-06,
+      "loss": 0.0355,
+      "step": 170,
+      "student_loss": 0.0005546013708226383,
+      "teacher_loss": 0.0003494401171337813
+    },
+    {
+      "epoch": 0.05046136101499423,
+      "grad_norm": 1.09375,
+      "kd_loss": 0.027096513658761978,
+      "learning_rate": 9.466007736111846e-06,
+      "loss": 0.0322,
+      "step": 175,
+      "student_loss": 0.0030481775756925344,
+      "teacher_loss": 0.0003126203373540193
+    },
+    {
+      "epoch": 0.05190311418685121,
+      "grad_norm": 0.69140625,
+      "kd_loss": 0.03634097799658775,
+      "learning_rate": 9.429015573813163e-06,
+      "loss": 0.0302,
+      "step": 180,
+      "student_loss": 0.0012228424893692136,
+      "teacher_loss": 0.00034309024340473115
+    },
+    {
+      "epoch": 0.05334486735870819,
+      "grad_norm": 1.34375,
+      "kd_loss": 0.03704367205500603,
+      "learning_rate": 9.390861978940687e-06,
+      "loss": 0.0363,
+      "step": 185,
+      "student_loss": 0.035696618258953094,
+      "teacher_loss": 0.00034108557156287134
+    },
+    {
+      "epoch": 0.05478662053056517,
+      "grad_norm": 1.3046875,
+      "kd_loss": 0.052902791649103165,
+      "learning_rate": 9.351556956613423e-06,
+      "loss": 0.0321,
+      "step": 190,
+      "student_loss": 0.009327664040029049,
+      "teacher_loss": 0.0015260990476235747
+    },
+    {
+      "epoch": 0.056228373702422146,
+      "grad_norm": 2.828125,
+      "kd_loss": 0.05315268039703369,
+      "learning_rate": 9.31111081389227e-06,
+      "loss": 0.0327,
+      "step": 195,
+      "student_loss": 0.011893535032868385,
+      "teacher_loss": 0.0007797812577337027
+    },
+    {
+      "epoch": 0.05767012687427912,
+      "grad_norm": 0.8125,
+      "kd_loss": 0.03527738153934479,
+      "learning_rate": 9.269534157077177e-06,
+      "loss": 0.0394,
+      "step": 200,
+      "student_loss": 0.0005083663854748011,
+      "teacher_loss": 0.0002656931465025991
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-200/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/added_tokens.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|action_sep|>": 151670,
+  "<|arg_sep|>": 151671,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|call_sep|>": 151666,
+  "<|end_of_text|>": 151673,
+  "<|endoftext|>": 151643,
+  "<|exception_sep|>": 151669,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|frame_sep|>": 151672,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|latent_end|>": 151675,
+  "<|latent_start|>": 151674,
+  "<|line_sep|>": 151667,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|return_sep|>": 151668,
+  "<|trace_context_start|>": 151665,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 36,
+  "model_type": "qwen2",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.6",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151676
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a5b555de6ba696a39fd5cd9998487dd6e8c54b33d39ba83e48f21a287406f531
+size 6187858991

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/thought_projector.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:85684960a1d3635f643b98b163e29c8ad5fcdee2b4b15b6e9a2f5fd31f95c859
+size 16788033

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:83a790d654474f5dfe225f889afd0210313eb1083f942671f2c4b8e95a1c922b
+size 11424004

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,295 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<|trace_context_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151666": {
+      "content": "<|call_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151667": {
+      "content": "<|line_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151668": {
+      "content": "<|return_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151669": {
+      "content": "<|exception_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<|action_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<|arg_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<|frame_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<|latent_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151675": {
+      "content": "<|latent_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/trainer_state.json ADDED Viewed

	@@ -0,0 +1,834 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.11534025374855825,
+  "eval_steps": 500,
+  "global_step": 400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0014417531718569781,
+      "grad_norm": 64.5,
+      "kd_loss": 0.2920137941837311,
+      "learning_rate": 1.3333333333333334e-06,
+      "loss": 0.4591,
+      "step": 5,
+      "student_loss": 0.18811793625354767,
+      "teacher_loss": 0.0020189557690173388
+    },
+    {
+      "epoch": 0.0028835063437139563,
+      "grad_norm": 37.25,
+      "kd_loss": 0.27189651131629944,
+      "learning_rate": 3e-06,
+      "loss": 0.3445,
+      "step": 10,
+      "student_loss": 0.15471063554286957,
+      "teacher_loss": 0.00530141731724143
+    },
+    {
+      "epoch": 0.004325259515570935,
+      "grad_norm": 7.1875,
+      "kd_loss": 0.22213532030582428,
+      "learning_rate": 4.666666666666667e-06,
+      "loss": 0.2034,
+      "step": 15,
+      "student_loss": 0.007600904442369938,
+      "teacher_loss": 0.0011894693598151207
+    },
+    {
+      "epoch": 0.0057670126874279125,
+      "grad_norm": 7.21875,
+      "kd_loss": 0.14968645572662354,
+      "learning_rate": 6.333333333333333e-06,
+      "loss": 0.1464,
+      "step": 20,
+      "student_loss": 0.026719728484749794,
+      "teacher_loss": 0.0005942200659774244
+    },
+    {
+      "epoch": 0.00720876585928489,
+      "grad_norm": 2.703125,
+      "kd_loss": 0.11552157998085022,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 0.1171,
+      "step": 25,
+      "student_loss": 0.002865401329472661,
+      "teacher_loss": 0.0010151522001251578
+    },
+    {
+      "epoch": 0.00865051903114187,
+      "grad_norm": 1.484375,
+      "kd_loss": 0.07486728578805923,
+      "learning_rate": 9.666666666666667e-06,
+      "loss": 0.0901,
+      "step": 30,
+      "student_loss": 0.0011324587976559997,
+      "teacher_loss": 0.0005539655685424805
+    },
+    {
+      "epoch": 0.010092272202998846,
+      "grad_norm": 2.140625,
+      "kd_loss": 0.06898625195026398,
+      "learning_rate": 9.99958042442916e-06,
+      "loss": 0.0757,
+      "step": 35,
+      "student_loss": 0.0009480358567088842,
+      "teacher_loss": 0.0005732738063670695
+    },
+    {
+      "epoch": 0.011534025374855825,
+      "grad_norm": 1.296875,
+      "kd_loss": 0.0642521008849144,
+      "learning_rate": 9.997876019358083e-06,
+      "loss": 0.0685,
+      "step": 40,
+      "student_loss": 0.031368859112262726,
+      "teacher_loss": 0.027499590069055557
+    },
+    {
+      "epoch": 0.012975778546712802,
+      "grad_norm": 1.3359375,
+      "kd_loss": 0.11454806476831436,
+      "learning_rate": 9.99486100792044e-06,
+      "loss": 0.0541,
+      "step": 45,
+      "student_loss": 0.06097811087965965,
+      "teacher_loss": 0.0027094183024019003
+    },
+    {
+      "epoch": 0.01441753171856978,
+      "grad_norm": 1.5859375,
+      "kd_loss": 0.03876315429806709,
+      "learning_rate": 9.990536180750724e-06,
+      "loss": 0.0563,
+      "step": 50,
+      "student_loss": 0.02282995544373989,
+      "teacher_loss": 0.008940276689827442
+    },
+    {
+      "epoch": 0.015859284890426758,
+      "grad_norm": 1.125,
+      "kd_loss": 0.033022914081811905,
+      "learning_rate": 9.984902671959911e-06,
+      "loss": 0.0481,
+      "step": 55,
+      "student_loss": 0.0011594295501708984,
+      "teacher_loss": 0.0007985035772435367
+    },
+    {
+      "epoch": 0.01730103806228374,
+      "grad_norm": 0.8828125,
+      "kd_loss": 0.030243180692195892,
+      "learning_rate": 9.97796195883804e-06,
+      "loss": 0.0429,
+      "step": 60,
+      "student_loss": 0.0034187885466963053,
+      "teacher_loss": 0.0017242392059415579
+    },
+    {
+      "epoch": 0.018742791234140715,
+      "grad_norm": 1.4453125,
+      "kd_loss": 0.032282304018735886,
+      "learning_rate": 9.969715861466839e-06,
+      "loss": 0.0446,
+      "step": 65,
+      "student_loss": 0.029871758073568344,
+      "teacher_loss": 0.039595428854227066
+    },
+    {
+      "epoch": 0.020184544405997693,
+      "grad_norm": 1.1953125,
+      "kd_loss": 0.06090804189443588,
+      "learning_rate": 9.96016654224243e-06,
+      "loss": 0.0477,
+      "step": 70,
+      "student_loss": 0.017709577456116676,
+      "teacher_loss": 0.0022253147326409817
+    },
+    {
+      "epoch": 0.02162629757785467,
+      "grad_norm": 0.84765625,
+      "kd_loss": 0.024834414944052696,
+      "learning_rate": 9.94931650530827e-06,
+      "loss": 0.0418,
+      "step": 75,
+      "student_loss": 0.0012920524459332228,
+      "teacher_loss": 0.0008862126851454377
+    },
+    {
+      "epoch": 0.02306805074971165,
+      "grad_norm": 0.60546875,
+      "kd_loss": 0.025287121534347534,
+      "learning_rate": 9.93716859589851e-06,
+      "loss": 0.0365,
+      "step": 80,
+      "student_loss": 0.0013494148151949048,
+      "teacher_loss": 0.0008630359079688787
+    },
+    {
+      "epoch": 0.024509803921568627,
+      "grad_norm": 0.86328125,
+      "kd_loss": 0.024378223344683647,
+      "learning_rate": 9.923725999591846e-06,
+      "loss": 0.0395,
+      "step": 85,
+      "student_loss": 0.0005228935624472797,
+      "teacher_loss": 0.0004561410460155457
+    },
+    {
+      "epoch": 0.025951557093425604,
+      "grad_norm": 0.70703125,
+      "kd_loss": 0.07289917767047882,
+      "learning_rate": 9.908992241476189e-06,
+      "loss": 0.0394,
+      "step": 90,
+      "student_loss": 0.020422162488102913,
+      "teacher_loss": 0.0035499960649758577
+    },
+    {
+      "epoch": 0.027393310265282585,
+      "grad_norm": 1.125,
+      "kd_loss": 0.044955912977457047,
+      "learning_rate": 9.892971185224244e-06,
+      "loss": 0.0351,
+      "step": 95,
+      "student_loss": 0.008261171169579029,
+      "teacher_loss": 0.005078152287751436
+    },
+    {
+      "epoch": 0.02883506343713956,
+      "grad_norm": 0.83203125,
+      "kd_loss": 0.02227398194372654,
+      "learning_rate": 9.875667032080354e-06,
+      "loss": 0.0326,
+      "step": 100,
+      "student_loss": 0.0006025677430443466,
+      "teacher_loss": 0.00046476206625811756
+    },
+    {
+      "epoch": 0.03027681660899654,
+      "grad_norm": 1.2734375,
+      "kd_loss": 0.03532887250185013,
+      "learning_rate": 9.857084319758772e-06,
+      "loss": 0.036,
+      "step": 105,
+      "student_loss": 0.0034369053319096565,
+      "teacher_loss": 0.00029834595625288785
+    },
+    {
+      "epoch": 0.031718569780853516,
+      "grad_norm": 1.1640625,
+      "kd_loss": 0.04033924266695976,
+      "learning_rate": 9.837227921253747e-06,
+      "loss": 0.0375,
+      "step": 110,
+      "student_loss": 0.03080393560230732,
+      "teacher_loss": 0.018437745049595833
+    },
+    {
+      "epoch": 0.03316032295271049,
+      "grad_norm": 0.703125,
+      "kd_loss": 0.04197424277663231,
+      "learning_rate": 9.816103043561648e-06,
+      "loss": 0.0347,
+      "step": 115,
+      "student_loss": 0.0021668823901563883,
+      "teacher_loss": 0.00045062918798066676
+    },
+    {
+      "epoch": 0.03460207612456748,
+      "grad_norm": 0.98828125,
+      "kd_loss": 0.027563175186514854,
+      "learning_rate": 9.79371522631553e-06,
+      "loss": 0.032,
+      "step": 120,
+      "student_loss": 0.0016319500282406807,
+      "teacher_loss": 0.0008567498298361897
+    },
+    {
+      "epoch": 0.036043829296424454,
+      "grad_norm": 0.92578125,
+      "kd_loss": 0.06173818185925484,
+      "learning_rate": 9.770070340332457e-06,
+      "loss": 0.0364,
+      "step": 125,
+      "student_loss": 0.02385914884507656,
+      "teacher_loss": 0.00027849775506183505
+    },
+    {
+      "epoch": 0.03748558246828143,
+      "grad_norm": 1.0859375,
+      "kd_loss": 0.029248492792248726,
+      "learning_rate": 9.745174586073982e-06,
+      "loss": 0.0346,
+      "step": 130,
+      "student_loss": 0.0005455865757539868,
+      "teacher_loss": 0.0004959598300047219
+    },
+    {
+      "epoch": 0.03892733564013841,
+      "grad_norm": 0.953125,
+      "kd_loss": 0.0406946986913681,
+      "learning_rate": 9.719034492020183e-06,
+      "loss": 0.0377,
+      "step": 135,
+      "student_loss": 0.0013323032762855291,
+      "teacher_loss": 0.0005948346224613488
+    },
+    {
+      "epoch": 0.040369088811995385,
+      "grad_norm": 1.0390625,
+      "kd_loss": 0.023682042956352234,
+      "learning_rate": 9.691656912957686e-06,
+      "loss": 0.036,
+      "step": 140,
+      "student_loss": 0.0005881476681679487,
+      "teacher_loss": 0.0003679130459204316
+    },
+    {
+      "epoch": 0.04181084198385236,
+      "grad_norm": 0.5859375,
+      "kd_loss": 0.07271980494260788,
+      "learning_rate": 9.663049028182112e-06,
+      "loss": 0.0325,
+      "step": 145,
+      "student_loss": 0.028793470934033394,
+      "teacher_loss": 0.005983584560453892
+    },
+    {
+      "epoch": 0.04325259515570934,
+      "grad_norm": 0.609375,
+      "kd_loss": 0.01843745820224285,
+      "learning_rate": 9.633218339615433e-06,
+      "loss": 0.0316,
+      "step": 150,
+      "student_loss": 0.001051027444191277,
+      "teacher_loss": 0.000913174357265234
+    },
+    {
+      "epoch": 0.04469434832756632,
+      "grad_norm": 1.1171875,
+      "kd_loss": 0.023381218314170837,
+      "learning_rate": 9.602172669838721e-06,
+      "loss": 0.0381,
+      "step": 155,
+      "student_loss": 0.002775231609120965,
+      "teacher_loss": 0.0007182428380474448
+    },
+    {
+      "epoch": 0.0461361014994233,
+      "grad_norm": 1.6875,
+      "kd_loss": 0.03144950047135353,
+      "learning_rate": 9.569920160040815e-06,
+      "loss": 0.0344,
+      "step": 160,
+      "student_loss": 0.0778423622250557,
+      "teacher_loss": 0.044956743717193604
+    },
+    {
+      "epoch": 0.04757785467128028,
+      "grad_norm": 0.75,
+      "kd_loss": 0.03335012122988701,
+      "learning_rate": 9.536469267883432e-06,
+      "loss": 0.0311,
+      "step": 165,
+      "student_loss": 0.0012603362556546926,
+      "teacher_loss": 0.00417186226695776
+    },
+    {
+      "epoch": 0.049019607843137254,
+      "grad_norm": 1.8828125,
+      "kd_loss": 0.018793689087033272,
+      "learning_rate": 9.501828765283295e-06,
+      "loss": 0.0355,
+      "step": 170,
+      "student_loss": 0.0005546013708226383,
+      "teacher_loss": 0.0003494401171337813
+    },
+    {
+      "epoch": 0.05046136101499423,
+      "grad_norm": 1.09375,
+      "kd_loss": 0.027096513658761978,
+      "learning_rate": 9.466007736111846e-06,
+      "loss": 0.0322,
+      "step": 175,
+      "student_loss": 0.0030481775756925344,
+      "teacher_loss": 0.0003126203373540193
+    },
+    {
+      "epoch": 0.05190311418685121,
+      "grad_norm": 0.69140625,
+      "kd_loss": 0.03634097799658775,
+      "learning_rate": 9.429015573813163e-06,
+      "loss": 0.0302,
+      "step": 180,
+      "student_loss": 0.0012228424893692136,
+      "teacher_loss": 0.00034309024340473115
+    },
+    {
+      "epoch": 0.05334486735870819,
+      "grad_norm": 1.34375,
+      "kd_loss": 0.03704367205500603,
+      "learning_rate": 9.390861978940687e-06,
+      "loss": 0.0363,
+      "step": 185,
+      "student_loss": 0.035696618258953094,
+      "teacher_loss": 0.00034108557156287134
+    },
+    {
+      "epoch": 0.05478662053056517,
+      "grad_norm": 1.3046875,
+      "kd_loss": 0.052902791649103165,
+      "learning_rate": 9.351556956613423e-06,
+      "loss": 0.0321,
+      "step": 190,
+      "student_loss": 0.009327664040029049,
+      "teacher_loss": 0.0015260990476235747
+    },
+    {
+      "epoch": 0.056228373702422146,
+      "grad_norm": 2.828125,
+      "kd_loss": 0.05315268039703369,
+      "learning_rate": 9.31111081389227e-06,
+      "loss": 0.0327,
+      "step": 195,
+      "student_loss": 0.011893535032868385,
+      "teacher_loss": 0.0007797812577337027
+    },
+    {
+      "epoch": 0.05767012687427912,
+      "grad_norm": 0.8125,
+      "kd_loss": 0.03527738153934479,
+      "learning_rate": 9.269534157077177e-06,
+      "loss": 0.0394,
+      "step": 200,
+      "student_loss": 0.0005083663854748011,
+      "teacher_loss": 0.0002656931465025991
+    },
+    {
+      "epoch": 0.0591118800461361,
+      "grad_norm": 0.9921875,
+      "kd_loss": 0.02736036665737629,
+      "learning_rate": 9.226837888925813e-06,
+      "loss": 0.0341,
+      "step": 205,
+      "student_loss": 0.03568984195590019,
+      "teacher_loss": 0.027640890330076218
+    },
+    {
+      "epoch": 0.06055363321799308,
+      "grad_norm": 1.78125,
+      "kd_loss": 0.030339844524860382,
+      "learning_rate": 9.183033205794525e-06,
+      "loss": 0.0302,
+      "step": 210,
+      "student_loss": 0.000763049116358161,
+      "teacher_loss": 0.0003548153617884964
+    },
+    {
+      "epoch": 0.061995386389850055,
+      "grad_norm": 1.1484375,
+      "kd_loss": 0.03865697979927063,
+      "learning_rate": 9.13813159470227e-06,
+      "loss": 0.0326,
+      "step": 215,
+      "student_loss": 0.0005148217896930873,
+      "teacher_loss": 0.00020245747873559594
+    },
+    {
+      "epoch": 0.06343713956170703,
+      "grad_norm": 1.3671875,
+      "kd_loss": 0.04260854423046112,
+      "learning_rate": 9.092144830318357e-06,
+      "loss": 0.0316,
+      "step": 220,
+      "student_loss": 0.030991079285740852,
+      "teacher_loss": 0.00993641559034586
+    },
+    {
+      "epoch": 0.06487889273356401,
+      "grad_norm": 1.203125,
+      "kd_loss": 0.023958567529916763,
+      "learning_rate": 9.045084971874738e-06,
+      "loss": 0.0376,
+      "step": 225,
+      "student_loss": 0.043197184801101685,
+      "teacher_loss": 0.00216495874337852
+    },
+    {
+      "epoch": 0.06632064590542099,
+      "grad_norm": 1.7578125,
+      "kd_loss": 0.04370651766657829,
+      "learning_rate": 8.99696436000368e-06,
+      "loss": 0.0299,
+      "step": 230,
+      "student_loss": 0.005392159800976515,
+      "teacher_loss": 0.003661371534690261
+    },
+    {
+      "epoch": 0.06776239907727798,
+      "grad_norm": 0.8671875,
+      "kd_loss": 0.07619086652994156,
+      "learning_rate": 8.947795613501658e-06,
+      "loss": 0.0314,
+      "step": 235,
+      "student_loss": 0.008586333133280277,
+      "teacher_loss": 0.0005631999811157584
+    },
+    {
+      "epoch": 0.06920415224913495,
+      "grad_norm": 1.0625,
+      "kd_loss": 0.03111647628247738,
+      "learning_rate": 8.897591626020284e-06,
+      "loss": 0.034,
+      "step": 240,
+      "student_loss": 0.003906027879565954,
+      "teacher_loss": 0.0004787310608662665
+    },
+    {
+      "epoch": 0.07064590542099193,
+      "grad_norm": 1.453125,
+      "kd_loss": 0.017389042302966118,
+      "learning_rate": 8.846365562685178e-06,
+      "loss": 0.0279,
+      "step": 245,
+      "student_loss": 0.013969292864203453,
+      "teacher_loss": 0.006873726844787598
+    },
+    {
+      "epoch": 0.07208765859284891,
+      "grad_norm": 1.6015625,
+      "kd_loss": 0.0229345690459013,
+      "learning_rate": 8.794130856643635e-06,
+      "loss": 0.0311,
+      "step": 250,
+      "student_loss": 0.0008334179292432964,
+      "teacher_loss": 0.0003169570700265467
+    },
+    {
+      "epoch": 0.07352941176470588,
+      "grad_norm": 0.83203125,
+      "kd_loss": 0.01808979921042919,
+      "learning_rate": 8.74090120554202e-06,
+      "loss": 0.0312,
+      "step": 255,
+      "student_loss": 0.0003083710907958448,
+      "teacher_loss": 0.00030115401023067534
+    },
+    {
+      "epoch": 0.07497116493656286,
+      "grad_norm": 0.88671875,
+      "kd_loss": 0.026108454912900925,
+      "learning_rate": 8.686690567933803e-06,
+      "loss": 0.0333,
+      "step": 260,
+      "student_loss": 0.042571116238832474,
+      "teacher_loss": 0.03388316184282303
+    },
+    {
+      "epoch": 0.07641291810841984,
+      "grad_norm": 0.80078125,
+      "kd_loss": 0.016656002029776573,
+      "learning_rate": 8.63151315961915e-06,
+      "loss": 0.0317,
+      "step": 265,
+      "student_loss": 0.0003702428948599845,
+      "teacher_loss": 0.0002555136161390692
+    },
+    {
+      "epoch": 0.07785467128027682,
+      "grad_norm": 1.046875,
+      "kd_loss": 0.019304102286696434,
+      "learning_rate": 8.575383449917103e-06,
+      "loss": 0.0342,
+      "step": 270,
+      "student_loss": 0.001813149661757052,
+      "teacher_loss": 0.0011580288410186768
+    },
+    {
+      "epoch": 0.07929642445213379,
+      "grad_norm": 1.1796875,
+      "kd_loss": 0.023480774834752083,
+      "learning_rate": 8.518316157871232e-06,
+      "loss": 0.029,
+      "step": 275,
+      "student_loss": 0.04764978215098381,
+      "teacher_loss": 0.03439468517899513
+    },
+    {
+      "epoch": 0.08073817762399077,
+      "grad_norm": 1.015625,
+      "kd_loss": 0.03275206685066223,
+      "learning_rate": 8.460326248389825e-06,
+      "loss": 0.0289,
+      "step": 280,
+      "student_loss": 0.0005029537715017796,
+      "teacher_loss": 0.00019533037266228348
+    },
+    {
+      "epoch": 0.08217993079584775,
+      "grad_norm": 1.140625,
+      "kd_loss": 0.019457675516605377,
+      "learning_rate": 8.401428928321607e-06,
+      "loss": 0.0322,
+      "step": 285,
+      "student_loss": 0.0007758038118481636,
+      "teacher_loss": 0.000900130660738796
+    },
+    {
+      "epoch": 0.08362168396770472,
+      "grad_norm": 1.3125,
+      "kd_loss": 0.017802242189645767,
+      "learning_rate": 8.341639642468002e-06,
+      "loss": 0.0348,
+      "step": 290,
+      "student_loss": 0.023405462503433228,
+      "teacher_loss": 0.021540865302085876
+    },
+    {
+      "epoch": 0.0850634371395617,
+      "grad_norm": 1.03125,
+      "kd_loss": 0.01647804118692875,
+      "learning_rate": 8.280974069532999e-06,
+      "loss": 0.0328,
+      "step": 295,
+      "student_loss": 0.0006198033224791288,
+      "teacher_loss": 0.000554366793949157
+    },
+    {
+      "epoch": 0.08650519031141868,
+      "grad_norm": 0.9296875,
+      "kd_loss": 0.04784730449318886,
+      "learning_rate": 8.219448118011687e-06,
+      "loss": 0.0308,
+      "step": 300,
+      "student_loss": 0.015613794326782227,
+      "teacher_loss": 0.001725711626932025
+    },
+    {
+      "epoch": 0.08794694348327567,
+      "grad_norm": 1.1953125,
+      "kd_loss": 0.019965235143899918,
+      "learning_rate": 8.157077922018537e-06,
+      "loss": 0.0289,
+      "step": 305,
+      "student_loss": 0.006098510231822729,
+      "teacher_loss": 0.002777156652882695
+    },
+    {
+      "epoch": 0.08938869665513265,
+      "grad_norm": 1.0390625,
+      "kd_loss": 0.020748196169734,
+      "learning_rate": 8.093879837056486e-06,
+      "loss": 0.0309,
+      "step": 310,
+      "student_loss": 0.000493990199174732,
+      "teacher_loss": 0.00039537265547551215
+    },
+    {
+      "epoch": 0.09083044982698962,
+      "grad_norm": 1.4765625,
+      "kd_loss": 0.03892743960022926,
+      "learning_rate": 8.029870435728018e-06,
+      "loss": 0.0294,
+      "step": 315,
+      "student_loss": 0.0073717073537409306,
+      "teacher_loss": 0.00021180440671741962
+    },
+    {
+      "epoch": 0.0922722029988466,
+      "grad_norm": 1.3828125,
+      "kd_loss": 0.02363528683781624,
+      "learning_rate": 7.965066503389264e-06,
+      "loss": 0.0313,
+      "step": 320,
+      "student_loss": 0.0004119524674024433,
+      "teacher_loss": 0.00022927633835934103
+    },
+    {
+      "epoch": 0.09371395617070358,
+      "grad_norm": 1.7421875,
+      "kd_loss": 0.06230268254876137,
+      "learning_rate": 7.89948503374835e-06,
+      "loss": 0.0284,
+      "step": 325,
+      "student_loss": 0.009024329483509064,
+      "teacher_loss": 0.01963256485760212
+    },
+    {
+      "epoch": 0.09515570934256055,
+      "grad_norm": 0.90625,
+      "kd_loss": 0.04712303727865219,
+      "learning_rate": 7.833143224409076e-06,
+      "loss": 0.0302,
+      "step": 330,
+      "student_loss": 0.008720812387764454,
+      "teacher_loss": 0.0013501221546903253
+    },
+    {
+      "epoch": 0.09659746251441753,
+      "grad_norm": 0.9140625,
+      "kd_loss": 0.018766457214951515,
+      "learning_rate": 7.766058472361154e-06,
+      "loss": 0.0283,
+      "step": 335,
+      "student_loss": 0.001242243917658925,
+      "teacher_loss": 0.0023425817489624023
+    },
+    {
+      "epoch": 0.09803921568627451,
+      "grad_norm": 2.078125,
+      "kd_loss": 0.02911657840013504,
+      "learning_rate": 7.698248369418146e-06,
+      "loss": 0.0326,
+      "step": 340,
+      "student_loss": 0.009861858561635017,
+      "teacher_loss": 0.006796200294047594
+    },
+    {
+      "epoch": 0.09948096885813149,
+      "grad_norm": 1.2265625,
+      "kd_loss": 0.032899159938097,
+      "learning_rate": 7.629730697604314e-06,
+      "loss": 0.0335,
+      "step": 345,
+      "student_loss": 0.00739182299003005,
+      "teacher_loss": 0.0002512071805540472
+    },
+    {
+      "epoch": 0.10092272202998846,
+      "grad_norm": 1.0859375,
+      "kd_loss": 0.06120900437235832,
+      "learning_rate": 7.560523424491595e-06,
+      "loss": 0.0282,
+      "step": 350,
+      "student_loss": 0.003599822986871004,
+      "teacher_loss": 0.0007063632365316153
+    },
+    {
+      "epoch": 0.10236447520184544,
+      "grad_norm": 2.296875,
+      "kd_loss": 0.02087453007698059,
+      "learning_rate": 7.490644698487909e-06,
+      "loss": 0.035,
+      "step": 355,
+      "student_loss": 0.001389339566230774,
+      "teacher_loss": 0.0017551770433783531
+    },
+    {
+      "epoch": 0.10380622837370242,
+      "grad_norm": 0.90234375,
+      "kd_loss": 0.019557664170861244,
+      "learning_rate": 7.420112844078066e-06,
+      "loss": 0.0321,
+      "step": 360,
+      "student_loss": 0.0006360138650052249,
+      "teacher_loss": 0.0005147532792761922
+    },
+    {
+      "epoch": 0.1052479815455594,
+      "grad_norm": 1.3203125,
+      "kd_loss": 0.030097220093011856,
+      "learning_rate": 7.348946357018479e-06,
+      "loss": 0.0333,
+      "step": 365,
+      "student_loss": 0.04143820330500603,
+      "teacher_loss": 0.027219083160161972
+    },
+    {
+      "epoch": 0.10668973471741638,
+      "grad_norm": 0.9453125,
+      "kd_loss": 0.05344080179929733,
+      "learning_rate": 7.277163899486975e-06,
+      "loss": 0.0351,
+      "step": 370,
+      "student_loss": 0.021593965590000153,
+      "teacher_loss": 0.0005673202103935182
+    },
+    {
+      "epoch": 0.10813148788927336,
+      "grad_norm": 0.7265625,
+      "kd_loss": 0.03127056360244751,
+      "learning_rate": 7.204784295188959e-06,
+      "loss": 0.0287,
+      "step": 375,
+      "student_loss": 0.03163963928818703,
+      "teacher_loss": 0.020587248727679253
+    },
+    {
+      "epoch": 0.10957324106113034,
+      "grad_norm": 0.84375,
+      "kd_loss": 0.01984175480902195,
+      "learning_rate": 7.1318265244212305e-06,
+      "loss": 0.0311,
+      "step": 380,
+      "student_loss": 0.0005506337620317936,
+      "teacher_loss": 0.00038444914389401674
+    },
+    {
+      "epoch": 0.11101499423298732,
+      "grad_norm": 0.59765625,
+      "kd_loss": 0.02916550263762474,
+      "learning_rate": 7.05830971909472e-06,
+      "loss": 0.0294,
+      "step": 385,
+      "student_loss": 0.00045756620238535106,
+      "teacher_loss": 0.0003828653716482222
+    },
+    {
+      "epoch": 0.11245674740484429,
+      "grad_norm": 1.171875,
+      "kd_loss": 0.021802764385938644,
+      "learning_rate": 6.9842531577174865e-06,
+      "loss": 0.0271,
+      "step": 390,
+      "student_loss": 0.00026892355526797473,
+      "teacher_loss": 0.0002502185816410929
+    },
+    {
+      "epoch": 0.11389850057670127,
+      "grad_norm": 0.94140625,
+      "kd_loss": 0.020051907747983932,
+      "learning_rate": 6.9096762603392595e-06,
+      "loss": 0.0307,
+      "step": 395,
+      "student_loss": 0.0004568768781609833,
+      "teacher_loss": 0.00039133013342507184
+    },
+    {
+      "epoch": 0.11534025374855825,
+      "grad_norm": 0.78125,
+      "kd_loss": 0.03054620325565338,
+      "learning_rate": 6.834598583458862e-06,
+      "loss": 0.0275,
+      "step": 400,
+      "student_loss": 0.0005328103434294462,
+      "teacher_loss": 0.00015425821766257286
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-400/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/added_tokens.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|action_sep|>": 151670,
+  "<|arg_sep|>": 151671,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|call_sep|>": 151666,
+  "<|end_of_text|>": 151673,
+  "<|endoftext|>": 151643,
+  "<|exception_sep|>": 151669,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|frame_sep|>": 151672,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|latent_end|>": 151675,
+  "<|latent_start|>": 151674,
+  "<|line_sep|>": 151667,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|return_sep|>": 151668,
+  "<|trace_context_start|>": 151665,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 36,
+  "model_type": "qwen2",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.6",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151676
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6244ca33c3a3fef3be7e4a026d26ccc4173f9ba0ff320a9f37dcab8df189ad0b
+size 6187858991

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/thought_projector.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:80e5886a026245018a58a248884d160c0e06852fc283c5566d08bbc7cf5afe5d
+size 16788033

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:83a790d654474f5dfe225f889afd0210313eb1083f942671f2c4b8e95a1c922b
+size 11424004

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,295 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<|trace_context_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151666": {
+      "content": "<|call_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151667": {
+      "content": "<|line_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151668": {
+      "content": "<|return_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151669": {
+      "content": "<|exception_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<|action_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<|arg_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<|frame_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<|latent_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151675": {
+      "content": "<|latent_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1234 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.17301038062283736,
+  "eval_steps": 500,
+  "global_step": 600,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0014417531718569781,
+      "grad_norm": 64.5,
+      "kd_loss": 0.2920137941837311,
+      "learning_rate": 1.3333333333333334e-06,
+      "loss": 0.4591,
+      "step": 5,
+      "student_loss": 0.18811793625354767,
+      "teacher_loss": 0.0020189557690173388
+    },
+    {
+      "epoch": 0.0028835063437139563,
+      "grad_norm": 37.25,
+      "kd_loss": 0.27189651131629944,
+      "learning_rate": 3e-06,
+      "loss": 0.3445,
+      "step": 10,
+      "student_loss": 0.15471063554286957,
+      "teacher_loss": 0.00530141731724143
+    },
+    {
+      "epoch": 0.004325259515570935,
+      "grad_norm": 7.1875,
+      "kd_loss": 0.22213532030582428,
+      "learning_rate": 4.666666666666667e-06,
+      "loss": 0.2034,
+      "step": 15,
+      "student_loss": 0.007600904442369938,
+      "teacher_loss": 0.0011894693598151207
+    },
+    {
+      "epoch": 0.0057670126874279125,
+      "grad_norm": 7.21875,
+      "kd_loss": 0.14968645572662354,
+      "learning_rate": 6.333333333333333e-06,
+      "loss": 0.1464,
+      "step": 20,
+      "student_loss": 0.026719728484749794,
+      "teacher_loss": 0.0005942200659774244
+    },
+    {
+      "epoch": 0.00720876585928489,
+      "grad_norm": 2.703125,
+      "kd_loss": 0.11552157998085022,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 0.1171,
+      "step": 25,
+      "student_loss": 0.002865401329472661,
+      "teacher_loss": 0.0010151522001251578
+    },
+    {
+      "epoch": 0.00865051903114187,
+      "grad_norm": 1.484375,
+      "kd_loss": 0.07486728578805923,
+      "learning_rate": 9.666666666666667e-06,
+      "loss": 0.0901,
+      "step": 30,
+      "student_loss": 0.0011324587976559997,
+      "teacher_loss": 0.0005539655685424805
+    },
+    {
+      "epoch": 0.010092272202998846,
+      "grad_norm": 2.140625,
+      "kd_loss": 0.06898625195026398,
+      "learning_rate": 9.99958042442916e-06,
+      "loss": 0.0757,
+      "step": 35,
+      "student_loss": 0.0009480358567088842,
+      "teacher_loss": 0.0005732738063670695
+    },
+    {
+      "epoch": 0.011534025374855825,
+      "grad_norm": 1.296875,
+      "kd_loss": 0.0642521008849144,
+      "learning_rate": 9.997876019358083e-06,
+      "loss": 0.0685,
+      "step": 40,
+      "student_loss": 0.031368859112262726,
+      "teacher_loss": 0.027499590069055557
+    },
+    {
+      "epoch": 0.012975778546712802,
+      "grad_norm": 1.3359375,
+      "kd_loss": 0.11454806476831436,
+      "learning_rate": 9.99486100792044e-06,
+      "loss": 0.0541,
+      "step": 45,
+      "student_loss": 0.06097811087965965,
+      "teacher_loss": 0.0027094183024019003
+    },
+    {
+      "epoch": 0.01441753171856978,
+      "grad_norm": 1.5859375,
+      "kd_loss": 0.03876315429806709,
+      "learning_rate": 9.990536180750724e-06,
+      "loss": 0.0563,
+      "step": 50,
+      "student_loss": 0.02282995544373989,
+      "teacher_loss": 0.008940276689827442
+    },
+    {
+      "epoch": 0.015859284890426758,
+      "grad_norm": 1.125,
+      "kd_loss": 0.033022914081811905,
+      "learning_rate": 9.984902671959911e-06,
+      "loss": 0.0481,
+      "step": 55,
+      "student_loss": 0.0011594295501708984,
+      "teacher_loss": 0.0007985035772435367
+    },
+    {
+      "epoch": 0.01730103806228374,
+      "grad_norm": 0.8828125,
+      "kd_loss": 0.030243180692195892,
+      "learning_rate": 9.97796195883804e-06,
+      "loss": 0.0429,
+      "step": 60,
+      "student_loss": 0.0034187885466963053,
+      "teacher_loss": 0.0017242392059415579
+    },
+    {
+      "epoch": 0.018742791234140715,
+      "grad_norm": 1.4453125,
+      "kd_loss": 0.032282304018735886,
+      "learning_rate": 9.969715861466839e-06,
+      "loss": 0.0446,
+      "step": 65,
+      "student_loss": 0.029871758073568344,
+      "teacher_loss": 0.039595428854227066
+    },
+    {
+      "epoch": 0.020184544405997693,
+      "grad_norm": 1.1953125,
+      "kd_loss": 0.06090804189443588,
+      "learning_rate": 9.96016654224243e-06,
+      "loss": 0.0477,
+      "step": 70,
+      "student_loss": 0.017709577456116676,
+      "teacher_loss": 0.0022253147326409817
+    },
+    {
+      "epoch": 0.02162629757785467,
+      "grad_norm": 0.84765625,
+      "kd_loss": 0.024834414944052696,
+      "learning_rate": 9.94931650530827e-06,
+      "loss": 0.0418,
+      "step": 75,
+      "student_loss": 0.0012920524459332228,
+      "teacher_loss": 0.0008862126851454377
+    },
+    {
+      "epoch": 0.02306805074971165,
+      "grad_norm": 0.60546875,
+      "kd_loss": 0.025287121534347534,
+      "learning_rate": 9.93716859589851e-06,
+      "loss": 0.0365,
+      "step": 80,
+      "student_loss": 0.0013494148151949048,
+      "teacher_loss": 0.0008630359079688787
+    },
+    {
+      "epoch": 0.024509803921568627,
+      "grad_norm": 0.86328125,
+      "kd_loss": 0.024378223344683647,
+      "learning_rate": 9.923725999591846e-06,
+      "loss": 0.0395,
+      "step": 85,
+      "student_loss": 0.0005228935624472797,
+      "teacher_loss": 0.0004561410460155457
+    },
+    {
+      "epoch": 0.025951557093425604,
+      "grad_norm": 0.70703125,
+      "kd_loss": 0.07289917767047882,
+      "learning_rate": 9.908992241476189e-06,
+      "loss": 0.0394,
+      "step": 90,
+      "student_loss": 0.020422162488102913,
+      "teacher_loss": 0.0035499960649758577
+    },
+    {
+      "epoch": 0.027393310265282585,
+      "grad_norm": 1.125,
+      "kd_loss": 0.044955912977457047,
+      "learning_rate": 9.892971185224244e-06,
+      "loss": 0.0351,
+      "step": 95,
+      "student_loss": 0.008261171169579029,
+      "teacher_loss": 0.005078152287751436
+    },
+    {
+      "epoch": 0.02883506343713956,
+      "grad_norm": 0.83203125,
+      "kd_loss": 0.02227398194372654,
+      "learning_rate": 9.875667032080354e-06,
+      "loss": 0.0326,
+      "step": 100,
+      "student_loss": 0.0006025677430443466,
+      "teacher_loss": 0.00046476206625811756
+    },
+    {
+      "epoch": 0.03027681660899654,
+      "grad_norm": 1.2734375,
+      "kd_loss": 0.03532887250185013,
+      "learning_rate": 9.857084319758772e-06,
+      "loss": 0.036,
+      "step": 105,
+      "student_loss": 0.0034369053319096565,
+      "teacher_loss": 0.00029834595625288785
+    },
+    {
+      "epoch": 0.031718569780853516,
+      "grad_norm": 1.1640625,
+      "kd_loss": 0.04033924266695976,
+      "learning_rate": 9.837227921253747e-06,
+      "loss": 0.0375,
+      "step": 110,
+      "student_loss": 0.03080393560230732,
+      "teacher_loss": 0.018437745049595833
+    },
+    {
+      "epoch": 0.03316032295271049,
+      "grad_norm": 0.703125,
+      "kd_loss": 0.04197424277663231,
+      "learning_rate": 9.816103043561648e-06,
+      "loss": 0.0347,
+      "step": 115,
+      "student_loss": 0.0021668823901563883,
+      "teacher_loss": 0.00045062918798066676
+    },
+    {
+      "epoch": 0.03460207612456748,
+      "grad_norm": 0.98828125,
+      "kd_loss": 0.027563175186514854,
+      "learning_rate": 9.79371522631553e-06,
+      "loss": 0.032,
+      "step": 120,
+      "student_loss": 0.0016319500282406807,
+      "teacher_loss": 0.0008567498298361897
+    },
+    {
+      "epoch": 0.036043829296424454,
+      "grad_norm": 0.92578125,
+      "kd_loss": 0.06173818185925484,
+      "learning_rate": 9.770070340332457e-06,
+      "loss": 0.0364,
+      "step": 125,
+      "student_loss": 0.02385914884507656,
+      "teacher_loss": 0.00027849775506183505
+    },
+    {
+      "epoch": 0.03748558246828143,
+      "grad_norm": 1.0859375,
+      "kd_loss": 0.029248492792248726,
+      "learning_rate": 9.745174586073982e-06,
+      "loss": 0.0346,
+      "step": 130,
+      "student_loss": 0.0005455865757539868,
+      "teacher_loss": 0.0004959598300047219
+    },
+    {
+      "epoch": 0.03892733564013841,
+      "grad_norm": 0.953125,
+      "kd_loss": 0.0406946986913681,
+      "learning_rate": 9.719034492020183e-06,
+      "loss": 0.0377,
+      "step": 135,
+      "student_loss": 0.0013323032762855291,
+      "teacher_loss": 0.0005948346224613488
+    },
+    {
+      "epoch": 0.040369088811995385,
+      "grad_norm": 1.0390625,
+      "kd_loss": 0.023682042956352234,
+      "learning_rate": 9.691656912957686e-06,
+      "loss": 0.036,
+      "step": 140,
+      "student_loss": 0.0005881476681679487,
+      "teacher_loss": 0.0003679130459204316
+    },
+    {
+      "epoch": 0.04181084198385236,
+      "grad_norm": 0.5859375,
+      "kd_loss": 0.07271980494260788,
+      "learning_rate": 9.663049028182112e-06,
+      "loss": 0.0325,
+      "step": 145,
+      "student_loss": 0.028793470934033394,
+      "teacher_loss": 0.005983584560453892
+    },
+    {
+      "epoch": 0.04325259515570934,
+      "grad_norm": 0.609375,
+      "kd_loss": 0.01843745820224285,
+      "learning_rate": 9.633218339615433e-06,
+      "loss": 0.0316,
+      "step": 150,
+      "student_loss": 0.001051027444191277,
+      "teacher_loss": 0.000913174357265234
+    },
+    {
+      "epoch": 0.04469434832756632,
+      "grad_norm": 1.1171875,
+      "kd_loss": 0.023381218314170837,
+      "learning_rate": 9.602172669838721e-06,
+      "loss": 0.0381,
+      "step": 155,
+      "student_loss": 0.002775231609120965,
+      "teacher_loss": 0.0007182428380474448
+    },
+    {
+      "epoch": 0.0461361014994233,
+      "grad_norm": 1.6875,
+      "kd_loss": 0.03144950047135353,
+      "learning_rate": 9.569920160040815e-06,
+      "loss": 0.0344,
+      "step": 160,
+      "student_loss": 0.0778423622250557,
+      "teacher_loss": 0.044956743717193604
+    },
+    {
+      "epoch": 0.04757785467128028,
+      "grad_norm": 0.75,
+      "kd_loss": 0.03335012122988701,
+      "learning_rate": 9.536469267883432e-06,
+      "loss": 0.0311,
+      "step": 165,
+      "student_loss": 0.0012603362556546926,
+      "teacher_loss": 0.00417186226695776
+    },
+    {
+      "epoch": 0.049019607843137254,
+      "grad_norm": 1.8828125,
+      "kd_loss": 0.018793689087033272,
+      "learning_rate": 9.501828765283295e-06,
+      "loss": 0.0355,
+      "step": 170,
+      "student_loss": 0.0005546013708226383,
+      "teacher_loss": 0.0003494401171337813
+    },
+    {
+      "epoch": 0.05046136101499423,
+      "grad_norm": 1.09375,
+      "kd_loss": 0.027096513658761978,
+      "learning_rate": 9.466007736111846e-06,
+      "loss": 0.0322,
+      "step": 175,
+      "student_loss": 0.0030481775756925344,
+      "teacher_loss": 0.0003126203373540193
+    },
+    {
+      "epoch": 0.05190311418685121,
+      "grad_norm": 0.69140625,
+      "kd_loss": 0.03634097799658775,
+      "learning_rate": 9.429015573813163e-06,
+      "loss": 0.0302,
+      "step": 180,
+      "student_loss": 0.0012228424893692136,
+      "teacher_loss": 0.00034309024340473115
+    },
+    {
+      "epoch": 0.05334486735870819,
+      "grad_norm": 1.34375,
+      "kd_loss": 0.03704367205500603,
+      "learning_rate": 9.390861978940687e-06,
+      "loss": 0.0363,
+      "step": 185,
+      "student_loss": 0.035696618258953094,
+      "teacher_loss": 0.00034108557156287134
+    },
+    {
+      "epoch": 0.05478662053056517,
+      "grad_norm": 1.3046875,
+      "kd_loss": 0.052902791649103165,
+      "learning_rate": 9.351556956613423e-06,
+      "loss": 0.0321,
+      "step": 190,
+      "student_loss": 0.009327664040029049,
+      "teacher_loss": 0.0015260990476235747
+    },
+    {
+      "epoch": 0.056228373702422146,
+      "grad_norm": 2.828125,
+      "kd_loss": 0.05315268039703369,
+      "learning_rate": 9.31111081389227e-06,
+      "loss": 0.0327,
+      "step": 195,
+      "student_loss": 0.011893535032868385,
+      "teacher_loss": 0.0007797812577337027
+    },
+    {
+      "epoch": 0.05767012687427912,
+      "grad_norm": 0.8125,
+      "kd_loss": 0.03527738153934479,
+      "learning_rate": 9.269534157077177e-06,
+      "loss": 0.0394,
+      "step": 200,
+      "student_loss": 0.0005083663854748011,
+      "teacher_loss": 0.0002656931465025991
+    },
+    {
+      "epoch": 0.0591118800461361,
+      "grad_norm": 0.9921875,
+      "kd_loss": 0.02736036665737629,
+      "learning_rate": 9.226837888925813e-06,
+      "loss": 0.0341,
+      "step": 205,
+      "student_loss": 0.03568984195590019,
+      "teacher_loss": 0.027640890330076218
+    },
+    {
+      "epoch": 0.06055363321799308,
+      "grad_norm": 1.78125,
+      "kd_loss": 0.030339844524860382,
+      "learning_rate": 9.183033205794525e-06,
+      "loss": 0.0302,
+      "step": 210,
+      "student_loss": 0.000763049116358161,
+      "teacher_loss": 0.0003548153617884964
+    },
+    {
+      "epoch": 0.061995386389850055,
+      "grad_norm": 1.1484375,
+      "kd_loss": 0.03865697979927063,
+      "learning_rate": 9.13813159470227e-06,
+      "loss": 0.0326,
+      "step": 215,
+      "student_loss": 0.0005148217896930873,
+      "teacher_loss": 0.00020245747873559594
+    },
+    {
+      "epoch": 0.06343713956170703,
+      "grad_norm": 1.3671875,
+      "kd_loss": 0.04260854423046112,
+      "learning_rate": 9.092144830318357e-06,
+      "loss": 0.0316,
+      "step": 220,
+      "student_loss": 0.030991079285740852,
+      "teacher_loss": 0.00993641559034586
+    },
+    {
+      "epoch": 0.06487889273356401,
+      "grad_norm": 1.203125,
+      "kd_loss": 0.023958567529916763,
+      "learning_rate": 9.045084971874738e-06,
+      "loss": 0.0376,
+      "step": 225,
+      "student_loss": 0.043197184801101685,
+      "teacher_loss": 0.00216495874337852
+    },
+    {
+      "epoch": 0.06632064590542099,
+      "grad_norm": 1.7578125,
+      "kd_loss": 0.04370651766657829,
+      "learning_rate": 8.99696436000368e-06,
+      "loss": 0.0299,
+      "step": 230,
+      "student_loss": 0.005392159800976515,
+      "teacher_loss": 0.003661371534690261
+    },
+    {
+      "epoch": 0.06776239907727798,
+      "grad_norm": 0.8671875,
+      "kd_loss": 0.07619086652994156,
+      "learning_rate": 8.947795613501658e-06,
+      "loss": 0.0314,
+      "step": 235,
+      "student_loss": 0.008586333133280277,
+      "teacher_loss": 0.0005631999811157584
+    },
+    {
+      "epoch": 0.06920415224913495,
+      "grad_norm": 1.0625,
+      "kd_loss": 0.03111647628247738,
+      "learning_rate": 8.897591626020284e-06,
+      "loss": 0.034,
+      "step": 240,
+      "student_loss": 0.003906027879565954,
+      "teacher_loss": 0.0004787310608662665
+    },
+    {
+      "epoch": 0.07064590542099193,
+      "grad_norm": 1.453125,
+      "kd_loss": 0.017389042302966118,
+      "learning_rate": 8.846365562685178e-06,
+      "loss": 0.0279,
+      "step": 245,
+      "student_loss": 0.013969292864203453,
+      "teacher_loss": 0.006873726844787598
+    },
+    {
+      "epoch": 0.07208765859284891,
+      "grad_norm": 1.6015625,
+      "kd_loss": 0.0229345690459013,
+      "learning_rate": 8.794130856643635e-06,
+      "loss": 0.0311,
+      "step": 250,
+      "student_loss": 0.0008334179292432964,
+      "teacher_loss": 0.0003169570700265467
+    },
+    {
+      "epoch": 0.07352941176470588,
+      "grad_norm": 0.83203125,
+      "kd_loss": 0.01808979921042919,
+      "learning_rate": 8.74090120554202e-06,
+      "loss": 0.0312,
+      "step": 255,
+      "student_loss": 0.0003083710907958448,
+      "teacher_loss": 0.00030115401023067534
+    },
+    {
+      "epoch": 0.07497116493656286,
+      "grad_norm": 0.88671875,
+      "kd_loss": 0.026108454912900925,
+      "learning_rate": 8.686690567933803e-06,
+      "loss": 0.0333,
+      "step": 260,
+      "student_loss": 0.042571116238832474,
+      "teacher_loss": 0.03388316184282303
+    },
+    {
+      "epoch": 0.07641291810841984,
+      "grad_norm": 0.80078125,
+      "kd_loss": 0.016656002029776573,
+      "learning_rate": 8.63151315961915e-06,
+      "loss": 0.0317,
+      "step": 265,
+      "student_loss": 0.0003702428948599845,
+      "teacher_loss": 0.0002555136161390692
+    },
+    {
+      "epoch": 0.07785467128027682,
+      "grad_norm": 1.046875,
+      "kd_loss": 0.019304102286696434,
+      "learning_rate": 8.575383449917103e-06,
+      "loss": 0.0342,
+      "step": 270,
+      "student_loss": 0.001813149661757052,
+      "teacher_loss": 0.0011580288410186768
+    },
+    {
+      "epoch": 0.07929642445213379,
+      "grad_norm": 1.1796875,
+      "kd_loss": 0.023480774834752083,
+      "learning_rate": 8.518316157871232e-06,
+      "loss": 0.029,
+      "step": 275,
+      "student_loss": 0.04764978215098381,
+      "teacher_loss": 0.03439468517899513
+    },
+    {
+      "epoch": 0.08073817762399077,
+      "grad_norm": 1.015625,
+      "kd_loss": 0.03275206685066223,
+      "learning_rate": 8.460326248389825e-06,
+      "loss": 0.0289,
+      "step": 280,
+      "student_loss": 0.0005029537715017796,
+      "teacher_loss": 0.00019533037266228348
+    },
+    {
+      "epoch": 0.08217993079584775,
+      "grad_norm": 1.140625,
+      "kd_loss": 0.019457675516605377,
+      "learning_rate": 8.401428928321607e-06,
+      "loss": 0.0322,
+      "step": 285,
+      "student_loss": 0.0007758038118481636,
+      "teacher_loss": 0.000900130660738796
+    },
+    {
+      "epoch": 0.08362168396770472,
+      "grad_norm": 1.3125,
+      "kd_loss": 0.017802242189645767,
+      "learning_rate": 8.341639642468002e-06,
+      "loss": 0.0348,
+      "step": 290,
+      "student_loss": 0.023405462503433228,
+      "teacher_loss": 0.021540865302085876
+    },
+    {
+      "epoch": 0.0850634371395617,
+      "grad_norm": 1.03125,
+      "kd_loss": 0.01647804118692875,
+      "learning_rate": 8.280974069532999e-06,
+      "loss": 0.0328,
+      "step": 295,
+      "student_loss": 0.0006198033224791288,
+      "teacher_loss": 0.000554366793949157
+    },
+    {
+      "epoch": 0.08650519031141868,
+      "grad_norm": 0.9296875,
+      "kd_loss": 0.04784730449318886,
+      "learning_rate": 8.219448118011687e-06,
+      "loss": 0.0308,
+      "step": 300,
+      "student_loss": 0.015613794326782227,
+      "teacher_loss": 0.001725711626932025
+    },
+    {
+      "epoch": 0.08794694348327567,
+      "grad_norm": 1.1953125,
+      "kd_loss": 0.019965235143899918,
+      "learning_rate": 8.157077922018537e-06,
+      "loss": 0.0289,
+      "step": 305,
+      "student_loss": 0.006098510231822729,
+      "teacher_loss": 0.002777156652882695
+    },
+    {
+      "epoch": 0.08938869665513265,
+      "grad_norm": 1.0390625,
+      "kd_loss": 0.020748196169734,
+      "learning_rate": 8.093879837056486e-06,
+      "loss": 0.0309,
+      "step": 310,
+      "student_loss": 0.000493990199174732,
+      "teacher_loss": 0.00039537265547551215
+    },
+    {
+      "epoch": 0.09083044982698962,
+      "grad_norm": 1.4765625,
+      "kd_loss": 0.03892743960022926,
+      "learning_rate": 8.029870435728018e-06,
+      "loss": 0.0294,
+      "step": 315,
+      "student_loss": 0.0073717073537409306,
+      "teacher_loss": 0.00021180440671741962
+    },
+    {
+      "epoch": 0.0922722029988466,
+      "grad_norm": 1.3828125,
+      "kd_loss": 0.02363528683781624,
+      "learning_rate": 7.965066503389264e-06,
+      "loss": 0.0313,
+      "step": 320,
+      "student_loss": 0.0004119524674024433,
+      "teacher_loss": 0.00022927633835934103
+    },
+    {
+      "epoch": 0.09371395617070358,
+      "grad_norm": 1.7421875,
+      "kd_loss": 0.06230268254876137,
+      "learning_rate": 7.89948503374835e-06,
+      "loss": 0.0284,
+      "step": 325,
+      "student_loss": 0.009024329483509064,
+      "teacher_loss": 0.01963256485760212
+    },
+    {
+      "epoch": 0.09515570934256055,
+      "grad_norm": 0.90625,
+      "kd_loss": 0.04712303727865219,
+      "learning_rate": 7.833143224409076e-06,
+      "loss": 0.0302,
+      "step": 330,
+      "student_loss": 0.008720812387764454,
+      "teacher_loss": 0.0013501221546903253
+    },
+    {
+      "epoch": 0.09659746251441753,
+      "grad_norm": 0.9140625,
+      "kd_loss": 0.018766457214951515,
+      "learning_rate": 7.766058472361154e-06,
+      "loss": 0.0283,
+      "step": 335,
+      "student_loss": 0.001242243917658925,
+      "teacher_loss": 0.0023425817489624023
+    },
+    {
+      "epoch": 0.09803921568627451,
+      "grad_norm": 2.078125,
+      "kd_loss": 0.02911657840013504,
+      "learning_rate": 7.698248369418146e-06,
+      "loss": 0.0326,
+      "step": 340,
+      "student_loss": 0.009861858561635017,
+      "teacher_loss": 0.006796200294047594
+    },
+    {
+      "epoch": 0.09948096885813149,
+      "grad_norm": 1.2265625,
+      "kd_loss": 0.032899159938097,
+      "learning_rate": 7.629730697604314e-06,
+      "loss": 0.0335,
+      "step": 345,
+      "student_loss": 0.00739182299003005,
+      "teacher_loss": 0.0002512071805540472
+    },
+    {
+      "epoch": 0.10092272202998846,
+      "grad_norm": 1.0859375,
+      "kd_loss": 0.06120900437235832,
+      "learning_rate": 7.560523424491595e-06,
+      "loss": 0.0282,
+      "step": 350,
+      "student_loss": 0.003599822986871004,
+      "teacher_loss": 0.0007063632365316153
+    },
+    {
+      "epoch": 0.10236447520184544,
+      "grad_norm": 2.296875,
+      "kd_loss": 0.02087453007698059,
+      "learning_rate": 7.490644698487909e-06,
+      "loss": 0.035,
+      "step": 355,
+      "student_loss": 0.001389339566230774,
+      "teacher_loss": 0.0017551770433783531
+    },
+    {
+      "epoch": 0.10380622837370242,
+      "grad_norm": 0.90234375,
+      "kd_loss": 0.019557664170861244,
+      "learning_rate": 7.420112844078066e-06,
+      "loss": 0.0321,
+      "step": 360,
+      "student_loss": 0.0006360138650052249,
+      "teacher_loss": 0.0005147532792761922
+    },
+    {
+      "epoch": 0.1052479815455594,
+      "grad_norm": 1.3203125,
+      "kd_loss": 0.030097220093011856,
+      "learning_rate": 7.348946357018479e-06,
+      "loss": 0.0333,
+      "step": 365,
+      "student_loss": 0.04143820330500603,
+      "teacher_loss": 0.027219083160161972
+    },
+    {
+      "epoch": 0.10668973471741638,
+      "grad_norm": 0.9453125,
+      "kd_loss": 0.05344080179929733,
+      "learning_rate": 7.277163899486975e-06,
+      "loss": 0.0351,
+      "step": 370,
+      "student_loss": 0.021593965590000153,
+      "teacher_loss": 0.0005673202103935182
+    },
+    {
+      "epoch": 0.10813148788927336,
+      "grad_norm": 0.7265625,
+      "kd_loss": 0.03127056360244751,
+      "learning_rate": 7.204784295188959e-06,
+      "loss": 0.0287,
+      "step": 375,
+      "student_loss": 0.03163963928818703,
+      "teacher_loss": 0.020587248727679253
+    },
+    {
+      "epoch": 0.10957324106113034,
+      "grad_norm": 0.84375,
+      "kd_loss": 0.01984175480902195,
+      "learning_rate": 7.1318265244212305e-06,
+      "loss": 0.0311,
+      "step": 380,
+      "student_loss": 0.0005506337620317936,
+      "teacher_loss": 0.00038444914389401674
+    },
+    {
+      "epoch": 0.11101499423298732,
+      "grad_norm": 0.59765625,
+      "kd_loss": 0.02916550263762474,
+      "learning_rate": 7.05830971909472e-06,
+      "loss": 0.0294,
+      "step": 385,
+      "student_loss": 0.00045756620238535106,
+      "teacher_loss": 0.0003828653716482222
+    },
+    {
+      "epoch": 0.11245674740484429,
+      "grad_norm": 1.171875,
+      "kd_loss": 0.021802764385938644,
+      "learning_rate": 6.9842531577174865e-06,
+      "loss": 0.0271,
+      "step": 390,
+      "student_loss": 0.00026892355526797473,
+      "teacher_loss": 0.0002502185816410929
+    },
+    {
+      "epoch": 0.11389850057670127,
+      "grad_norm": 0.94140625,
+      "kd_loss": 0.020051907747983932,
+      "learning_rate": 6.9096762603392595e-06,
+      "loss": 0.0307,
+      "step": 395,
+      "student_loss": 0.0004568768781609833,
+      "teacher_loss": 0.00039133013342507184
+    },
+    {
+      "epoch": 0.11534025374855825,
+      "grad_norm": 0.78125,
+      "kd_loss": 0.03054620325565338,
+      "learning_rate": 6.834598583458862e-06,
+      "loss": 0.0275,
+      "step": 400,
+      "student_loss": 0.0005328103434294462,
+      "teacher_loss": 0.00015425821766257286
+    },
+    {
+      "epoch": 0.11678200692041522,
+      "grad_norm": 0.640625,
+      "kd_loss": 0.02604236640036106,
+      "learning_rate": 6.7590398148958625e-06,
+      "loss": 0.0335,
+      "step": 405,
+      "student_loss": 0.011330639012157917,
+      "teacher_loss": 0.00021900788124185055
+    },
+    {
+      "epoch": 0.1182237600922722,
+      "grad_norm": 0.8046875,
+      "kd_loss": 0.03826223686337471,
+      "learning_rate": 6.6830197686277945e-06,
+      "loss": 0.0366,
+      "step": 410,
+      "student_loss": 0.014453927055001259,
+      "teacher_loss": 0.00844341516494751
+    },
+    {
+      "epoch": 0.11966551326412918,
+      "grad_norm": 1.203125,
+      "kd_loss": 0.021850954741239548,
+      "learning_rate": 6.6065583795942625e-06,
+      "loss": 0.0347,
+      "step": 415,
+      "student_loss": 0.004759110510349274,
+      "teacher_loss": 0.0034685542341321707
+    },
+    {
+      "epoch": 0.12110726643598616,
+      "grad_norm": 0.93359375,
+      "kd_loss": 0.045137468725442886,
+      "learning_rate": 6.52967569846937e-06,
+      "loss": 0.0331,
+      "step": 420,
+      "student_loss": 0.04586087912321091,
+      "teacher_loss": 0.021448055282235146
+    },
+    {
+      "epoch": 0.12254901960784313,
+      "grad_norm": 1.4453125,
+      "kd_loss": 0.014929288066923618,
+      "learning_rate": 6.452391886403767e-06,
+      "loss": 0.0299,
+      "step": 425,
+      "student_loss": 0.002178685739636421,
+      "teacher_loss": 0.0021052202209830284
+    },
+    {
+      "epoch": 0.12399077277970011,
+      "grad_norm": 1.3671875,
+      "kd_loss": 0.0547032505273819,
+      "learning_rate": 6.374727209737743e-06,
+      "loss": 0.0368,
+      "step": 430,
+      "student_loss": 0.05219801887869835,
+      "teacher_loss": 0.060560259968042374
+    },
+    {
+      "epoch": 0.1254325259515571,
+      "grad_norm": 1.453125,
+      "kd_loss": 0.01706705428659916,
+      "learning_rate": 6.296702034686726e-06,
+      "loss": 0.0301,
+      "step": 435,
+      "student_loss": 0.0008379022474400699,
+      "teacher_loss": 0.0006224109092727304
+    },
+    {
+      "epoch": 0.12687427912341406,
+      "grad_norm": 1.0,
+      "kd_loss": 0.03890637308359146,
+      "learning_rate": 6.218336822000598e-06,
+      "loss": 0.0314,
+      "step": 440,
+      "student_loss": 0.012142423540353775,
+      "teacher_loss": 0.007434291299432516
+    },
+    {
+      "epoch": 0.12831603229527105,
+      "grad_norm": 0.890625,
+      "kd_loss": 0.03179040178656578,
+      "learning_rate": 6.139652121598219e-06,
+      "loss": 0.0313,
+      "step": 445,
+      "student_loss": 0.03341586887836456,
+      "teacher_loss": 0.018860887736082077
+    },
+    {
+      "epoch": 0.12975778546712802,
+      "grad_norm": 0.69140625,
+      "kd_loss": 0.03629063814878464,
+      "learning_rate": 6.060668567178561e-06,
+      "loss": 0.0329,
+      "step": 450,
+      "student_loss": 0.0010012147249653935,
+      "teacher_loss": 0.0014839700888842344
+    },
+    {
+      "epoch": 0.131199538638985,
+      "grad_norm": 0.8828125,
+      "kd_loss": 0.02263510413467884,
+      "learning_rate": 5.981406870809889e-06,
+      "loss": 0.0326,
+      "step": 455,
+      "student_loss": 0.0007265008171088994,
+      "teacher_loss": 0.0003751025360543281
+    },
+    {
+      "epoch": 0.13264129181084197,
+      "grad_norm": 1.2109375,
+      "kd_loss": 0.0492476262152195,
+      "learning_rate": 5.9018878174983674e-06,
+      "loss": 0.0295,
+      "step": 460,
+      "student_loss": 0.005101657006889582,
+      "teacher_loss": 0.0002927044697571546
+    },
+    {
+      "epoch": 0.13408304498269896,
+      "grad_norm": 1.2578125,
+      "kd_loss": 0.031007954850792885,
+      "learning_rate": 5.822132259737565e-06,
+      "loss": 0.034,
+      "step": 465,
+      "student_loss": 0.00047535731573589146,
+      "teacher_loss": 0.00024468303308822215
+    },
+    {
+      "epoch": 0.13552479815455595,
+      "grad_norm": 1.7265625,
+      "kd_loss": 0.02650834433734417,
+      "learning_rate": 5.742161112040237e-06,
+      "loss": 0.0313,
+      "step": 470,
+      "student_loss": 0.000378905504476279,
+      "teacher_loss": 0.0002708406245801598
+    },
+    {
+      "epoch": 0.13696655132641292,
+      "grad_norm": 1.171875,
+      "kd_loss": 0.03383399918675423,
+      "learning_rate": 5.661995345453867e-06,
+      "loss": 0.0289,
+      "step": 475,
+      "student_loss": 0.0004789994563907385,
+      "teacher_loss": 0.0002899342798627913
+    },
+    {
+      "epoch": 0.1384083044982699,
+      "grad_norm": 1.1953125,
+      "kd_loss": 0.03584924340248108,
+      "learning_rate": 5.581655982061367e-06,
+      "loss": 0.0422,
+      "step": 480,
+      "student_loss": 0.03862505778670311,
+      "teacher_loss": 0.036061737686395645
+    },
+    {
+      "epoch": 0.13985005767012687,
+      "grad_norm": 1.203125,
+      "kd_loss": 0.020711667835712433,
+      "learning_rate": 5.501164089468406e-06,
+      "loss": 0.0313,
+      "step": 485,
+      "student_loss": 0.0004981390666216612,
+      "teacher_loss": 0.0005692155100405216
+    },
+    {
+      "epoch": 0.14129181084198386,
+      "grad_norm": 1.3984375,
+      "kd_loss": 0.027012551203370094,
+      "learning_rate": 5.4205407752787884e-06,
+      "loss": 0.0367,
+      "step": 490,
+      "student_loss": 0.0008920110412873328,
+      "teacher_loss": 0.0007303191814571619
+    },
+    {
+      "epoch": 0.14273356401384082,
+      "grad_norm": 1.3671875,
+      "kd_loss": 0.040799129754304886,
+      "learning_rate": 5.339807181559359e-06,
+      "loss": 0.0399,
+      "step": 495,
+      "student_loss": 0.0009312813053838909,
+      "teacher_loss": 0.0005571586079895496
+    },
+    {
+      "epoch": 0.14417531718569782,
+      "grad_norm": 1.6875,
+      "kd_loss": 0.03058801032602787,
+      "learning_rate": 5.258984479295853e-06,
+      "loss": 0.0316,
+      "step": 500,
+      "student_loss": 0.00041255459655076265,
+      "teacher_loss": 0.00031951998244039714
+    },
+    {
+      "epoch": 0.14561707035755478,
+      "grad_norm": 1.734375,
+      "kd_loss": 0.02832438424229622,
+      "learning_rate": 5.1780938628411795e-06,
+      "loss": 0.0402,
+      "step": 505,
+      "student_loss": 0.0005547971813939512,
+      "teacher_loss": 0.0004580595705192536
+    },
+    {
+      "epoch": 0.14705882352941177,
+      "grad_norm": 2.28125,
+      "kd_loss": 0.019691068679094315,
+      "learning_rate": 5.097156544357567e-06,
+      "loss": 0.032,
+      "step": 510,
+      "student_loss": 0.00035549805033952,
+      "teacher_loss": 0.00029006152180954814
+    },
+    {
+      "epoch": 0.14850057670126873,
+      "grad_norm": 1.4609375,
+      "kd_loss": 0.028027402237057686,
+      "learning_rate": 5.016193748254045e-06,
+      "loss": 0.0299,
+      "step": 515,
+      "student_loss": 0.016550345346331596,
+      "teacher_loss": 0.00025012606056407094
+    },
+    {
+      "epoch": 0.14994232987312572,
+      "grad_norm": 1.7734375,
+      "kd_loss": 0.06320768594741821,
+      "learning_rate": 4.935226705620699e-06,
+      "loss": 0.0344,
+      "step": 520,
+      "student_loss": 0.04100845754146576,
+      "teacher_loss": 0.011567573063075542
+    },
+    {
+      "epoch": 0.1513840830449827,
+      "grad_norm": 0.859375,
+      "kd_loss": 0.032060496509075165,
+      "learning_rate": 4.8542766486612035e-06,
+      "loss": 0.033,
+      "step": 525,
+      "student_loss": 0.0004020243068225682,
+      "teacher_loss": 0.00020839735225308686
+    },
+    {
+      "epoch": 0.15282583621683968,
+      "grad_norm": 1.203125,
+      "kd_loss": 0.024977991357445717,
+      "learning_rate": 4.773364805125025e-06,
+      "loss": 0.0309,
+      "step": 530,
+      "student_loss": 0.0005168431089259684,
+      "teacher_loss": 0.00039579602889716625
+    },
+    {
+      "epoch": 0.15426758938869667,
+      "grad_norm": 2.046875,
+      "kd_loss": 0.03925255313515663,
+      "learning_rate": 4.6925123927408265e-06,
+      "loss": 0.0308,
+      "step": 535,
+      "student_loss": 0.0007517871563322842,
+      "teacher_loss": 0.00039392782491631806
+    },
+    {
+      "epoch": 0.15570934256055363,
+      "grad_norm": 1.8203125,
+      "kd_loss": 0.05933701992034912,
+      "learning_rate": 4.611740613652485e-06,
+      "loss": 0.0286,
+      "step": 540,
+      "student_loss": 0.0131508968770504,
+      "teacher_loss": 0.0005503469728864729
+    },
+    {
+      "epoch": 0.15715109573241062,
+      "grad_norm": 1.0390625,
+      "kd_loss": 0.023665662854909897,
+      "learning_rate": 4.531070648859186e-06,
+      "loss": 0.0282,
+      "step": 545,
+      "student_loss": 0.0027271448634564877,
+      "teacher_loss": 0.0027394567150622606
+    },
+    {
+      "epoch": 0.15859284890426759,
+      "grad_norm": 2.046875,
+      "kd_loss": 0.05410351976752281,
+      "learning_rate": 4.450523652661086e-06,
+      "loss": 0.0276,
+      "step": 550,
+      "student_loss": 0.00023584096925333142,
+      "teacher_loss": 0.0036266453098505735
+    },
+    {
+      "epoch": 0.16003460207612458,
+      "grad_norm": 1.1796875,
+      "kd_loss": 0.08308840543031693,
+      "learning_rate": 4.370120747111956e-06,
+      "loss": 0.0327,
+      "step": 555,
+      "student_loss": 0.046959906816482544,
+      "teacher_loss": 0.0017155115492641926
+    },
+    {
+      "epoch": 0.16147635524798154,
+      "grad_norm": 2.4375,
+      "kd_loss": 0.0380968302488327,
+      "learning_rate": 4.289883016480291e-06,
+      "loss": 0.0349,
+      "step": 560,
+      "student_loss": 0.001130731194280088,
+      "teacher_loss": 0.00036855213693343103
+    },
+    {
+      "epoch": 0.16291810841983853,
+      "grad_norm": 1.1328125,
+      "kd_loss": 0.03884750232100487,
+      "learning_rate": 4.209831501720328e-06,
+      "loss": 0.0325,
+      "step": 565,
+      "student_loss": 0.02741910330951214,
+      "teacher_loss": 0.019537584856152534
+    },
+    {
+      "epoch": 0.1643598615916955,
+      "grad_norm": 5.1875,
+      "kd_loss": 0.0330473892390728,
+      "learning_rate": 4.129987194954421e-06,
+      "loss": 0.0356,
+      "step": 570,
+      "student_loss": 0.010371256619691849,
+      "teacher_loss": 0.0070776245556771755
+    },
+    {
+      "epoch": 0.16580161476355249,
+      "grad_norm": 1.1015625,
+      "kd_loss": 0.028591087087988853,
+      "learning_rate": 4.050371033968216e-06,
+      "loss": 0.0328,
+      "step": 575,
+      "student_loss": 0.000467249978100881,
+      "teacher_loss": 0.0002494181098882109
+    },
+    {
+      "epoch": 0.16724336793540945,
+      "grad_norm": 0.80078125,
+      "kd_loss": 0.02640584297478199,
+      "learning_rate": 3.9710038967200825e-06,
+      "loss": 0.0284,
+      "step": 580,
+      "student_loss": 0.0017712963744997978,
+      "teacher_loss": 0.0019773358944803476
+    },
+    {
+      "epoch": 0.16868512110726644,
+      "grad_norm": 0.85546875,
+      "kd_loss": 0.030399348586797714,
+      "learning_rate": 3.89190659586623e-06,
+      "loss": 0.0281,
+      "step": 585,
+      "student_loss": 0.0004607080190908164,
+      "teacher_loss": 0.0002647584769874811
+    },
+    {
+      "epoch": 0.1701268742791234,
+      "grad_norm": 1.1328125,
+      "kd_loss": 0.03226058557629585,
+      "learning_rate": 3.8130998733029517e-06,
+      "loss": 0.0302,
+      "step": 590,
+      "student_loss": 0.0005413145408965647,
+      "teacher_loss": 0.0011012081522494555
+    },
+    {
+      "epoch": 0.1715686274509804,
+      "grad_norm": 1.8203125,
+      "kd_loss": 0.023172084242105484,
+      "learning_rate": 3.734604394727419e-06,
+      "loss": 0.0325,
+      "step": 595,
+      "student_loss": 0.00059064372908324,
+      "teacher_loss": 0.0005090903723612428
+    },
+    {
+      "epoch": 0.17301038062283736,
+      "grad_norm": 1.1640625,
+      "kd_loss": 0.038484539836645126,
+      "learning_rate": 3.656440744218464e-06,
+      "loss": 0.0346,
+      "step": 600,
+      "student_loss": 0.041177455335855484,
+      "teacher_loss": 0.004930450581014156
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-600/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-800/added_tokens.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|action_sep|>": 151670,
+  "<|arg_sep|>": 151671,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|call_sep|>": 151666,
+  "<|end_of_text|>": 151673,
+  "<|endoftext|>": 151643,
+  "<|exception_sep|>": 151669,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|frame_sep|>": 151672,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|latent_end|>": 151675,
+  "<|latent_start|>": 151674,
+  "<|line_sep|>": 151667,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|return_sep|>": 151668,
+  "<|trace_context_start|>": 151665,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-800/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-800/config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 36,
+  "model_type": "qwen2",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.6",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151676
+}

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-800/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/codi3b_a0.5_b1.0_g0.5_ls2/checkpoint-800/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9edcc00bfefc311d04e3e1a0d401e8fbed45afc64aae68aa648cda8c39bc16c0
+size 6187858991