cucl2 committed 2 days ago

Commit

85b84c2

verified ·

1 Parent(s): f90f7e7

Add files using upload-large-folder tool

Browse files

Files changed (28) hide show

.gitattributes +1 -0
README.md +71 -0
added_tokens.json +35 -0
chat_template.jinja +105 -0
config.json +528 -0
generation_config.json +8 -0
merges.txt +0 -0
model-00001-of-00013.safetensors +3 -0
model-00002-of-00013.safetensors +3 -0
model-00003-of-00013.safetensors +3 -0
model-00004-of-00013.safetensors +3 -0
model-00005-of-00013.safetensors +3 -0
model-00006-of-00013.safetensors +3 -0
model-00007-of-00013.safetensors +3 -0
model-00008-of-00013.safetensors +3 -0
model-00009-of-00013.safetensors +3 -0
model-00010-of-00013.safetensors +3 -0
model-00011-of-00013.safetensors +3 -0
model-00012-of-00013.safetensors +3 -0
model-00013-of-00013.safetensors +3 -0
model.safetensors.index.json +0 -0
preprocessor_config.json +30 -0
special_tokens_map.json +44 -0
tokenizer.json +3 -0
tokenizer_config.json +317 -0
trainer_state.json +1354 -0
video_preprocessor_config.json +54 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,71 @@

+---
+license: apache-2.0
+language:
+  - en
+  - zh
+library_name: transformers
+base_model: Qwen/Qwen3-Omni-30B-A3B-Captioner
+tags:
+  - audio
+  - audio-language-model
+  - instruction-following
+  - rubric-based-evaluation
+  - judge-model
+pipeline_tag: audio-text-to-text
+---
+# AnyAudio-Judge-30B
+`AnyAudio-Judge-30B` is the **dynamic rubric-based audio judge** reported in the AnyAudio-Judge paper (see the Citation section). It is initialized from [Qwen3-Omni-30B-A3B-Captioner](https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Captioner) and fine-tuned on the AnyAudio-Judge SFT Corpus.
+For each binary rubric item describing one verifiable aspect of an audio caption, the model predicts yes / no and produces a one-sentence evidence string drawn from what it heard in the audio. Aggregating the per-item soft probabilities yields a fine-grained alignment score that is significantly more sensitive to partial mismatches than a single holistic match/mismatch judgment.
+> Companion benchmark: [`cucl2/AnyAudio-Judge-Bench`](https://huggingface.co/datasets/cucl2/AnyAudio-Judge-Bench)
+> Companion corpus: [`cucl2/AnyAudio-Judge-Corpus`](https://huggingface.co/datasets/cucl2/AnyAudio-Judge-Corpus)
+> Smaller variant: [`cucl2/AnyAudio-Judge-7B`](https://huggingface.co/cucl2/AnyAudio-Judge-7B)
+## Headline numbers (AnyAudio-Judge Bench, accuracy ↑)
+| Model | Avg (en) | Avg (zh) |
+|---|---:|---:|
+| Qwen3-Omni-30B-A3B-Captioner (dynamic rubric, no fine-tuning) | 76.77 | 76.66 |
+| Gemini-2.5-Pro (holistic) | 77.72 | 80.01 |
+| **AnyAudio-Judge-30B (this checkpoint)** | **84.45** | **85.26** |
+## Training
+- **Base**: Qwen3-Omni-30B-A3B-Captioner
+- **Corpus**: 105K (audio, instruction, rubric, CoT) tuples
+- **Stage 1 — SFT**: full-parameter fine-tuning, 1 epoch, 16 × H20 96GB, lr 1e-5, per-device bs 4
+- **Stage 2 — GRPO** (separate release): LoRA r=16, α=32, 1 epoch on 8,454 hard samples
+This release contains only the **Stage 1 (SFT)** checkpoint, matching the "+SFT" row of the ablation table in the paper. The Stage 2 (GRPO) weights are not included here; the +GRPO improvement reported in the paper can be reproduced by running GRPO on top of this checkpoint.
+## Usage
+```python
+from anyaudio_judge import AnyAudioJudge, decompose_instruction
+caption = "A gentle, delicate female voice, with soft and smooth pitch, calm and restrained throughout."
+rubric  = decompose_instruction(caption)
+judge = AnyAudioJudge.from_pretrained("cucl2/AnyAudio-Judge-30B")
+result = judge.judge("./demo.wav", rubric)
+print("alignment_score:", result.score)
+for item in result.items:
+    print(item.question, "->", item.answer)
+```
+## License
+Apache-2.0, inheriting the license of the base Qwen3-Omni-30B-A3B-Captioner model.
+## Citation
+```bibtex
+@inproceedings{anyaudiojudge2026,
+  title     = {AnyAudio-Judge: A Dynamic Rubric-Based Benchmark and Evaluator for Audio Instruction Following},
+  booktitle = {Proceedings of ACL},
+  year      = {2026}
+}
+```

added_tokens.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<tts_pad>": 151671,
+  "<tts_text_bos>": 151672,
+  "<tts_text_bos_single>": 151674,
+  "<tts_text_eod>": 151673,
+  "<|audio_end|>": 151670,
+  "<|audio_pad|>": 151675,
+  "<|audio_start|>": 151669,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,105 @@

+{#- System preamble. When tools are supplied, emit a single system turn holding the optional leading system message followed by the tool-calling instructions and one JSON signature per tool. #}
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {#- NOTE(review): the content is concatenated with a string here, so this branch presumably expects string content; confirm behaviour for list-style (multimodal) system content. #}
+    {%- if messages[0].role == 'system' %}{{- messages[0].content + '\n\n' }}{%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{#- Without tools: render the leading system message, if there is one. #}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {%- if messages[0].content is string %}
+            {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+        {%- else %}
+            {#- List-style content: every item is emitted as its own system turn; image, audio and video items become placeholder token triples, text items are emitted verbatim. #}
+            {%- for content in messages[0].content %}
+                {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
+                    {{- '<|im_start|>system\n' +"<|vision_start|><|image_pad|><|vision_end|>"+ '<|im_end|>\n' }}
+                {%- elif content.type == 'audio' or 'audio' in content or 'audio_url' in content %}
+                    {{- '<|im_start|>system\n' +"<|audio_start|><|audio_pad|><|audio_end|>"+ '<|im_end|>\n' }}
+                {%- elif content.type == 'video' or 'video' in content %}
+                    {{- '<|im_start|>system\n' +"<|vision_start|><|video_pad|><|vision_end|>"+ '<|im_end|>\n' }}
+                {%- elif content.type == 'text' %}
+                    {{- '<|im_start|>system\n' +content.text+ '<|im_end|>\n' }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+    {%- endif %}
+{%- endif %}
+{#- Locate the last genuine user query: walk the messages in reverse and record the index of the first user message whose content is a string and is not wrapped in tool_response tags. The default is the final message index. User messages with list-style content do not satisfy the string test and are therefore never selected. #}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {#- Translate the reversed loop position back into an index into the original list. #}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {#- Clearing the flag makes every later iteration a no-op, so only the first match (the last query) is kept. #}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{#- Main loop: render every message as a chat turn. #}
+{%- for message in messages %}
+    {#- Flatten the message content into one string; list-style content is concatenated item by item, with media items replaced by placeholder token triples. #}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {#- A namespace is used so the accumulated text survives the inner loop scope. #}
+        {%- set content = namespace(text="") %}
+        {%- for mcontent in message.content %}
+            {%- if mcontent.type == 'image' or 'image' in mcontent or 'image_url' in mcontent %}
+                {%- set content.text = content.text~"<|vision_start|><|image_pad|><|vision_end|>" %}
+            {%- elif mcontent.type == 'audio' or 'audio' in mcontent or 'audio_url' in mcontent %}
+                {%- set content.text = content.text~"<|audio_start|><|audio_pad|><|audio_end|>" %}
+            {%- elif mcontent.type == 'video' or 'video' in mcontent %}
+                {%- set content.text = content.text~"<|vision_start|><|video_pad|><|vision_end|>" %}
+            {%- elif mcontent.type == 'text' %}
+                {%- set content.text = content.text~mcontent.text %}
+            {%- endif %}
+        {%- endfor %}
+        {%- set content = content.text %}
+    {%- endif %}
+    {#- User turns, and system turns other than the first message, are emitted as-is under their own role. #}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {#- Reasoning comes from an explicit reasoning_content field when it is a string; otherwise it is split out of the content at the closing think tag. #}
+        {%- set reasoning_content = "" %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+            {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+        {%- endif %}
+    {%- endif %}
+    {#- Only assistant turns after the last user query keep a think block, and only when the turn is the final message or has non-empty reasoning; all other assistant turns emit the answer text alone. #}
+    {%- if loop.index0 > ns.last_query_index %}
+        {%- if loop.last or (not loop.last and reasoning_content) %}
+            {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip("\n") + '\n</think>\n\n' + content.lstrip('\n') }}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+    {%- else %}
+        {{- '<|im_start|>' + message.role + '\n' + content }}
+    {%- endif %}
+    {#- Append each tool call as a JSON object inside tool_call tags; a newline separates it from preceding content or from the previous call. #}
+    {%- if message.tool_calls %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if (loop.first and content) or (not loop.first) %}{{- '\n' }}{%- endif %}
+            {#- Accept both the nested form (a function attribute) and the flat form of a tool call. #}
+            {%- if tool_call.function %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {#- String arguments are emitted verbatim; anything else is serialised with tojson. #}
+            {%- if tool_call.arguments is string %}
+                {{- tool_call.arguments }}
+            {%- else %}
+                {{- tool_call.arguments | tojson }}
+            {%- endif %}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+    {%- endif %}
+    {{- '<|im_end|>\n' }}
+    {#- Tool results are rendered inside a user turn; consecutive tool messages share one turn, which is opened before the first and closed after the last of the run. #}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}{{- '<|im_start|>user' }}{%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}{{- '<|im_end|>\n' }}{%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- Generation prompt: open an assistant turn for the model to complete. When enable_thinking is defined and false, an empty think block is pre-filled before the model's output. #}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}{{- '<think>\n\n</think>\n\n' }}{%- endif %}
+{%- endif %}

config.json ADDED Viewed

	@@ -0,0 +1,528 @@

+{
+  "architectures": [
+    "Qwen3OmniMoeForConditionalGeneration"
+  ],
+  "assistant_token_id": 77091,
+  "code2wav_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "codebook_size": 2048,
+    "decoder_dim": 1536,
+    "dtype": "bfloat16",
+    "hidden_act": "silu",
+    "hidden_size": 1024,
+    "intermediate_size": 3072,
+    "layer_scale_initial_scale": 0.01,
+    "max_position_embeddings": 8000,
+    "model_type": "",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 8,
+    "num_key_value_heads": 16,
+    "num_quantizers": 16,
+    "rms_norm_eps": 1e-05,
+    "rope_theta": 10000,
+    "sliding_window": 72,
+    "upsample_rates": [
+      8,
+      5,
+      4,
+      3
+    ],
+    "upsampling_ratios": [
+      2,
+      2
+    ]
+  },
+  "dtype": "bfloat16",
+  "enable_audio_output": false,
+  "eos_token_id": 151645,
+  "hidden_size": 1024,
+  "im_end_token_id": 151645,
+  "im_start_token_id": 151644,
+  "keys_to_ignore_at_inference": [
+    "past_key_values",
+    "hidden_states",
+    "attention_mask"
+  ],
+  "model_type": "qwen3_omni_moe",
+  "pad_token_id": 151643,
+  "system_token_id": 8948,
+  "talker_config": {
+    "accept_hidden_layer": 18,
+    "audio_start_token_id": 151669,
+    "audio_token_id": 151646,
+    "code_predictor_config": {
+      "_name_or_path": "",
+      "add_cross_attention": false,
+      "architectures": null,
+      "attention_bias": false,
+      "attention_dropout": 0,
+      "bad_words_ids": null,
+      "begin_suppress_tokens": null,
+      "bos_token_id": null,
+      "chunk_size_feed_forward": 0,
+      "cross_attention_hidden_size": null,
+      "decoder_start_token_id": null,
+      "diversity_penalty": 0.0,
+      "do_sample": false,
+      "dtype": null,
+      "early_stopping": false,
+      "encoder_no_repeat_ngram_size": 0,
+      "eos_token_id": null,
+      "exponential_decay_length_penalty": null,
+      "finetuning_task": null,
+      "forced_bos_token_id": null,
+      "forced_eos_token_id": null,
+      "head_dim": 128,
+      "hidden_act": "silu",
+      "hidden_size": 1024,
+      "id2label": {
+        "0": "LABEL_0",
+        "1": "LABEL_1"
+      },
+      "initializer_range": 0.02,
+      "intermediate_size": 3072,
+      "is_decoder": false,
+      "is_encoder_decoder": false,
+      "label2id": {
+        "LABEL_0": 0,
+        "LABEL_1": 1
+      },
+      "layer_types": [
+        "full_attention",
+        "full_attention",
+        "full_attention",
+        "full_attention",
+        "full_attention"
+      ],
+      "length_penalty": 1.0,
+      "max_length": 20,
+      "max_position_embeddings": 32768,
+      "min_length": 0,
+      "model_type": "qwen3_omni_moe_talker_code_predictor",
+      "no_repeat_ngram_size": 0,
+      "num_attention_heads": 16,
+      "num_beam_groups": 1,
+      "num_beams": 1,
+      "num_code_groups": 32,
+      "num_hidden_layers": 5,
+      "num_key_value_heads": 8,
+      "num_return_sequences": 1,
+      "output_attentions": false,
+      "output_hidden_states": false,
+      "output_scores": false,
+      "pad_token_id": null,
+      "prefix": null,
+      "problem_type": null,
+      "pruned_heads": {},
+      "remove_invalid_values": false,
+      "repetition_penalty": 1.0,
+      "return_dict": true,
+      "return_dict_in_generate": false,
+      "rms_norm_eps": 1e-06,
+      "rope_scaling": null,
+      "rope_theta": 10000,
+      "sep_token_id": null,
+      "sliding_window": null,
+      "suppress_tokens": null,
+      "task_specific_params": null,
+      "temperature": 1.0,
+      "tie_encoder_decoder": false,
+      "tie_word_embeddings": false,
+      "tokenizer_class": null,
+      "top_k": 50,
+      "top_p": 1.0,
+      "torchscript": false,
+      "typical_p": 1.0,
+      "use_cache": true,
+      "vocab_size": 2048
+    },
+    "codec_bos_id": 4197,
+    "codec_eos_token_id": 4198,
+    "codec_nothink_id": 4203,
+    "codec_pad_id": 4196,
+    "codec_think_bos_id": 4204,
+    "codec_think_eos_id": 4205,
+    "dtype": "bfloat16",
+    "image_token_id": 151655,
+    "model_type": "",
+    "num_code_groups": 32,
+    "position_id_per_seconds": 25,
+    "speaker_id": null,
+    "text_config": {
+      "_name_or_path": "",
+      "add_cross_attention": false,
+      "architectures": null,
+      "attention_bias": false,
+      "attention_dropout": 0,
+      "bad_words_ids": null,
+      "begin_suppress_tokens": null,
+      "bos_token_id": null,
+      "chunk_size_feed_forward": 0,
+      "cross_attention_hidden_size": null,
+      "decoder_sparse_step": 1,
+      "decoder_start_token_id": null,
+      "diversity_penalty": 0.0,
+      "do_sample": false,
+      "dtype": "bfloat16",
+      "early_stopping": false,
+      "encoder_no_repeat_ngram_size": 0,
+      "eos_token_id": null,
+      "exponential_decay_length_penalty": null,
+      "finetuning_task": null,
+      "forced_bos_token_id": null,
+      "forced_eos_token_id": null,
+      "hidden_act": "silu",
+      "hidden_size": 1024,
+      "id2label": {
+        "0": "LABEL_0",
+        "1": "LABEL_1"
+      },
+      "initializer_range": 0.02,
+      "intermediate_size": 2048,
+      "is_decoder": false,
+      "is_encoder_decoder": false,
+      "label2id": {
+        "LABEL_0": 0,
+        "LABEL_1": 1
+      },
+      "length_penalty": 1.0,
+      "max_length": 20,
+      "max_position_embeddings": 32768,
+      "min_length": 0,
+      "mlp_only_layers": [],
+      "model_type": "qwen3_omni_moe_talker_text",
+      "moe_intermediate_size": 384,
+      "no_repeat_ngram_size": 0,
+      "norm_topk_prob": false,
+      "num_attention_heads": 16,
+      "num_beam_groups": 1,
+      "num_beams": 1,
+      "num_experts": 128,
+      "num_experts_per_tok": 8,
+      "num_hidden_layers": 20,
+      "num_key_value_heads": 2,
+      "num_return_sequences": 1,
+      "output_attentions": false,
+      "output_hidden_states": false,
+      "output_router_logits": false,
+      "output_scores": false,
+      "pad_token_id": 151643,
+      "prefix": null,
+      "problem_type": null,
+      "pruned_heads": {},
+      "remove_invalid_values": false,
+      "repetition_penalty": 1.0,
+      "return_dict": true,
+      "return_dict_in_generate": false,
+      "rms_norm_eps": 1e-06,
+      "rope_scaling": null,
+      "rope_theta": 10000,
+      "router_aux_loss_coef": 0.0,
+      "sep_token_id": null,
+      "sliding_window": null,
+      "suppress_tokens": null,
+      "task_specific_params": null,
+      "temperature": 1.0,
+      "tie_encoder_decoder": false,
+      "tie_word_embeddings": false,
+      "tokenizer_class": null,
+      "top_k": 50,
+      "top_p": 1.0,
+      "torchscript": false,
+      "typical_p": 1.0,
+      "use_cache": false,
+      "vocab_size": 3072
+    },
+    "thinker_hidden_size": 2048,
+    "video_token_id": 151656,
+    "vision_start_token_id": 151652
+  },
+  "thinker_config": {
+    "audio_config": {
+      "_name_or_path": "",
+      "activation_dropout": 0,
+      "activation_function": "gelu",
+      "add_cross_attention": false,
+      "architectures": null,
+      "attention_dropout": 0,
+      "bad_words_ids": null,
+      "begin_suppress_tokens": null,
+      "bos_token_id": null,
+      "chunk_size_feed_forward": 0,
+      "conv_chunksize": 500,
+      "cross_attention_hidden_size": null,
+      "d_model": 1280,
+      "decoder_start_token_id": null,
+      "diversity_penalty": 0.0,
+      "do_sample": false,
+      "downsample_hidden_size": 480,
+      "dropout": 0,
+      "dtype": "bfloat16",
+      "early_stopping": false,
+      "encoder_attention_heads": 20,
+      "encoder_ffn_dim": 5120,
+      "encoder_layers": 32,
+      "encoder_no_repeat_ngram_size": 0,
+      "eos_token_id": null,
+      "exponential_decay_length_penalty": null,
+      "finetuning_task": null,
+      "forced_bos_token_id": null,
+      "forced_eos_token_id": null,
+      "id2label": {
+        "0": "LABEL_0",
+        "1": "LABEL_1"
+      },
+      "initializer_range": 0.02,
+      "is_decoder": false,
+      "is_encoder_decoder": false,
+      "label2id": {
+        "LABEL_0": 0,
+        "LABEL_1": 1
+      },
+      "length_penalty": 1.0,
+      "max_length": 20,
+      "max_source_positions": 1500,
+      "min_length": 0,
+      "model_type": "qwen3_omni_moe_audio_encoder",
+      "n_window": 50,
+      "n_window_infer": 800,
+      "no_repeat_ngram_size": 0,
+      "num_beam_groups": 1,
+      "num_beams": 1,
+      "num_hidden_layers": 32,
+      "num_mel_bins": 128,
+      "num_return_sequences": 1,
+      "output_attentions": false,
+      "output_dim": 2048,
+      "output_hidden_states": false,
+      "output_scores": false,
+      "pad_token_id": null,
+      "prefix": null,
+      "problem_type": null,
+      "pruned_heads": {},
+      "remove_invalid_values": false,
+      "repetition_penalty": 1.0,
+      "return_dict": true,
+      "return_dict_in_generate": false,
+      "scale_embedding": false,
+      "sep_token_id": null,
+      "suppress_tokens": null,
+      "task_specific_params": null,
+      "temperature": 1.0,
+      "tf_legacy_loss": false,
+      "tie_encoder_decoder": false,
+      "tie_word_embeddings": true,
+      "tokenizer_class": null,
+      "top_k": 50,
+      "top_p": 1.0,
+      "torchscript": false,
+      "typical_p": 1.0,
+      "use_bfloat16": false
+    },
+    "audio_end_token_id": 151670,
+    "audio_start_token_id": 151669,
+    "dtype": "bfloat16",
+    "initializer_range": 0.02,
+    "model_type": "qwen3_omni_moe_thinker",
+    "position_id_per_seconds": 13,
+    "seconds_per_chunk": 2,
+    "text_config": {
+      "_name_or_path": "",
+      "add_cross_attention": false,
+      "architectures": null,
+      "attention_bias": false,
+      "attention_dropout": 0.0,
+      "bad_words_ids": null,
+      "begin_suppress_tokens": null,
+      "bos_token_id": null,
+      "chunk_size_feed_forward": 0,
+      "cross_attention_hidden_size": null,
+      "decoder_sparse_step": 1,
+      "decoder_start_token_id": null,
+      "diversity_penalty": 0.0,
+      "do_sample": false,
+      "dtype": "bfloat16",
+      "early_stopping": false,
+      "encoder_no_repeat_ngram_size": 0,
+      "eos_token_id": null,
+      "exponential_decay_length_penalty": null,
+      "finetuning_task": null,
+      "forced_bos_token_id": null,
+      "forced_eos_token_id": null,
+      "head_dim": 128,
+      "hidden_act": "silu",
+      "hidden_size": 2048,
+      "id2label": {
+        "0": "LABEL_0",
+        "1": "LABEL_1"
+      },
+      "initializer_range": 0.02,
+      "intermediate_size": 768,
+      "is_decoder": false,
+      "is_encoder_decoder": false,
+      "label2id": {
+        "LABEL_0": 0,
+        "LABEL_1": 1
+      },
+      "length_penalty": 1.0,
+      "max_length": 20,
+      "max_position_embeddings": 65536,
+      "min_length": 0,
+      "mlp_only_layers": [],
+      "model_type": "qwen3_omni_moe_text",
+      "moe_intermediate_size": 768,
+      "no_repeat_ngram_size": 0,
+      "norm_topk_prob": true,
+      "num_attention_heads": 32,
+      "num_beam_groups": 1,
+      "num_beams": 1,
+      "num_experts": 128,
+      "num_experts_per_tok": 8,
+      "num_hidden_layers": 48,
+      "num_key_value_heads": 4,
+      "num_return_sequences": 1,
+      "output_attentions": false,
+      "output_hidden_states": false,
+      "output_router_logits": false,
+      "output_scores": false,
+      "pad_token_id": 151643,
+      "prefix": null,
+      "problem_type": null,
+      "pruned_heads": {},
+      "remove_invalid_values": false,
+      "repetition_penalty": 1.0,
+      "return_dict": true,
+      "return_dict_in_generate": false,
+      "rms_norm_eps": 1e-06,
+      "rope_scaling": {
+        "interleaved": true,
+        "mrope_interleaved": true,
+        "mrope_section": [
+          24,
+          20,
+          20
+        ],
+        "rope_type": "default",
+        "type": "default"
+      },
+      "rope_theta": 1000000,
+      "router_aux_loss_coef": 0.0,
+      "sep_token_id": null,
+      "shared_expert_intermediate_size": 0,
+      "sliding_window": null,
+      "suppress_tokens": null,
+      "task_specific_params": null,
+      "temperature": 1.0,
+      "tf_legacy_loss": false,
+      "tie_encoder_decoder": false,
+      "tie_word_embeddings": false,
+      "tokenizer_class": null,
+      "top_k": 50,
+      "top_p": 1.0,
+      "torchscript": false,
+      "typical_p": 1.0,
+      "use_bfloat16": false,
+      "use_cache": false,
+      "use_qk_norm": true,
+      "use_sliding_window": false,
+      "vocab_size": 152064
+    },
+    "user_token_id": 872,
+    "vision_config": {
+      "_name_or_path": "",
+      "add_cross_attention": false,
+      "apply_vit_abs_pos_embed": true,
+      "architectures": null,
+      "bad_words_ids": null,
+      "begin_suppress_tokens": null,
+      "bos_token_id": null,
+      "chunk_size_feed_forward": 0,
+      "cross_attention_hidden_size": null,
+      "decoder_start_token_id": null,
+      "deepstack_visual_indexes": [
+        8,
+        16,
+        24
+      ],
+      "depth": 27,
+      "diversity_penalty": 0.0,
+      "do_sample": false,
+      "dtype": "bfloat16",
+      "early_stopping": false,
+      "encoder_no_repeat_ngram_size": 0,
+      "eos_token_id": null,
+      "exponential_decay_length_penalty": null,
+      "finetuning_task": null,
+      "forced_bos_token_id": null,
+      "forced_eos_token_id": null,
+      "hidden_act": "gelu_pytorch_tanh",
+      "hidden_size": 1152,
+      "id2label": {
+        "0": "LABEL_0",
+        "1": "LABEL_1"
+      },
+      "image_size": 768,
+      "in_channels": 3,
+      "in_chans": 3,
+      "initializer_range": 0.02,
+      "intermediate_size": 4304,
+      "is_decoder": false,
+      "is_encoder_decoder": false,
+      "label2id": {
+        "LABEL_0": 0,
+        "LABEL_1": 1
+      },
+      "length_penalty": 1.0,
+      "max_length": 20,
+      "min_length": 0,
+      "model_type": "qwen3_omni_moe_vision_encoder",
+      "no_repeat_ngram_size": 0,
+      "num_beam_groups": 1,
+      "num_beams": 1,
+      "num_heads": 16,
+      "num_position_embeddings": 2304,
+      "num_return_sequences": 1,
+      "out_hidden_size": 2048,
+      "output_attentions": false,
+      "output_hidden_states": false,
+      "output_scores": false,
+      "pad_token_id": null,
+      "patch_size": 16,
+      "prefix": null,
+      "problem_type": null,
+      "pruned_heads": {},
+      "remove_invalid_values": false,
+      "repetition_penalty": 1.0,
+      "return_dict": true,
+      "return_dict_in_generate": false,
+      "sep_token_id": null,
+      "spatial_merge_size": 2,
+      "spatial_patch_size": 16,
+      "suppress_tokens": null,
+      "task_specific_params": null,
+      "temperature": 1.0,
+      "temporal_patch_size": 2,
+      "tf_legacy_loss": false,
+      "tie_encoder_decoder": false,
+      "tie_word_embeddings": true,
+      "tokenizer_class": null,
+      "tokens_per_second": 2,
+      "top_k": 50,
+      "top_p": 1.0,
+      "torchscript": false,
+      "typical_p": 1.0,
+      "use_bfloat16": false
+    },
+    "vision_end_token_id": 151653,
+    "vision_start_token_id": 151652,
+    "audio_token_id": 151675,
+    "image_token_id": 151655,
+    "video_token_id": 151656
+  },
+  "transformers_version": "4.57.0.dev0",
+  "tts_bos_token_id": 151672,
+  "tts_eos_token_id": 151673,
+  "tts_pad_token_id": 151671,
+  "user_token_id": 872
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "max_new_tokens": 32768,
+  "transformers_version": "4.57.0.dev0"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model-00001-of-00013.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9a61f9f9993e4dc253adddc5bab30e091d16697deaff3a8c8c32bfab693530e0
+size 4997899632

model-00002-of-00013.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:16f280659dd5fa6aa5c680d25b8ac8ab2b1f5a34032b139efff58b39b064619a
+size 4997754216

model-00003-of-00013.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:30fe2cf9da3c163a60ab59e7a4645aec952bea5bf77d444999a7af0efa4e381d
+size 4997754216

model-00004-of-00013.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1cdad401f119a96af853ac574cc6c4704a1c38c8355a29729795789f9042783d
+size 4997755648

model-00005-of-00013.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f4ef2f2449a7ac5bcc0e557d9e66387ce81c18a23ba61b9536a3fea57be7d700
+size 4997755792

model-00006-of-00013.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88df851f05f49bd0245525180e269456b9233d932a0fc197723b0fed96b23f16
+size 4997755792

model-00007-of-00013.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4d3386341ad28f7d7259b0df1937f7c17d02030c4137c5b15fde4fc4b34997e4
+size 4997755792

model-00008-of-00013.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4a4b92f477a380454611d4e1ffe1866b69713bcda78a0da1508bdfd6bf9cde0b
+size 4997755792

model-00009-of-00013.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2a6864550fec0cc51baa1655927f8d4d8680d902a966e3f1eecd3e89964d1447
+size 4997755792

model-00010-of-00013.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6668a94fab0b2ea9d4fcdc103c118744871e1045fb7122d0bdd0d89bdf955afc
+size 4997755792

model-00011-of-00013.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e773787f068201b302a79e1259db3bf9944b7674bf89ecb6b09762b47b014603
+size 4997755792

model-00012-of-00013.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:31c3bdfdc7c039fffd0143c51f61a92b057f2767c43dc3eb8ad4fe319f2d18ee
+size 4997755792

model-00013-of-00013.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61a228dc948d390f9b6679f9e17e05bc5dba70fc03232952bf58b4b3570c0920
+size 3467789632

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "dither": 0.0,
+  "feature_extractor_type": "WhisperFeatureExtractor",
+  "feature_size": 128,
+  "hop_length": 160,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "Qwen2VLImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "max_pixels": 12845056,
+  "merge_size": 2,
+  "min_pixels": 3136,
+  "n_fft": 400,
+  "n_samples": 4800000,
+  "nb_max_frames": 30000,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "patch_size": 16,
+  "processor_class": "Qwen3OmniMoeProcessor",
+  "return_attention_mask": true,
+  "sampling_rate": 16000,
+  "temporal_patch_size": 2
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>",
+    "<|audio_start|>",
+    "<|audio_end|>",
+    "<tts_pad>",
+    "<tts_text_bos>",
+    "<tts_text_bos_single>",
+    "<|audio_pad|>"
+  ],
+  "audio_bos_token": "<|audio_start|>",
+  "audio_eos_token": "<|audio_end|>",
+  "audio_token": "<|audio_pad|>",
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<|image_pad|>",
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "video_token": "<|video_pad|>",
+  "vision_bos_token": "<|vision_start|>",
+  "vision_eos_token": "<|vision_end|>"
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:09267689b8362020b9763b65dd5be7e086b31e28d72e02837a9e781de9a91bc7
+size 11423986

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,317 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151669": {
+      "content": "<|audio_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<|audio_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<tts_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<tts_text_bos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "<tts_text_eod>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<tts_text_bos_single>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151675": {
+      "content": "<|audio_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>",
+    "<|audio_start|>",
+    "<|audio_end|>",
+    "<tts_pad>",
+    "<tts_text_bos>",
+    "<tts_text_bos_single>",
+    "<|audio_pad|>"
+  ],
+  "audio_bos_token": "<|audio_start|>",
+  "audio_eos_token": "<|audio_end|>",
+  "audio_token": "<|audio_pad|>",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {
+    "audio_bos_token": "<|audio_start|>",
+    "audio_eos_token": "<|audio_end|>",
+    "audio_token": "<|audio_pad|>",
+    "image_token": "<|image_pad|>",
+    "video_token": "<|video_pad|>",
+    "vision_bos_token": "<|vision_start|>",
+    "vision_eos_token": "<|vision_end|>"
+  },
+  "image_token": "<|image_pad|>",
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "processor_class": "Qwen3OmniMoeProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null,
+  "video_token": "<|video_pad|>",
+  "vision_bos_token": "<|vision_start|>",
+  "vision_eos_token": "<|vision_end|>"
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1354 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 200.0,
+  "global_step": 1641,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0006093845216331506,
+      "grad_norm": 7.29836574807662,
+      "learning_rate": 2.0000000000000002e-07,
+      "loss": 0.6879574656486511,
+      "step": 1,
+      "token_acc": 0.8069400259219983
+    },
+    {
+      "epoch": 0.006093845216331505,
+      "grad_norm": 3.9069477397555814,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 0.6863329675462511,
+      "step": 10,
+      "token_acc": 0.8062499341798103
+    },
+    {
+      "epoch": 0.01218769043266301,
+      "grad_norm": 1.6499482285217686,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 0.5632639408111573,
+      "step": 20,
+      "token_acc": 0.832315593221387
+    },
+    {
+      "epoch": 0.018281535648994516,
+      "grad_norm": 1.3391083762662725,
+      "learning_rate": 6e-06,
+      "loss": 0.4509421348571777,
+      "step": 30,
+      "token_acc": 0.859350495238436
+    },
+    {
+      "epoch": 0.02437538086532602,
+      "grad_norm": 1.0720631754188148,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 0.4215505599975586,
+      "step": 40,
+      "token_acc": 0.8660933700604836
+    },
+    {
+      "epoch": 0.030469226081657527,
+      "grad_norm": 1.300502159057601,
+      "learning_rate": 1e-05,
+      "loss": 0.3916645526885986,
+      "step": 50,
+      "token_acc": 0.8749446762027628
+    },
+    {
+      "epoch": 0.03656307129798903,
+      "grad_norm": 1.2222066565570464,
+      "learning_rate": 9.999025267866269e-06,
+      "loss": 0.37738680839538574,
+      "step": 60,
+      "token_acc": 0.8773714810281518
+    },
+    {
+      "epoch": 0.042656916514320534,
+      "grad_norm": 1.1003373829333203,
+      "learning_rate": 9.996101451506166e-06,
+      "loss": 0.36339468955993653,
+      "step": 70,
+      "token_acc": 0.8811438359423324
+    },
+    {
+      "epoch": 0.04875076173065204,
+      "grad_norm": 0.9139572833064542,
+      "learning_rate": 9.991229690894796e-06,
+      "loss": 0.35523133277893065,
+      "step": 80,
+      "token_acc": 0.8833139693331612
+    },
+    {
+      "epoch": 0.054844606946983544,
+      "grad_norm": 1.0649357265795398,
+      "learning_rate": 9.984411885496807e-06,
+      "loss": 0.36147160530090333,
+      "step": 90,
+      "token_acc": 0.8827829089555647
+    },
+    {
+      "epoch": 0.06093845216331505,
+      "grad_norm": 1.0474850371515747,
+      "learning_rate": 9.975650693525798e-06,
+      "loss": 0.35729637145996096,
+      "step": 100,
+      "token_acc": 0.8842431348706591
+    },
+    {
+      "epoch": 0.06703229737964655,
+      "grad_norm": 1.0275221386997506,
+      "learning_rate": 9.964949530907907e-06,
+      "loss": 0.3474123477935791,
+      "step": 110,
+      "token_acc": 0.8856792866706549
+    },
+    {
+      "epoch": 0.07312614259597806,
+      "grad_norm": 1.2902357608957626,
+      "learning_rate": 9.952312569949963e-06,
+      "loss": 0.3479644775390625,
+      "step": 120,
+      "token_acc": 0.8849104859335039
+    },
+    {
+      "epoch": 0.07921998781230957,
+      "grad_norm": 1.3451160019315398,
+      "learning_rate": 9.937744737712734e-06,
+      "loss": 0.3466474533081055,
+      "step": 130,
+      "token_acc": 0.8861058585962355
+    },
+    {
+      "epoch": 0.08531383302864107,
+      "grad_norm": 1.0790851469433436,
+      "learning_rate": 9.921251714089898e-06,
+      "loss": 0.34614810943603513,
+      "step": 140,
+      "token_acc": 0.8854811515034624
+    },
+    {
+      "epoch": 0.09140767824497258,
+      "grad_norm": 1.0038008030386316,
+      "learning_rate": 9.9028399295935e-06,
+      "loss": 0.3384540557861328,
+      "step": 150,
+      "token_acc": 0.8879619162858826
+    },
+    {
+      "epoch": 0.09750152346130408,
+      "grad_norm": 1.027349218243697,
+      "learning_rate": 9.882516562846735e-06,
+      "loss": 0.33826944828033445,
+      "step": 160,
+      "token_acc": 0.8878969612617404
+    },
+    {
+      "epoch": 0.1035953686776356,
+      "grad_norm": 1.0944757240532788,
+      "learning_rate": 9.860289537785058e-06,
+      "loss": 0.3368839740753174,
+      "step": 170,
+      "token_acc": 0.8883107398785887
+    },
+    {
+      "epoch": 0.10968921389396709,
+      "grad_norm": 0.9672890666466603,
+      "learning_rate": 9.83616752056669e-06,
+      "loss": 0.3455761194229126,
+      "step": 180,
+      "token_acc": 0.8851188684923262
+    },
+    {
+      "epoch": 0.1157830591102986,
+      "grad_norm": 0.8886431201198384,
+      "learning_rate": 9.810159916193763e-06,
+      "loss": 0.32952630519866943,
+      "step": 190,
+      "token_acc": 0.8905697047489018
+    },
+    {
+      "epoch": 0.1218769043266301,
+      "grad_norm": 0.9444272816074001,
+      "learning_rate": 9.782276864845351e-06,
+      "loss": 0.33125505447387693,
+      "step": 200,
+      "token_acc": 0.8897695109589824
+    },
+    {
+      "epoch": 0.12797074954296161,
+      "grad_norm": 1.0093535144294836,
+      "learning_rate": 9.752529237923914e-06,
+      "loss": 0.3311288833618164,
+      "step": 210,
+      "token_acc": 0.8905364268561583
+    },
+    {
+      "epoch": 0.1340645947592931,
+      "grad_norm": 1.1532189931201864,
+      "learning_rate": 9.720928633816596e-06,
+      "loss": 0.3244771003723145,
+      "step": 220,
+      "token_acc": 0.8915392526998382
+    },
+    {
+      "epoch": 0.14015843997562463,
+      "grad_norm": 0.9598378464215558,
+      "learning_rate": 9.687487373373103e-06,
+      "loss": 0.3279410362243652,
+      "step": 230,
+      "token_acc": 0.8906910502215741
+    },
+    {
+      "epoch": 0.14625228519195613,
+      "grad_norm": 0.9540187948014937,
+      "learning_rate": 9.652218495101894e-06,
+      "loss": 0.3265339136123657,
+      "step": 240,
+      "token_acc": 0.8910028614336833
+    },
+    {
+      "epoch": 0.15234613040828762,
+      "grad_norm": 1.0470189428654273,
+      "learning_rate": 9.61513575008656e-06,
+      "loss": 0.33319640159606934,
+      "step": 250,
+      "token_acc": 0.8888186484938951
+    },
+    {
+      "epoch": 0.15843997562461914,
+      "grad_norm": 0.9615038649371879,
+      "learning_rate": 9.576253596624367e-06,
+      "loss": 0.32928056716918946,
+      "step": 260,
+      "token_acc": 0.8897860391237342
+    },
+    {
+      "epoch": 0.16453382084095064,
+      "grad_norm": 1.09800599181465,
+      "learning_rate": 9.53558719458908e-06,
+      "loss": 0.32557024955749514,
+      "step": 270,
+      "token_acc": 0.8914715787293208
+    },
+    {
+      "epoch": 0.17062766605728213,
+      "grad_norm": 1.0743262974854428,
+      "learning_rate": 9.49315239952023e-06,
+      "loss": 0.32053494453430176,
+      "step": 280,
+      "token_acc": 0.8929576222604401
+    },
+    {
+      "epoch": 0.17672151127361366,
+      "grad_norm": 1.060412117175443,
+      "learning_rate": 9.448965756441154e-06,
+      "loss": 0.3243874073028564,
+      "step": 290,
+      "token_acc": 0.8921667614562232
+    },
+    {
+      "epoch": 0.18281535648994515,
+      "grad_norm": 0.9594753815838422,
+      "learning_rate": 9.403044493408205e-06,
+      "loss": 0.3233642578125,
+      "step": 300,
+      "token_acc": 0.891047436596846
+    },
+    {
+      "epoch": 0.18890920170627665,
+      "grad_norm": 0.9403281436285019,
+      "learning_rate": 9.355406514793667e-06,
+      "loss": 0.31829214096069336,
+      "step": 310,
+      "token_acc": 0.8938932609968795
+    },
+    {
+      "epoch": 0.19500304692260817,
+      "grad_norm": 0.9536634938537397,
+      "learning_rate": 9.306070394304955e-06,
+      "loss": 0.3202193260192871,
+      "step": 320,
+      "token_acc": 0.8931520198180799
+    },
+    {
+      "epoch": 0.20109689213893966,
+      "grad_norm": 1.119379822493263,
+      "learning_rate": 9.255055367742868e-06,
+      "loss": 0.3239091396331787,
+      "step": 330,
+      "token_acc": 0.8923521607278241
+    },
+    {
+      "epoch": 0.2071907373552712,
+      "grad_norm": 1.0373576096304553,
+      "learning_rate": 9.202381325501683e-06,
+      "loss": 0.31700589656829836,
+      "step": 340,
+      "token_acc": 0.8944783352337514
+    },
+    {
+      "epoch": 0.21328458257160268,
+      "grad_norm": 1.0632563437214946,
+      "learning_rate": 9.148068804814032e-06,
+      "loss": 0.31794281005859376,
+      "step": 350,
+      "token_acc": 0.8930956012903548
+    },
+    {
+      "epoch": 0.21937842778793418,
+      "grad_norm": 1.0242050960110551,
+      "learning_rate": 9.092138981743588e-06,
+      "loss": 0.3202871799468994,
+      "step": 360,
+      "token_acc": 0.8935469022061816
+    },
+    {
+      "epoch": 0.2254722730042657,
+      "grad_norm": 0.8239921572139911,
+      "learning_rate": 9.034613662928665e-06,
+      "loss": 0.3142183542251587,
+      "step": 370,
+      "token_acc": 0.8951745718050066
+    },
+    {
+      "epoch": 0.2315661182205972,
+      "grad_norm": 0.9147511550012487,
+      "learning_rate": 8.975515277079961e-06,
+      "loss": 0.3087962865829468,
+      "step": 380,
+      "token_acc": 0.8958298740422705
+    },
+    {
+      "epoch": 0.2376599634369287,
+      "grad_norm": 0.8794833827260621,
+      "learning_rate": 8.91486686623577e-06,
+      "loss": 0.3132402658462524,
+      "step": 390,
+      "token_acc": 0.8948639533970186
+    },
+    {
+      "epoch": 0.2437538086532602,
+      "grad_norm": 1.0069623307664877,
+      "learning_rate": 8.85269207677806e-06,
+      "loss": 0.31006736755371095,
+      "step": 400,
+      "token_acc": 0.8951928192311975
+    },
+    {
+      "epoch": 0.2498476538695917,
+      "grad_norm": 0.9808015041824597,
+      "learning_rate": 8.789015150212907e-06,
+      "loss": 0.30683579444885256,
+      "step": 410,
+      "token_acc": 0.8967586393232839
+    },
+    {
+      "epoch": 0.25594149908592323,
+      "grad_norm": 0.9081237770188716,
+      "learning_rate": 8.72386091371891e-06,
+      "loss": 0.3061988830566406,
+      "step": 420,
+      "token_acc": 0.8959391589507399
+    },
+    {
+      "epoch": 0.2620353443022547,
+      "grad_norm": 1.04219527083527,
+      "learning_rate": 8.657254770467252e-06,
+      "loss": 0.3091754674911499,
+      "step": 430,
+      "token_acc": 0.8954508616603208
+    },
+    {
+      "epoch": 0.2681291895185862,
+      "grad_norm": 1.0065133793639498,
+      "learning_rate": 8.58922268971719e-06,
+      "loss": 0.30993127822875977,
+      "step": 440,
+      "token_acc": 0.895664191270881
+    },
+    {
+      "epoch": 0.2742230347349177,
+      "grad_norm": 0.9080797671925362,
+      "learning_rate": 8.51979119669081e-06,
+      "loss": 0.31555490493774413,
+      "step": 450,
+      "token_acc": 0.8941405988077487
+    },
+    {
+      "epoch": 0.28031687995124926,
+      "grad_norm": 0.9841139463866474,
+      "learning_rate": 8.448987362231054e-06,
+      "loss": 0.30534186363220217,
+      "step": 460,
+      "token_acc": 0.8968707588256722
+    },
+    {
+      "epoch": 0.28641072516758076,
+      "grad_norm": 0.9677823622528902,
+      "learning_rate": 8.376838792246978e-06,
+      "loss": 0.3050978422164917,
+      "step": 470,
+      "token_acc": 0.8967596979985816
+    },
+    {
+      "epoch": 0.29250457038391225,
+      "grad_norm": 0.8117589456035273,
+      "learning_rate": 8.303373616950408e-06,
+      "loss": 0.3012993335723877,
+      "step": 480,
+      "token_acc": 0.898916481794861
+    },
+    {
+      "epoch": 0.29859841560024375,
+      "grad_norm": 0.8967761049487325,
+      "learning_rate": 8.228620479888172e-06,
+      "loss": 0.2984607219696045,
+      "step": 490,
+      "token_acc": 0.8986162002706045
+    },
+    {
+      "epoch": 0.30469226081657524,
+      "grad_norm": 0.7934114582439064,
+      "learning_rate": 8.152608526774188e-06,
+      "loss": 0.3049586057662964,
+      "step": 500,
+      "token_acc": 0.8968112886022876
+    },
+    {
+      "epoch": 0.31078610603290674,
+      "grad_norm": 0.825580955342704,
+      "learning_rate": 8.075367394125755e-06,
+      "loss": 0.30215206146240237,
+      "step": 510,
+      "token_acc": 0.8978885397098497
+    },
+    {
+      "epoch": 0.3168799512492383,
+      "grad_norm": 0.8296290441677941,
+      "learning_rate": 7.996927197708486e-06,
+      "loss": 0.3088541507720947,
+      "step": 520,
+      "token_acc": 0.8963321107035679
+    },
+    {
+      "epoch": 0.3229737964655698,
+      "grad_norm": 0.8755135202445912,
+      "learning_rate": 7.917318520794395e-06,
+      "loss": 0.30083427429199217,
+      "step": 530,
+      "token_acc": 0.899119480167394
+    },
+    {
+      "epoch": 0.3290676416819013,
+      "grad_norm": 0.9101072984644949,
+      "learning_rate": 7.836572402237683e-06,
+      "loss": 0.3058091878890991,
+      "step": 540,
+      "token_acc": 0.896643718272106
+    },
+    {
+      "epoch": 0.3351614868982328,
+      "grad_norm": 0.9771967807763615,
+      "learning_rate": 7.754720324372924e-06,
+      "loss": 0.30214991569519045,
+      "step": 550,
+      "token_acc": 0.8980588639486945
+    },
+    {
+      "epoch": 0.34125533211456427,
+      "grad_norm": 1.0026225580388461,
+      "learning_rate": 7.67179420074032e-06,
+      "loss": 0.3041478395462036,
+      "step": 560,
+      "token_acc": 0.8965942594865093
+    },
+    {
+      "epoch": 0.3473491773308958,
+      "grad_norm": 0.9388665918318329,
+      "learning_rate": 7.587826363642845e-06,
+      "loss": 0.30187268257141114,
+      "step": 570,
+      "token_acc": 0.8980740928392202
+    },
+    {
+      "epoch": 0.3534430225472273,
+      "grad_norm": 0.9610197211126468,
+      "learning_rate": 7.502849551540106e-06,
+      "loss": 0.2962314605712891,
+      "step": 580,
+      "token_acc": 0.8994921135841125
+    },
+    {
+      "epoch": 0.3595368677635588,
+      "grad_norm": 0.832216076371822,
+      "learning_rate": 7.4168968962838524e-06,
+      "loss": 0.2948365926742554,
+      "step": 590,
+      "token_acc": 0.8995369426034115
+    },
+    {
+      "epoch": 0.3656307129798903,
+      "grad_norm": 0.9377431212404606,
+      "learning_rate": 7.330001910200111e-06,
+      "loss": 0.29007649421691895,
+      "step": 600,
+      "token_acc": 0.9010131261293394
+    },
+    {
+      "epoch": 0.3717245581962218,
+      "grad_norm": 0.8726611852126548,
+      "learning_rate": 7.242198473022958e-06,
+      "loss": 0.2962885856628418,
+      "step": 610,
+      "token_acc": 0.9000062303355035
+    },
+    {
+      "epoch": 0.3778184034125533,
+      "grad_norm": 0.9153282793617801,
+      "learning_rate": 7.15352081868506e-06,
+      "loss": 0.30144367218017576,
+      "step": 620,
+      "token_acc": 0.8989331770222744
+    },
+    {
+      "epoch": 0.38391224862888484,
+      "grad_norm": 0.993391313101372,
+      "learning_rate": 7.0640035219701085e-06,
+      "loss": 0.301465106010437,
+      "step": 630,
+      "token_acc": 0.8974685325619576
+    },
+    {
+      "epoch": 0.39000609384521634,
+      "grad_norm": 1.0046408788594328,
+      "learning_rate": 6.973681485032359e-06,
+      "loss": 0.2955395460128784,
+      "step": 640,
+      "token_acc": 0.8996091046695718
+    },
+    {
+      "epoch": 0.39609993906154783,
+      "grad_norm": 0.822820271911727,
+      "learning_rate": 6.8825899237885215e-06,
+      "loss": 0.2931050300598145,
+      "step": 650,
+      "token_acc": 0.901203589259751
+    },
+    {
+      "epoch": 0.40219378427787933,
+      "grad_norm": 0.8482496681393756,
+      "learning_rate": 6.7907643541873446e-06,
+      "loss": 0.29596996307373047,
+      "step": 660,
+      "token_acc": 0.8996866207121305
+    },
+    {
+      "epoch": 0.4082876294942108,
+      "grad_norm": 0.8775663994372018,
+      "learning_rate": 6.698240578362179e-06,
+      "loss": 0.29141840934753416,
+      "step": 670,
+      "token_acc": 0.9003262426482238
+    },
+    {
+      "epoch": 0.4143814747105424,
+      "grad_norm": 0.984669646190565,
+      "learning_rate": 6.6050546706719984e-06,
+      "loss": 0.29290521144866943,
+      "step": 680,
+      "token_acc": 0.9014104043327218
+    },
+    {
+      "epoch": 0.42047531992687387,
+      "grad_norm": 0.8784418931211103,
+      "learning_rate": 6.511242963636257e-06,
+      "loss": 0.29056534767150877,
+      "step": 690,
+      "token_acc": 0.9016642094853267
+    },
+    {
+      "epoch": 0.42656916514320536,
+      "grad_norm": 1.0470361792821843,
+      "learning_rate": 6.416842033769106e-06,
+      "loss": 0.2978256940841675,
+      "step": 700,
+      "token_acc": 0.8997917186822428
+    },
+    {
+      "epoch": 0.43266301035953686,
+      "grad_norm": 0.9613791001197699,
+      "learning_rate": 6.321888687318457e-06,
+      "loss": 0.2870903253555298,
+      "step": 710,
+      "token_acc": 0.903113691147251
+    },
+    {
+      "epoch": 0.43875685557586835,
+      "grad_norm": 0.8405716630112535,
+      "learning_rate": 6.2264199459155105e-06,
+      "loss": 0.29581589698791505,
+      "step": 720,
+      "token_acc": 0.9003898532372131
+    },
+    {
+      "epoch": 0.4448507007921999,
+      "grad_norm": 0.9817927857442479,
+      "learning_rate": 6.130473032140272e-06,
+      "loss": 0.29129691123962403,
+      "step": 730,
+      "token_acc": 0.9009383225625913
+    },
+    {
+      "epoch": 0.4509445460085314,
+      "grad_norm": 0.9100915684781385,
+      "learning_rate": 6.0340853550087345e-06,
+      "loss": 0.29650187492370605,
+      "step": 740,
+      "token_acc": 0.9002656385758284
+    },
+    {
+      "epoch": 0.4570383912248629,
+      "grad_norm": 0.9238619342391209,
+      "learning_rate": 5.937294495387377e-06,
+      "loss": 0.2921621561050415,
+      "step": 750,
+      "token_acc": 0.9008455874319925
+    },
+    {
+      "epoch": 0.4631322364411944,
+      "grad_norm": 0.8289061064281614,
+      "learning_rate": 5.840138191340651e-06,
+      "loss": 0.28725643157958985,
+      "step": 760,
+      "token_acc": 0.9028466795835374
+    },
+    {
+      "epoch": 0.4692260816575259,
+      "grad_norm": 0.8901360785145829,
+      "learning_rate": 5.7426543234171736e-06,
+      "loss": 0.2865636348724365,
+      "step": 770,
+      "token_acc": 0.90197109501604
+    },
+    {
+      "epoch": 0.4753199268738574,
+      "grad_norm": 0.8709058451908881,
+      "learning_rate": 5.644880899880382e-06,
+      "loss": 0.2886040687561035,
+      "step": 780,
+      "token_acc": 0.9023270689287564
+    },
+    {
+      "epoch": 0.48141377209018893,
+      "grad_norm": 0.9306196525173549,
+      "learning_rate": 5.546856041889374e-06,
+      "loss": 0.28833470344543455,
+      "step": 790,
+      "token_acc": 0.9016039529639475
+    },
+    {
+      "epoch": 0.4875076173065204,
+      "grad_norm": 0.9401250944884257,
+      "learning_rate": 5.448617968635741e-06,
+      "loss": 0.28241567611694335,
+      "step": 800,
+      "token_acc": 0.9046351860634857
+    },
+    {
+      "epoch": 0.4936014625228519,
+      "grad_norm": 0.849983180158667,
+      "learning_rate": 5.35020498244219e-06,
+      "loss": 0.2863471508026123,
+      "step": 810,
+      "token_acc": 0.9020820443108771
+    },
+    {
+      "epoch": 0.4996953077391834,
+      "grad_norm": 0.7275676892245573,
+      "learning_rate": 5.251655453828728e-06,
+      "loss": 0.28403263092041015,
+      "step": 820,
+      "token_acc": 0.9032200331101135
+    },
+    {
+      "epoch": 0.505789152955515,
+      "grad_norm": 0.8630110541652776,
+      "learning_rate": 5.153007806552275e-06,
+      "loss": 0.28420357704162597,
+      "step": 830,
+      "token_acc": 0.9033704118180856
+    },
+    {
+      "epoch": 0.5118829981718465,
+      "grad_norm": 0.8835421688612489,
+      "learning_rate": 5.054300502625517e-06,
+      "loss": 0.2866727352142334,
+      "step": 840,
+      "token_acc": 0.9032091030720939
+    },
+    {
+      "epoch": 0.517976843388178,
+      "grad_norm": 0.8544875287993453,
+      "learning_rate": 4.9555720273208475e-06,
+      "loss": 0.289061975479126,
+      "step": 850,
+      "token_acc": 0.9017317721145331
+    },
+    {
+      "epoch": 0.5240706886045094,
+      "grad_norm": 0.8549205024097043,
+      "learning_rate": 4.856860874165218e-06,
+      "loss": 0.2889714241027832,
+      "step": 860,
+      "token_acc": 0.9025821278082484
+    },
+    {
+      "epoch": 0.5301645338208409,
+      "grad_norm": 0.9236105201664164,
+      "learning_rate": 4.758205529931808e-06,
+      "loss": 0.2887147903442383,
+      "step": 870,
+      "token_acc": 0.9019780647042623
+    },
+    {
+      "epoch": 0.5362583790371724,
+      "grad_norm": 0.8682794949168545,
+      "learning_rate": 4.659644459634293e-06,
+      "loss": 0.27901973724365237,
+      "step": 880,
+      "token_acc": 0.9043348147353298
+    },
+    {
+      "epoch": 0.5423522242535039,
+      "grad_norm": 0.8729641279912889,
+      "learning_rate": 4.56121609152961e-06,
+      "loss": 0.2851783275604248,
+      "step": 890,
+      "token_acc": 0.9031912203833561
+    },
+    {
+      "epoch": 0.5484460694698354,
+      "grad_norm": 0.8418875200344721,
+      "learning_rate": 4.462958802135069e-06,
+      "loss": 0.27748913764953614,
+      "step": 900,
+      "token_acc": 0.9059390881360567
+    },
+    {
+      "epoch": 0.5545399146861669,
+      "grad_norm": 0.8894129853584928,
+      "learning_rate": 4.364910901265607e-06,
+      "loss": 0.28034243583679197,
+      "step": 910,
+      "token_acc": 0.9040050510001095
+    },
+    {
+      "epoch": 0.5606337599024985,
+      "grad_norm": 0.8334588350840866,
+      "learning_rate": 4.2671106170970734e-06,
+      "loss": 0.2801810264587402,
+      "step": 920,
+      "token_acc": 0.9042555097117814
+    },
+    {
+      "epoch": 0.56672760511883,
+      "grad_norm": 0.8763484647820953,
+      "learning_rate": 4.169596081261332e-06,
+      "loss": 0.2837662696838379,
+      "step": 930,
+      "token_acc": 0.9037383810780553
+    },
+    {
+      "epoch": 0.5728214503351615,
+      "grad_norm": 0.8713237221620964,
+      "learning_rate": 4.072405313979021e-06,
+      "loss": 0.27712116241455076,
+      "step": 940,
+      "token_acc": 0.9053036654966837
+    },
+    {
+      "epoch": 0.578915295551493,
+      "grad_norm": 0.8844118885887313,
+      "learning_rate": 3.975576209235726e-06,
+      "loss": 0.2806640625,
+      "step": 950,
+      "token_acc": 0.9047340125759082
+    },
+    {
+      "epoch": 0.5850091407678245,
+      "grad_norm": 0.8719900072150049,
+      "learning_rate": 3.879146520007399e-06,
+      "loss": 0.27962145805358884,
+      "step": 960,
+      "token_acc": 0.9052189543003484
+    },
+    {
+      "epoch": 0.591102985984156,
+      "grad_norm": 0.8621214557871747,
+      "learning_rate": 3.7831538435407344e-06,
+      "loss": 0.281157398223877,
+      "step": 970,
+      "token_acc": 0.9040866660422715
+    },
+    {
+      "epoch": 0.5971968312004875,
+      "grad_norm": 0.85966956497571,
+      "learning_rate": 3.687635606694271e-06,
+      "loss": 0.2849492073059082,
+      "step": 980,
+      "token_acc": 0.9041384613065175
+    },
+    {
+      "epoch": 0.603290676416819,
+      "grad_norm": 0.8505152160082087,
+      "learning_rate": 3.592629051345936e-06,
+      "loss": 0.2792569637298584,
+      "step": 990,
+      "token_acc": 0.9054755884673447
+    },
+    {
+      "epoch": 0.6093845216331505,
+      "grad_norm": 0.9214402604733031,
+      "learning_rate": 3.4981712198726956e-06,
+      "loss": 0.2757925033569336,
+      "step": 1000,
+      "token_acc": 0.9061934946027913
+    },
+    {
+      "epoch": 0.615478366849482,
+      "grad_norm": 0.8580050185956459,
+      "learning_rate": 3.4042989407079986e-06,
+      "loss": 0.2790709972381592,
+      "step": 1010,
+      "token_acc": 0.9051715866568587
+    },
+    {
+      "epoch": 0.6215722120658135,
+      "grad_norm": 0.7762593811197912,
+      "learning_rate": 3.311048813982627e-06,
+      "loss": 0.2719182014465332,
+      "step": 1020,
+      "token_acc": 0.9072872717021148
+    },
+    {
+      "epoch": 0.6276660572821451,
+      "grad_norm": 0.8305900083620258,
+      "learning_rate": 3.218457197254583e-06,
+      "loss": 0.27586350440979,
+      "step": 1030,
+      "token_acc": 0.9060086339753238
+    },
+    {
+      "epoch": 0.6337599024984766,
+      "grad_norm": 0.8955059982745348,
+      "learning_rate": 3.1265601913335196e-06,
+      "loss": 0.2731196403503418,
+      "step": 1040,
+      "token_acc": 0.9076037121001682
+    },
+    {
+      "epoch": 0.6398537477148081,
+      "grad_norm": 0.8712242634564721,
+      "learning_rate": 3.035393626205306e-06,
+      "loss": 0.2795309066772461,
+      "step": 1050,
+      "token_acc": 0.9047484454494065
+    },
+    {
+      "epoch": 0.6459475929311396,
+      "grad_norm": 0.8162886626845998,
+      "learning_rate": 2.944993047062161e-06,
+      "loss": 0.26994550228118896,
+      "step": 1060,
+      "token_acc": 0.9082915598041501
+    },
+    {
+      "epoch": 0.6520414381474711,
+      "grad_norm": 0.8874044395879559,
+      "learning_rate": 2.8553937004438425e-06,
+      "loss": 0.2744093418121338,
+      "step": 1070,
+      "token_acc": 0.9072907727436752
+    },
+    {
+      "epoch": 0.6581352833638026,
+      "grad_norm": 0.8288310546310844,
+      "learning_rate": 2.766630520495277e-06,
+      "loss": 0.2674886226654053,
+      "step": 1080,
+      "token_acc": 0.9087633615660454
+    },
+    {
+      "epoch": 0.664229128580134,
+      "grad_norm": 0.8828846811452266,
+      "learning_rate": 2.67873811534598e-06,
+      "loss": 0.2735260486602783,
+      "step": 1090,
+      "token_acc": 0.9060899523658108
+    },
+    {
+      "epoch": 0.6703229737964655,
+      "grad_norm": 0.8055682508984224,
+      "learning_rate": 2.591750753616596e-06,
+      "loss": 0.2687216758728027,
+      "step": 1100,
+      "token_acc": 0.9077474362897096
+    },
+    {
+      "epoch": 0.676416819012797,
+      "grad_norm": 0.8527567804445506,
+      "learning_rate": 2.505702351057804e-06,
+      "loss": 0.27487955093383787,
+      "step": 1110,
+      "token_acc": 0.9064443638076686
+    },
+    {
+      "epoch": 0.6825106642291285,
+      "grad_norm": 0.8043496565707575,
+      "learning_rate": 2.4206264573268174e-06,
+      "loss": 0.2709942102432251,
+      "step": 1120,
+      "token_acc": 0.9082038753361505
+    },
+    {
+      "epoch": 0.68860450944546,
+      "grad_norm": 0.8177848047582682,
+      "learning_rate": 2.336556242906608e-06,
+      "loss": 0.26909465789794923,
+      "step": 1130,
+      "token_acc": 0.907756650686803
+    },
+    {
+      "epoch": 0.6946983546617916,
+      "grad_norm": 0.8281752422683824,
+      "learning_rate": 2.2535244861729707e-06,
+      "loss": 0.27281508445739744,
+      "step": 1140,
+      "token_acc": 0.9068872307019957
+    },
+    {
+      "epoch": 0.7007921998781231,
+      "grad_norm": 0.7368812719716331,
+      "learning_rate": 2.1715635606144653e-06,
+      "loss": 0.2704050064086914,
+      "step": 1150,
+      "token_acc": 0.9086829548350435
+    },
+    {
+      "epoch": 0.7068860450944546,
+      "grad_norm": 0.8983810091681733,
+      "learning_rate": 2.0907054222102367e-06,
+      "loss": 0.2690997362136841,
+      "step": 1160,
+      "token_acc": 0.9079458353782861
+    },
+    {
+      "epoch": 0.7129798903107861,
+      "grad_norm": 0.976946993038541,
+      "learning_rate": 2.0109815969705922e-06,
+      "loss": 0.2747433423995972,
+      "step": 1170,
+      "token_acc": 0.9060301301519122
+    },
+    {
+      "epoch": 0.7190737355271176,
+      "grad_norm": 0.8007237087596002,
+      "learning_rate": 1.9324231686452478e-06,
+      "loss": 0.2671233654022217,
+      "step": 1180,
+      "token_acc": 0.9086050565301521
+    },
+    {
+      "epoch": 0.7251675807434491,
+      "grad_norm": 0.8064570085543009,
+      "learning_rate": 1.8550607666039877e-06,
+      "loss": 0.27011594772338865,
+      "step": 1190,
+      "token_acc": 0.9079702457204528
+    },
+    {
+      "epoch": 0.7312614259597806,
+      "grad_norm": 0.8831329237202693,
+      "learning_rate": 1.7789245538944971e-06,
+      "loss": 0.2661958456039429,
+      "step": 1200,
+      "token_acc": 0.909048799129166
+    },
+    {
+      "epoch": 0.7373552711761121,
+      "grad_norm": 0.8430483750865159,
+      "learning_rate": 1.7040442154820036e-06,
+      "loss": 0.2669236183166504,
+      "step": 1210,
+      "token_acc": 0.9086229167124993
+    },
+    {
+      "epoch": 0.7434491163924436,
+      "grad_norm": 0.8347549917161227,
+      "learning_rate": 1.6304489466753237e-06,
+      "loss": 0.26542019844055176,
+      "step": 1220,
+      "token_acc": 0.9091426534148126
+    },
+    {
+      "epoch": 0.7495429616087751,
+      "grad_norm": 0.830454588444548,
+      "learning_rate": 1.5581674417438143e-06,
+      "loss": 0.2647353410720825,
+      "step": 1230,
+      "token_acc": 0.909506020348688
+    },
+    {
+      "epoch": 0.7556368068251066,
+      "grad_norm": 0.8676010280531331,
+      "learning_rate": 1.4872278827296855e-06,
+      "loss": 0.2685891628265381,
+      "step": 1240,
+      "token_acc": 0.9081622979570555
+    },
+    {
+      "epoch": 0.7617306520414382,
+      "grad_norm": 0.707455832514829,
+      "learning_rate": 1.417657928460029e-06,
+      "loss": 0.2678367614746094,
+      "step": 1250,
+      "token_acc": 0.9088005125349524
+    },
+    {
+      "epoch": 0.7678244972577697,
+      "grad_norm": 0.9332592296684585,
+      "learning_rate": 1.349484703762834e-06,
+      "loss": 0.2678724765777588,
+      "step": 1260,
+      "token_acc": 0.9090774872882107
+    },
+    {
+      "epoch": 0.7739183424741012,
+      "grad_norm": 0.9124536066814944,
+      "learning_rate": 1.2827347888912057e-06,
+      "loss": 0.2636892795562744,
+      "step": 1270,
+      "token_acc": 0.9094603622970171
+    },
+    {
+      "epoch": 0.7800121876904327,
+      "grad_norm": 0.8868523419233089,
+      "learning_rate": 1.2174342091599277e-06,
+      "loss": 0.2640355587005615,
+      "step": 1280,
+      "token_acc": 0.9101203136208611
+    },
+    {
+      "epoch": 0.7861060329067642,
+      "grad_norm": 0.8162281839833351,
+      "learning_rate": 1.1536084247983626e-06,
+      "loss": 0.2618927717208862,
+      "step": 1290,
+      "token_acc": 0.9093984578881031
+    },
+    {
+      "epoch": 0.7921998781230957,
+      "grad_norm": 0.8334510756887459,
+      "learning_rate": 1.0912823210237033e-06,
+      "loss": 0.2639930725097656,
+      "step": 1300,
+      "token_acc": 0.9095154304277207
+    },
+    {
+      "epoch": 0.7982937233394272,
+      "grad_norm": 0.9484830756554262,
+      "learning_rate": 1.0304801983383989e-06,
+      "loss": 0.2679661750793457,
+      "step": 1310,
+      "token_acc": 0.9085439305540266
+    },
+    {
+      "epoch": 0.8043875685557587,
+      "grad_norm": 0.7917038864004372,
+      "learning_rate": 9.712257630555589e-07,
+      "loss": 0.263914155960083,
+      "step": 1320,
+      "token_acc": 0.9098282765579997
+    },
+    {
+      "epoch": 0.8104814137720902,
+      "grad_norm": 0.8164310323072432,
+      "learning_rate": 9.135421180560394e-07,
+      "loss": 0.27391440868377687,
+      "step": 1330,
+      "token_acc": 0.9072812991094814
+    },
+    {
+      "epoch": 0.8165752589884216,
+      "grad_norm": 0.7878349824156636,
+      "learning_rate": 8.574517537807897e-07,
+      "loss": 0.2658750057220459,
+      "step": 1340,
+      "token_acc": 0.9089495350890863
+    },
+    {
+      "epoch": 0.8226691042047533,
+      "grad_norm": 0.7620095983862565,
+      "learning_rate": 8.029765394619899e-07,
+      "loss": 0.25719194412231444,
+      "step": 1350,
+      "token_acc": 0.911888654763225
+    },
+    {
+      "epoch": 0.8287629494210847,
+      "grad_norm": 0.8206579913283775,
+      "learning_rate": 7.501377145963939e-07,
+      "loss": 0.2592960834503174,
+      "step": 1360,
+      "token_acc": 0.9114338606023208
+    },
+    {
+      "epoch": 0.8348567946374162,
+      "grad_norm": 0.8789992765077687,
+      "learning_rate": 6.98955880664205e-07,
+      "loss": 0.26435413360595705,
+      "step": 1370,
+      "token_acc": 0.9108234231521902
+    },
+    {
+      "epoch": 0.8409506398537477,
+      "grad_norm": 0.9837537034286392,
+      "learning_rate": 6.494509930967019e-07,
+      "loss": 0.2641714572906494,
+      "step": 1380,
+      "token_acc": 0.9101989856105199
+    },
+    {
+      "epoch": 0.8470444850700792,
+      "grad_norm": 0.8346126227296959,
+      "learning_rate": 6.016423534957616e-07,
+      "loss": 0.26149678230285645,
+      "step": 1390,
+      "token_acc": 0.9105589320112891
+    },
+    {
+      "epoch": 0.8531383302864107,
+      "grad_norm": 0.789773058927434,
+      "learning_rate": 5.555486021082979e-07,
+      "loss": 0.25979223251342776,
+      "step": 1400,
+      "token_acc": 0.9105615762961907
+    },
+    {
+      "epoch": 0.8592321755027422,
+      "grad_norm": 0.7391262213112039,
+      "learning_rate": 5.111877105585672e-07,
+      "loss": 0.2619319915771484,
+      "step": 1410,
+      "token_acc": 0.9112515917773331
+    },
+    {
+      "epoch": 0.8653260207190737,
+      "grad_norm": 0.732756554862386,
+      "learning_rate": 4.6857697484116006e-07,
+      "loss": 0.26052017211914064,
+      "step": 1420,
+      "token_acc": 0.9111355670436785
+    },
+    {
+      "epoch": 0.8714198659354052,
+      "grad_norm": 0.9052605008388693,
+      "learning_rate": 4.277330085774156e-07,
+      "loss": 0.26050865650177,
+      "step": 1430,
+      "token_acc": 0.9113159185335296
+    },
+    {
+      "epoch": 0.8775137111517367,
+      "grad_norm": 0.8239425361941399,
+      "learning_rate": 3.886717365378867e-07,
+      "loss": 0.2652243137359619,
+      "step": 1440,
+      "token_acc": 0.9098248347337728
+    },
+    {
+      "epoch": 0.8836075563680682,
+      "grad_norm": 0.8321718064306127,
+      "learning_rate": 3.5140838843339073e-07,
+      "loss": 0.2614146709442139,
+      "step": 1450,
+      "token_acc": 0.9103242825028786
+    },
+    {
+      "epoch": 0.8897014015843998,
+      "grad_norm": 0.9427110487674982,
+      "learning_rate": 3.159574929770515e-07,
+      "loss": 0.26317219734191893,
+      "step": 1460,
+      "token_acc": 0.9102542106779491
+    },
+    {
+      "epoch": 0.8957952468007313,
+      "grad_norm": 0.8005907233947733,
+      "learning_rate": 2.8233287221965555e-07,
+      "loss": 0.2689415216445923,
+      "step": 1470,
+      "token_acc": 0.9084669140620019
+    },
+    {
+      "epoch": 0.9018890920170628,
+      "grad_norm": 0.8834142513691242,
+      "learning_rate": 2.5054763616053967e-07,
+      "loss": 0.26386346817016604,
+      "step": 1480,
+      "token_acc": 0.9098926633899981
+    },
+    {
+      "epoch": 0.9079829372333943,
+      "grad_norm": 0.8652226986660423,
+      "learning_rate": 2.2061417763608818e-07,
+      "loss": 0.2603492259979248,
+      "step": 1490,
+      "token_acc": 0.9111148919621807
+    },
+    {
+      "epoch": 0.9140767824497258,
+      "grad_norm": 0.7761477175475302,
+      "learning_rate": 1.9254416748786086e-07,
+      "loss": 0.2592171669006348,
+      "step": 1500,
+      "token_acc": 0.9112373322356396
+    },
+    {
+      "epoch": 0.9201706276660573,
+      "grad_norm": 0.7766751712855907,
+      "learning_rate": 1.6634855001221195e-07,
+      "loss": 0.258951997756958,
+      "step": 1510,
+      "token_acc": 0.9106356546794409
+    },
+    {
+      "epoch": 0.9262644728823888,
+      "grad_norm": 0.856909898768609,
+      "learning_rate": 1.4203753869318882e-07,
+      "loss": 0.2605564117431641,
+      "step": 1520,
+      "token_acc": 0.9109015609309732
+    },
+    {
+      "epoch": 0.9323583180987203,
+      "grad_norm": 0.8678261922910359,
+      "learning_rate": 1.196206122203647e-07,
+      "loss": 0.267201566696167,
+      "step": 1530,
+      "token_acc": 0.9091924387660025
+    },
+    {
+      "epoch": 0.9384521633150518,
+      "grad_norm": 0.8245437796092319,
+      "learning_rate": 9.910651079316824e-08,
+      "loss": 0.25865275859832765,
+      "step": 1540,
+      "token_acc": 0.9117370919567883
+    },
+    {
+      "epoch": 0.9445460085313833,
+      "grad_norm": 0.7648349491441419,
+      "learning_rate": 8.050323271314331e-08,
+      "loss": 0.2569366216659546,
+      "step": 1550,
+      "token_acc": 0.9122892575583048
+    },
+    {
+      "epoch": 0.9506398537477148,
+      "grad_norm": 0.844132664732268,
+      "learning_rate": 6.381803126546405e-08,
+      "loss": 0.26746933460235595,
+      "step": 1560,
+      "token_acc": 0.9087516916083089
+    },
+    {
+      "epoch": 0.9567336989640464,
+      "grad_norm": 0.8550282187735159,
+      "learning_rate": 4.9057411890933714e-08,
+      "loss": 0.2634291172027588,
+      "step": 1570,
+      "token_acc": 0.9101502847948816
+    },
+    {
+      "epoch": 0.9628275441803779,
+      "grad_norm": 0.8962920945122091,
+      "learning_rate": 3.622712964956032e-08,
+      "loss": 0.26028733253479003,
+      "step": 1580,
+      "token_acc": 0.9110691577022408
+    },
+    {
+      "epoch": 0.9689213893967094,
+      "grad_norm": 0.8191620838439264,
+      "learning_rate": 2.5332186976697037e-08,
+      "loss": 0.26295406818389894,
+      "step": 1590,
+      "token_acc": 0.9106372558253433
+    },
+    {
+      "epoch": 0.9750152346130408,
+      "grad_norm": 0.803005796954641,
+      "learning_rate": 1.637683173263238e-08,
+      "loss": 0.2601941585540771,
+      "step": 1600,
+      "token_acc": 0.9106438532047947
+    },
+    {
+      "epoch": 0.9811090798293723,
+      "grad_norm": 1.0200184560604955,
+      "learning_rate": 9.364555546375054e-09,
+      "loss": 0.265762186050415,
+      "step": 1610,
+      "token_acc": 0.9099375217270665
+    },
+    {
+      "epoch": 0.9872029250457038,
+      "grad_norm": 0.8217240197064228,
+      "learning_rate": 4.2980924542984634e-09,
+      "loss": 0.261862587928772,
+      "step": 1620,
+      "token_acc": 0.9104295425993519
+    },
+    {
+      "epoch": 0.9932967702620353,
+      "grad_norm": 0.8981159929317022,
+      "learning_rate": 1.179417834153429e-09,
+      "loss": 0.2626341342926025,
+      "step": 1630,
+      "token_acc": 0.9100063135380294
+    },
+    {
+      "epoch": 0.9993906154783668,
+      "grad_norm": 0.8766885423326849,
+      "learning_rate": 9.74763488759134e-12,
+      "loss": 0.2605599880218506,
+      "step": 1640,
+      "token_acc": 0.9109949846594887
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1641,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1566399809454080.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

video_preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,54 @@

+{
+  "crop_size": null,
+  "data_format": "channels_first",
+  "default_to_square": true,
+  "device": null,
+  "dither": 0.0,
+  "do_center_crop": null,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "do_sample_frames": false,
+  "feature_extractor_type": "WhisperFeatureExtractor",
+  "feature_size": 128,
+  "fps": null,
+  "hop_length": 160,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "input_data_format": null,
+  "max_frames": 768,
+  "max_pixels": 12845056,
+  "merge_size": 2,
+  "min_frames": 4,
+  "min_pixels": 3136,
+  "n_fft": 400,
+  "n_samples": 4800000,
+  "nb_max_frames": 30000,
+  "num_frames": null,
+  "pad_size": null,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "patch_size": 16,
+  "processor_class": "Qwen3OmniMoeProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "return_attention_mask": true,
+  "return_metadata": false,
+  "sampling_rate": 16000,
+  "size": {
+    "longest_edge": 12845056,
+    "shortest_edge": 3136
+  },
+  "temporal_patch_size": 2,
+  "video_metadata": null,
+  "video_processor_type": "Qwen2VLVideoProcessor"
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff