Hyggge commited on
Commit
64c250f
·
1 Parent(s): 93b1113

feat: modify file type of *.py, *.txt, etc. to change storage method

Browse files
.gitattributes CHANGED
@@ -38,3 +38,4 @@ model-00004-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
38
  model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
39
  model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
40
  valley_structure.png filter=lfs diff=lfs merge=lfs -text
 
 
38
  model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
39
  model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
40
  valley_structure.png filter=lfs diff=lfs merge=lfs -text
41
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json CHANGED
@@ -1,3 +1,34 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:87e5cd31a0e03650b23635178999a8a3942978e9270f041f3cf33ee3270c252f
3
- size 839
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<\\cor>": 151674,
6
+ "<cor>": 151673,
7
+ "<im_end>": 151670,
8
+ "<im_start>": 151669,
9
+ "<think>": 151667,
10
+ "<tool_call>": 151657,
11
+ "<tool_response>": 151665,
12
+ "<vi_end>": 151672,
13
+ "<vi_start>": 151671,
14
+ "<|box_end|>": 151649,
15
+ "<|box_start|>": 151648,
16
+ "<|endoftext|>": 151643,
17
+ "<|file_sep|>": 151664,
18
+ "<|fim_middle|>": 151660,
19
+ "<|fim_pad|>": 151662,
20
+ "<|fim_prefix|>": 151659,
21
+ "<|fim_suffix|>": 151661,
22
+ "<|im_end|>": 151645,
23
+ "<|im_start|>": 151644,
24
+ "<|image_pad|>": 151655,
25
+ "<|object_ref_end|>": 151647,
26
+ "<|object_ref_start|>": 151646,
27
+ "<|quad_end|>": 151651,
28
+ "<|quad_start|>": 151650,
29
+ "<|repo_name|>": 151663,
30
+ "<|video_pad|>": 151656,
31
+ "<|vision_end|>": 151653,
32
+ "<|vision_pad|>": 151654,
33
+ "<|vision_start|>": 151652
34
+ }
chat_template.jinja CHANGED
@@ -1,3 +1,85 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:87a2728cb8dc9fe424d624542f6060ec05a1d285ebbec578bb078900e33396b5
3
- size 4116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
27
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
28
+ {%- elif message.role == "assistant" %}
29
+ {%- set content = message.content %}
30
+ {%- set reasoning_content = '' %}
31
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
32
+ {%- set reasoning_content = message.reasoning_content %}
33
+ {%- else %}
34
+ {%- if '</think>' in message.content %}
35
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
36
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
37
+ {%- endif %}
38
+ {%- endif %}
39
+ {%- if loop.index0 > ns.last_query_index %}
40
+ {%- if loop.last or (not loop.last and reasoning_content) %}
41
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
42
+ {%- else %}
43
+ {{- '<|im_start|>' + message.role + '\n' + content }}
44
+ {%- endif %}
45
+ {%- else %}
46
+ {{- '<|im_start|>' + message.role + '\n' + content }}
47
+ {%- endif %}
48
+ {%- if message.tool_calls %}
49
+ {%- for tool_call in message.tool_calls %}
50
+ {%- if (loop.first and content) or (not loop.first) %}
51
+ {{- '\n' }}
52
+ {%- endif %}
53
+ {%- if tool_call.function %}
54
+ {%- set tool_call = tool_call.function %}
55
+ {%- endif %}
56
+ {{- '<tool_call>\n{"name": "' }}
57
+ {{- tool_call.name }}
58
+ {{- '", "arguments": ' }}
59
+ {%- if tool_call.arguments is string %}
60
+ {{- tool_call.arguments }}
61
+ {%- else %}
62
+ {{- tool_call.arguments | tojson }}
63
+ {%- endif %}
64
+ {{- '}\n</tool_call>' }}
65
+ {%- endfor %}
66
+ {%- endif %}
67
+ {{- '<|im_end|>\n' }}
68
+ {%- elif message.role == "tool" %}
69
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
70
+ {{- '<|im_start|>user' }}
71
+ {%- endif %}
72
+ {{- '\n<tool_response>\n' }}
73
+ {{- message.content }}
74
+ {{- '\n</tool_response>' }}
75
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
76
+ {{- '<|im_end|>\n' }}
77
+ {%- endif %}
78
+ {%- endif %}
79
+ {%- endfor %}
80
+ {%- if add_generation_prompt %}
81
+ {{- '<|im_start|>assistant\n' }}
82
+ {%- if enable_thinking is defined and enable_thinking is false %}
83
+ {{- '<think>\n\n</think>\n\n' }}
84
+ {%- endif %}
85
+ {%- endif %}
chat_template.json CHANGED
@@ -1,3 +1,3 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8eedba4e39df3e45cccc86e7681c3c58fd90199fb601a8ab2b430be8b89bf8b3
3
- size 4306
 
1
+ {
2
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = 
message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}"
3
+ }
generation_config.json CHANGED
@@ -1,3 +1,7 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f307d8ac4048390cd67f8cd0111b62d14b82b613213c4ec76aa6e9873d8505e1
3
- size 142
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": 151645,
4
+ "pad_token_id": 151643,
5
+ "transformers_version": "4.54.0",
6
+ "use_cache": true
7
+ }
merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
model.safetensors.index.json CHANGED
@@ -1,3 +1,798 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:47309e187318e605ae100a165c50e5202a12171431bd91c4d5ee691942d7d5f9
3
- size 69452
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 9423832576,
4
+ "total_size": 18847665152
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.input_layernorm.weight": "model-00003-of-00004.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
12
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
18
+ "model.layers.0.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
28
+ "model.layers.1.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
30
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
31
+ "model.layers.10.input_layernorm.weight": "model-00003-of-00004.safetensors",
32
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
33
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
34
+ "model.layers.10.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
35
+ "model.layers.10.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
36
+ "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
37
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
38
+ "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
39
+ "model.layers.10.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
40
+ "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
41
+ "model.layers.10.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
42
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.11.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
44
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
45
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
46
+ "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
47
+ "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
48
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
49
+ "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
50
+ "model.layers.11.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
51
+ "model.layers.11.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
52
+ "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
53
+ "model.layers.12.input_layernorm.weight": "model-00004-of-00004.safetensors",
54
+ "model.layers.12.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
55
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
56
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
58
+ "model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
60
+ "model.layers.12.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
61
+ "model.layers.12.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
62
+ "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
63
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors",
65
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
67
+ "model.layers.13.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
68
+ "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
69
+ "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
70
+ "model.layers.13.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
71
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
73
+ "model.layers.13.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
74
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.14.input_layernorm.weight": "model-00003-of-00004.safetensors",
76
+ "model.layers.14.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
77
+ "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
78
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.14.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
80
+ "model.layers.14.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
81
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
82
+ "model.layers.14.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
83
+ "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
84
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
86
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
89
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
93
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
96
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
97
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00004.safetensors",
98
+ "model.layers.16.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
99
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
101
+ "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
102
+ "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
103
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
105
+ "model.layers.16.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
106
+ "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
107
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
108
+ "model.layers.17.input_layernorm.weight": "model-00004-of-00004.safetensors",
109
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
110
+ "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
111
+ "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
112
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
113
+ "model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
114
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
115
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
116
+ "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
117
+ "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
118
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
119
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00004.safetensors",
120
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
121
+ "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
122
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
123
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
124
+ "model.layers.18.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
125
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
126
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
127
+ "model.layers.18.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
128
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
129
+ "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.19.input_layernorm.weight": "model-00004-of-00004.safetensors",
131
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
133
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
134
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
135
+ "model.layers.19.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
136
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
137
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
138
+ "model.layers.19.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
139
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
140
+ "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
141
+ "model.layers.2.input_layernorm.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.2.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
143
+ "model.layers.2.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
144
+ "model.layers.2.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
145
+ "model.layers.2.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.2.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
147
+ "model.layers.2.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
149
+ "model.layers.2.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
150
+ "model.layers.2.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
151
+ "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
152
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
153
+ "model.layers.20.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
154
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
155
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
156
+ "model.layers.20.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
157
+ "model.layers.20.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
158
+ "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
159
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.layers.20.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
161
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
162
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
163
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
164
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
165
+ "model.layers.21.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
166
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
167
+ "model.layers.21.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
168
+ "model.layers.21.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
169
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
170
+ "model.layers.21.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
171
+ "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
172
+ "model.layers.21.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
173
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
174
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
176
+ "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
177
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
178
+ "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
179
+ "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
180
+ "model.layers.22.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
181
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
182
+ "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
183
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
184
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
185
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00004.safetensors",
186
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
187
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
188
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
189
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
190
+ "model.layers.23.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
191
+ "model.layers.23.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
192
+ "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
193
+ "model.layers.23.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
194
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
195
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00004.safetensors",
197
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
198
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
199
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
200
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
201
+ "model.layers.24.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
202
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.24.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
204
+ "model.layers.24.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
205
+ "model.layers.24.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
206
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.25.input_layernorm.weight": "model-00004-of-00004.safetensors",
208
+ "model.layers.25.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
209
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
210
+ "model.layers.25.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
211
+ "model.layers.25.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
212
+ "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
213
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
216
+ "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
217
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
218
+ "model.layers.26.input_layernorm.weight": "model-00004-of-00004.safetensors",
219
+ "model.layers.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
220
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
221
+ "model.layers.26.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
222
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
223
+ "model.layers.26.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
224
+ "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
225
+ "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
226
+ "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
227
+ "model.layers.26.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
228
+ "model.layers.26.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
229
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00004.safetensors",
230
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
232
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
233
+ "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
234
+ "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
235
+ "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
236
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
237
+ "model.layers.27.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
238
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
239
+ "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
240
+ "model.layers.28.input_layernorm.weight": "model-00001-of-00004.safetensors",
241
+ "model.layers.28.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
242
+ "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
243
+ "model.layers.28.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
244
+ "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
245
+ "model.layers.28.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
246
+ "model.layers.28.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
247
+ "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
248
+ "model.layers.28.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
249
+ "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
250
+ "model.layers.28.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
251
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
252
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
253
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
254
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
255
+ "model.layers.29.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
256
+ "model.layers.29.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
257
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
258
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
259
+ "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
260
+ "model.layers.29.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
261
+ "model.layers.29.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.3.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
264
+ "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
265
+ "model.layers.3.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
266
+ "model.layers.3.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
267
+ "model.layers.3.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
268
+ "model.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
269
+ "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
270
+ "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
272
+ "model.layers.3.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
273
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
274
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
275
+ "model.layers.30.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.30.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
277
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
278
+ "model.layers.30.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
279
+ "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
280
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
281
+ "model.layers.30.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
282
+ "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
283
+ "model.layers.30.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
284
+ "model.layers.31.input_layernorm.weight": "model-00002-of-00004.safetensors",
285
+ "model.layers.31.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
286
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
287
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
288
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
289
+ "model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
290
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
291
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
292
+ "model.layers.31.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
293
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
294
+ "model.layers.31.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
295
+ "model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors",
296
+ "model.layers.32.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
297
+ "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
298
+ "model.layers.32.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
299
+ "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
300
+ "model.layers.32.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
301
+ "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
302
+ "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
303
+ "model.layers.32.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
304
+ "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
305
+ "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
306
+ "model.layers.33.input_layernorm.weight": "model-00004-of-00004.safetensors",
307
+ "model.layers.33.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
308
+ "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
309
+ "model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
310
+ "model.layers.33.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
311
+ "model.layers.33.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
312
+ "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
313
+ "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
314
+ "model.layers.33.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
315
+ "model.layers.33.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
316
+ "model.layers.33.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
317
+ "model.layers.34.input_layernorm.weight": "model-00001-of-00004.safetensors",
318
+ "model.layers.34.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
319
+ "model.layers.34.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
320
+ "model.layers.34.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
321
+ "model.layers.34.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
322
+ "model.layers.34.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
323
+ "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
324
+ "model.layers.34.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
325
+ "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
326
+ "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
327
+ "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
328
+ "model.layers.35.input_layernorm.weight": "model-00003-of-00004.safetensors",
329
+ "model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
330
+ "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
331
+ "model.layers.35.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
332
+ "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
333
+ "model.layers.35.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
334
+ "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
335
+ "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
336
+ "model.layers.35.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
337
+ "model.layers.35.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
338
+ "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
339
+ "model.layers.4.input_layernorm.weight": "model-00002-of-00004.safetensors",
340
+ "model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
341
+ "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
342
+ "model.layers.4.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
343
+ "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
344
+ "model.layers.4.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
345
+ "model.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
346
+ "model.layers.4.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
347
+ "model.layers.4.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
348
+ "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
349
+ "model.layers.4.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
350
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
351
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
352
+ "model.layers.5.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
353
+ "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
354
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
355
+ "model.layers.5.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
356
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
357
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
358
+ "model.layers.5.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
359
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
360
+ "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
361
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
362
+ "model.layers.6.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
363
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
364
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
365
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
366
+ "model.layers.6.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
367
+ "model.layers.6.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
368
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
369
+ "model.layers.6.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
370
+ "model.layers.6.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
371
+ "model.layers.6.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
372
+ "model.layers.7.input_layernorm.weight": "model-00003-of-00004.safetensors",
373
+ "model.layers.7.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
374
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
375
+ "model.layers.7.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
376
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
377
+ "model.layers.7.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
378
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
379
+ "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
380
+ "model.layers.7.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
381
+ "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
382
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
383
+ "model.layers.8.input_layernorm.weight": "model-00004-of-00004.safetensors",
384
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
385
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
386
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
387
+ "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
388
+ "model.layers.8.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
389
+ "model.layers.8.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
390
+ "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
391
+ "model.layers.8.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
392
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
393
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
394
+ "model.layers.9.input_layernorm.weight": "model-00003-of-00004.safetensors",
395
+ "model.layers.9.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
396
+ "model.layers.9.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
397
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
398
+ "model.layers.9.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
399
+ "model.layers.9.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
400
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
401
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
402
+ "model.layers.9.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
403
+ "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
404
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
405
+ "model.norm.weight": "model-00002-of-00004.safetensors",
406
+ "model.qwen2vl_vision_tower.blocks.0.attn.proj.bias": "model-00004-of-00004.safetensors",
407
+ "model.qwen2vl_vision_tower.blocks.0.attn.proj.weight": "model-00003-of-00004.safetensors",
408
+ "model.qwen2vl_vision_tower.blocks.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
409
+ "model.qwen2vl_vision_tower.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
410
+ "model.qwen2vl_vision_tower.blocks.0.mlp.fc1.bias": "model-00001-of-00004.safetensors",
411
+ "model.qwen2vl_vision_tower.blocks.0.mlp.fc1.weight": "model-00002-of-00004.safetensors",
412
+ "model.qwen2vl_vision_tower.blocks.0.mlp.fc2.bias": "model-00004-of-00004.safetensors",
413
+ "model.qwen2vl_vision_tower.blocks.0.mlp.fc2.weight": "model-00002-of-00004.safetensors",
414
+ "model.qwen2vl_vision_tower.blocks.0.norm1.bias": "model-00003-of-00004.safetensors",
415
+ "model.qwen2vl_vision_tower.blocks.0.norm1.weight": "model-00003-of-00004.safetensors",
416
+ "model.qwen2vl_vision_tower.blocks.0.norm2.bias": "model-00001-of-00004.safetensors",
417
+ "model.qwen2vl_vision_tower.blocks.0.norm2.weight": "model-00004-of-00004.safetensors",
418
+ "model.qwen2vl_vision_tower.blocks.1.attn.proj.bias": "model-00002-of-00004.safetensors",
419
+ "model.qwen2vl_vision_tower.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors",
420
+ "model.qwen2vl_vision_tower.blocks.1.attn.qkv.bias": "model-00002-of-00004.safetensors",
421
+ "model.qwen2vl_vision_tower.blocks.1.attn.qkv.weight": "model-00004-of-00004.safetensors",
422
+ "model.qwen2vl_vision_tower.blocks.1.mlp.fc1.bias": "model-00001-of-00004.safetensors",
423
+ "model.qwen2vl_vision_tower.blocks.1.mlp.fc1.weight": "model-00001-of-00004.safetensors",
424
+ "model.qwen2vl_vision_tower.blocks.1.mlp.fc2.bias": "model-00003-of-00004.safetensors",
425
+ "model.qwen2vl_vision_tower.blocks.1.mlp.fc2.weight": "model-00002-of-00004.safetensors",
426
+ "model.qwen2vl_vision_tower.blocks.1.norm1.bias": "model-00001-of-00004.safetensors",
427
+ "model.qwen2vl_vision_tower.blocks.1.norm1.weight": "model-00001-of-00004.safetensors",
428
+ "model.qwen2vl_vision_tower.blocks.1.norm2.bias": "model-00003-of-00004.safetensors",
429
+ "model.qwen2vl_vision_tower.blocks.1.norm2.weight": "model-00002-of-00004.safetensors",
430
+ "model.qwen2vl_vision_tower.blocks.10.attn.proj.bias": "model-00004-of-00004.safetensors",
431
+ "model.qwen2vl_vision_tower.blocks.10.attn.proj.weight": "model-00002-of-00004.safetensors",
432
+ "model.qwen2vl_vision_tower.blocks.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
433
+ "model.qwen2vl_vision_tower.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
434
+ "model.qwen2vl_vision_tower.blocks.10.mlp.fc1.bias": "model-00004-of-00004.safetensors",
435
+ "model.qwen2vl_vision_tower.blocks.10.mlp.fc1.weight": "model-00001-of-00004.safetensors",
436
+ "model.qwen2vl_vision_tower.blocks.10.mlp.fc2.bias": "model-00004-of-00004.safetensors",
437
+ "model.qwen2vl_vision_tower.blocks.10.mlp.fc2.weight": "model-00004-of-00004.safetensors",
438
+ "model.qwen2vl_vision_tower.blocks.10.norm1.bias": "model-00002-of-00004.safetensors",
439
+ "model.qwen2vl_vision_tower.blocks.10.norm1.weight": "model-00004-of-00004.safetensors",
440
+ "model.qwen2vl_vision_tower.blocks.10.norm2.bias": "model-00002-of-00004.safetensors",
441
+ "model.qwen2vl_vision_tower.blocks.10.norm2.weight": "model-00001-of-00004.safetensors",
442
+ "model.qwen2vl_vision_tower.blocks.11.attn.proj.bias": "model-00003-of-00004.safetensors",
443
+ "model.qwen2vl_vision_tower.blocks.11.attn.proj.weight": "model-00003-of-00004.safetensors",
444
+ "model.qwen2vl_vision_tower.blocks.11.attn.qkv.bias": "model-00004-of-00004.safetensors",
445
+ "model.qwen2vl_vision_tower.blocks.11.attn.qkv.weight": "model-00002-of-00004.safetensors",
446
+ "model.qwen2vl_vision_tower.blocks.11.mlp.fc1.bias": "model-00004-of-00004.safetensors",
447
+ "model.qwen2vl_vision_tower.blocks.11.mlp.fc1.weight": "model-00001-of-00004.safetensors",
448
+ "model.qwen2vl_vision_tower.blocks.11.mlp.fc2.bias": "model-00002-of-00004.safetensors",
449
+ "model.qwen2vl_vision_tower.blocks.11.mlp.fc2.weight": "model-00002-of-00004.safetensors",
450
+ "model.qwen2vl_vision_tower.blocks.11.norm1.bias": "model-00002-of-00004.safetensors",
451
+ "model.qwen2vl_vision_tower.blocks.11.norm1.weight": "model-00003-of-00004.safetensors",
452
+ "model.qwen2vl_vision_tower.blocks.11.norm2.bias": "model-00001-of-00004.safetensors",
453
+ "model.qwen2vl_vision_tower.blocks.11.norm2.weight": "model-00001-of-00004.safetensors",
454
+ "model.qwen2vl_vision_tower.blocks.12.attn.proj.bias": "model-00002-of-00004.safetensors",
455
+ "model.qwen2vl_vision_tower.blocks.12.attn.proj.weight": "model-00002-of-00004.safetensors",
456
+ "model.qwen2vl_vision_tower.blocks.12.attn.qkv.bias": "model-00002-of-00004.safetensors",
457
+ "model.qwen2vl_vision_tower.blocks.12.attn.qkv.weight": "model-00004-of-00004.safetensors",
458
+ "model.qwen2vl_vision_tower.blocks.12.mlp.fc1.bias": "model-00002-of-00004.safetensors",
459
+ "model.qwen2vl_vision_tower.blocks.12.mlp.fc1.weight": "model-00003-of-00004.safetensors",
460
+ "model.qwen2vl_vision_tower.blocks.12.mlp.fc2.bias": "model-00002-of-00004.safetensors",
461
+ "model.qwen2vl_vision_tower.blocks.12.mlp.fc2.weight": "model-00004-of-00004.safetensors",
462
+ "model.qwen2vl_vision_tower.blocks.12.norm1.bias": "model-00003-of-00004.safetensors",
463
+ "model.qwen2vl_vision_tower.blocks.12.norm1.weight": "model-00001-of-00004.safetensors",
464
+ "model.qwen2vl_vision_tower.blocks.12.norm2.bias": "model-00001-of-00004.safetensors",
465
+ "model.qwen2vl_vision_tower.blocks.12.norm2.weight": "model-00002-of-00004.safetensors",
466
+ "model.qwen2vl_vision_tower.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors",
467
+ "model.qwen2vl_vision_tower.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors",
468
+ "model.qwen2vl_vision_tower.blocks.13.attn.qkv.bias": "model-00002-of-00004.safetensors",
469
+ "model.qwen2vl_vision_tower.blocks.13.attn.qkv.weight": "model-00002-of-00004.safetensors",
470
+ "model.qwen2vl_vision_tower.blocks.13.mlp.fc1.bias": "model-00002-of-00004.safetensors",
471
+ "model.qwen2vl_vision_tower.blocks.13.mlp.fc1.weight": "model-00002-of-00004.safetensors",
472
+ "model.qwen2vl_vision_tower.blocks.13.mlp.fc2.bias": "model-00003-of-00004.safetensors",
473
+ "model.qwen2vl_vision_tower.blocks.13.mlp.fc2.weight": "model-00004-of-00004.safetensors",
474
+ "model.qwen2vl_vision_tower.blocks.13.norm1.bias": "model-00004-of-00004.safetensors",
475
+ "model.qwen2vl_vision_tower.blocks.13.norm1.weight": "model-00004-of-00004.safetensors",
476
+ "model.qwen2vl_vision_tower.blocks.13.norm2.bias": "model-00004-of-00004.safetensors",
477
+ "model.qwen2vl_vision_tower.blocks.13.norm2.weight": "model-00001-of-00004.safetensors",
478
+ "model.qwen2vl_vision_tower.blocks.14.attn.proj.bias": "model-00002-of-00004.safetensors",
479
+ "model.qwen2vl_vision_tower.blocks.14.attn.proj.weight": "model-00003-of-00004.safetensors",
480
+ "model.qwen2vl_vision_tower.blocks.14.attn.qkv.bias": "model-00002-of-00004.safetensors",
481
+ "model.qwen2vl_vision_tower.blocks.14.attn.qkv.weight": "model-00004-of-00004.safetensors",
482
+ "model.qwen2vl_vision_tower.blocks.14.mlp.fc1.bias": "model-00004-of-00004.safetensors",
483
+ "model.qwen2vl_vision_tower.blocks.14.mlp.fc1.weight": "model-00001-of-00004.safetensors",
484
+ "model.qwen2vl_vision_tower.blocks.14.mlp.fc2.bias": "model-00001-of-00004.safetensors",
485
+ "model.qwen2vl_vision_tower.blocks.14.mlp.fc2.weight": "model-00004-of-00004.safetensors",
486
+ "model.qwen2vl_vision_tower.blocks.14.norm1.bias": "model-00004-of-00004.safetensors",
487
+ "model.qwen2vl_vision_tower.blocks.14.norm1.weight": "model-00002-of-00004.safetensors",
488
+ "model.qwen2vl_vision_tower.blocks.14.norm2.bias": "model-00003-of-00004.safetensors",
489
+ "model.qwen2vl_vision_tower.blocks.14.norm2.weight": "model-00001-of-00004.safetensors",
490
+ "model.qwen2vl_vision_tower.blocks.15.attn.proj.bias": "model-00004-of-00004.safetensors",
491
+ "model.qwen2vl_vision_tower.blocks.15.attn.proj.weight": "model-00002-of-00004.safetensors",
492
+ "model.qwen2vl_vision_tower.blocks.15.attn.qkv.bias": "model-00003-of-00004.safetensors",
493
+ "model.qwen2vl_vision_tower.blocks.15.attn.qkv.weight": "model-00002-of-00004.safetensors",
494
+ "model.qwen2vl_vision_tower.blocks.15.mlp.fc1.bias": "model-00001-of-00004.safetensors",
495
+ "model.qwen2vl_vision_tower.blocks.15.mlp.fc1.weight": "model-00004-of-00004.safetensors",
496
+ "model.qwen2vl_vision_tower.blocks.15.mlp.fc2.bias": "model-00004-of-00004.safetensors",
497
+ "model.qwen2vl_vision_tower.blocks.15.mlp.fc2.weight": "model-00003-of-00004.safetensors",
498
+ "model.qwen2vl_vision_tower.blocks.15.norm1.bias": "model-00002-of-00004.safetensors",
499
+ "model.qwen2vl_vision_tower.blocks.15.norm1.weight": "model-00003-of-00004.safetensors",
500
+ "model.qwen2vl_vision_tower.blocks.15.norm2.bias": "model-00004-of-00004.safetensors",
501
+ "model.qwen2vl_vision_tower.blocks.15.norm2.weight": "model-00002-of-00004.safetensors",
502
+ "model.qwen2vl_vision_tower.blocks.16.attn.proj.bias": "model-00002-of-00004.safetensors",
503
+ "model.qwen2vl_vision_tower.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors",
504
+ "model.qwen2vl_vision_tower.blocks.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
505
+ "model.qwen2vl_vision_tower.blocks.16.attn.qkv.weight": "model-00003-of-00004.safetensors",
506
+ "model.qwen2vl_vision_tower.blocks.16.mlp.fc1.bias": "model-00004-of-00004.safetensors",
507
+ "model.qwen2vl_vision_tower.blocks.16.mlp.fc1.weight": "model-00004-of-00004.safetensors",
508
+ "model.qwen2vl_vision_tower.blocks.16.mlp.fc2.bias": "model-00001-of-00004.safetensors",
509
+ "model.qwen2vl_vision_tower.blocks.16.mlp.fc2.weight": "model-00003-of-00004.safetensors",
510
+ "model.qwen2vl_vision_tower.blocks.16.norm1.bias": "model-00003-of-00004.safetensors",
511
+ "model.qwen2vl_vision_tower.blocks.16.norm1.weight": "model-00002-of-00004.safetensors",
512
+ "model.qwen2vl_vision_tower.blocks.16.norm2.bias": "model-00001-of-00004.safetensors",
513
+ "model.qwen2vl_vision_tower.blocks.16.norm2.weight": "model-00003-of-00004.safetensors",
514
+ "model.qwen2vl_vision_tower.blocks.17.attn.proj.bias": "model-00002-of-00004.safetensors",
515
+ "model.qwen2vl_vision_tower.blocks.17.attn.proj.weight": "model-00004-of-00004.safetensors",
516
+ "model.qwen2vl_vision_tower.blocks.17.attn.qkv.bias": "model-00003-of-00004.safetensors",
517
+ "model.qwen2vl_vision_tower.blocks.17.attn.qkv.weight": "model-00003-of-00004.safetensors",
518
+ "model.qwen2vl_vision_tower.blocks.17.mlp.fc1.bias": "model-00004-of-00004.safetensors",
519
+ "model.qwen2vl_vision_tower.blocks.17.mlp.fc1.weight": "model-00002-of-00004.safetensors",
520
+ "model.qwen2vl_vision_tower.blocks.17.mlp.fc2.bias": "model-00003-of-00004.safetensors",
521
+ "model.qwen2vl_vision_tower.blocks.17.mlp.fc2.weight": "model-00003-of-00004.safetensors",
522
+ "model.qwen2vl_vision_tower.blocks.17.norm1.bias": "model-00004-of-00004.safetensors",
523
+ "model.qwen2vl_vision_tower.blocks.17.norm1.weight": "model-00003-of-00004.safetensors",
524
+ "model.qwen2vl_vision_tower.blocks.17.norm2.bias": "model-00001-of-00004.safetensors",
525
+ "model.qwen2vl_vision_tower.blocks.17.norm2.weight": "model-00003-of-00004.safetensors",
526
+ "model.qwen2vl_vision_tower.blocks.18.attn.proj.bias": "model-00002-of-00004.safetensors",
527
+ "model.qwen2vl_vision_tower.blocks.18.attn.proj.weight": "model-00002-of-00004.safetensors",
528
+ "model.qwen2vl_vision_tower.blocks.18.attn.qkv.bias": "model-00003-of-00004.safetensors",
529
+ "model.qwen2vl_vision_tower.blocks.18.attn.qkv.weight": "model-00002-of-00004.safetensors",
530
+ "model.qwen2vl_vision_tower.blocks.18.mlp.fc1.bias": "model-00004-of-00004.safetensors",
531
+ "model.qwen2vl_vision_tower.blocks.18.mlp.fc1.weight": "model-00002-of-00004.safetensors",
532
+ "model.qwen2vl_vision_tower.blocks.18.mlp.fc2.bias": "model-00004-of-00004.safetensors",
533
+ "model.qwen2vl_vision_tower.blocks.18.mlp.fc2.weight": "model-00002-of-00004.safetensors",
534
+ "model.qwen2vl_vision_tower.blocks.18.norm1.bias": "model-00001-of-00004.safetensors",
535
+ "model.qwen2vl_vision_tower.blocks.18.norm1.weight": "model-00002-of-00004.safetensors",
536
+ "model.qwen2vl_vision_tower.blocks.18.norm2.bias": "model-00004-of-00004.safetensors",
537
+ "model.qwen2vl_vision_tower.blocks.18.norm2.weight": "model-00004-of-00004.safetensors",
538
+ "model.qwen2vl_vision_tower.blocks.19.attn.proj.bias": "model-00004-of-00004.safetensors",
539
+ "model.qwen2vl_vision_tower.blocks.19.attn.proj.weight": "model-00002-of-00004.safetensors",
540
+ "model.qwen2vl_vision_tower.blocks.19.attn.qkv.bias": "model-00004-of-00004.safetensors",
541
+ "model.qwen2vl_vision_tower.blocks.19.attn.qkv.weight": "model-00004-of-00004.safetensors",
542
+ "model.qwen2vl_vision_tower.blocks.19.mlp.fc1.bias": "model-00001-of-00004.safetensors",
543
+ "model.qwen2vl_vision_tower.blocks.19.mlp.fc1.weight": "model-00003-of-00004.safetensors",
544
+ "model.qwen2vl_vision_tower.blocks.19.mlp.fc2.bias": "model-00003-of-00004.safetensors",
545
+ "model.qwen2vl_vision_tower.blocks.19.mlp.fc2.weight": "model-00003-of-00004.safetensors",
546
+ "model.qwen2vl_vision_tower.blocks.19.norm1.bias": "model-00003-of-00004.safetensors",
547
+ "model.qwen2vl_vision_tower.blocks.19.norm1.weight": "model-00004-of-00004.safetensors",
548
+ "model.qwen2vl_vision_tower.blocks.19.norm2.bias": "model-00004-of-00004.safetensors",
549
+ "model.qwen2vl_vision_tower.blocks.19.norm2.weight": "model-00002-of-00004.safetensors",
550
+ "model.qwen2vl_vision_tower.blocks.2.attn.proj.bias": "model-00002-of-00004.safetensors",
551
+ "model.qwen2vl_vision_tower.blocks.2.attn.proj.weight": "model-00003-of-00004.safetensors",
552
+ "model.qwen2vl_vision_tower.blocks.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
553
+ "model.qwen2vl_vision_tower.blocks.2.attn.qkv.weight": "model-00002-of-00004.safetensors",
554
+ "model.qwen2vl_vision_tower.blocks.2.mlp.fc1.bias": "model-00002-of-00004.safetensors",
555
+ "model.qwen2vl_vision_tower.blocks.2.mlp.fc1.weight": "model-00004-of-00004.safetensors",
556
+ "model.qwen2vl_vision_tower.blocks.2.mlp.fc2.bias": "model-00002-of-00004.safetensors",
557
+ "model.qwen2vl_vision_tower.blocks.2.mlp.fc2.weight": "model-00004-of-00004.safetensors",
558
+ "model.qwen2vl_vision_tower.blocks.2.norm1.bias": "model-00001-of-00004.safetensors",
559
+ "model.qwen2vl_vision_tower.blocks.2.norm1.weight": "model-00002-of-00004.safetensors",
560
+ "model.qwen2vl_vision_tower.blocks.2.norm2.bias": "model-00002-of-00004.safetensors",
561
+ "model.qwen2vl_vision_tower.blocks.2.norm2.weight": "model-00001-of-00004.safetensors",
562
+ "model.qwen2vl_vision_tower.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors",
563
+ "model.qwen2vl_vision_tower.blocks.20.attn.proj.weight": "model-00003-of-00004.safetensors",
564
+ "model.qwen2vl_vision_tower.blocks.20.attn.qkv.bias": "model-00003-of-00004.safetensors",
565
+ "model.qwen2vl_vision_tower.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
566
+ "model.qwen2vl_vision_tower.blocks.20.mlp.fc1.bias": "model-00004-of-00004.safetensors",
567
+ "model.qwen2vl_vision_tower.blocks.20.mlp.fc1.weight": "model-00001-of-00004.safetensors",
568
+ "model.qwen2vl_vision_tower.blocks.20.mlp.fc2.bias": "model-00004-of-00004.safetensors",
569
+ "model.qwen2vl_vision_tower.blocks.20.mlp.fc2.weight": "model-00003-of-00004.safetensors",
570
+ "model.qwen2vl_vision_tower.blocks.20.norm1.bias": "model-00001-of-00004.safetensors",
571
+ "model.qwen2vl_vision_tower.blocks.20.norm1.weight": "model-00003-of-00004.safetensors",
572
+ "model.qwen2vl_vision_tower.blocks.20.norm2.bias": "model-00003-of-00004.safetensors",
573
+ "model.qwen2vl_vision_tower.blocks.20.norm2.weight": "model-00002-of-00004.safetensors",
574
+ "model.qwen2vl_vision_tower.blocks.21.attn.proj.bias": "model-00003-of-00004.safetensors",
575
+ "model.qwen2vl_vision_tower.blocks.21.attn.proj.weight": "model-00003-of-00004.safetensors",
576
+ "model.qwen2vl_vision_tower.blocks.21.attn.qkv.bias": "model-00003-of-00004.safetensors",
577
+ "model.qwen2vl_vision_tower.blocks.21.attn.qkv.weight": "model-00002-of-00004.safetensors",
578
+ "model.qwen2vl_vision_tower.blocks.21.mlp.fc1.bias": "model-00001-of-00004.safetensors",
579
+ "model.qwen2vl_vision_tower.blocks.21.mlp.fc1.weight": "model-00003-of-00004.safetensors",
580
+ "model.qwen2vl_vision_tower.blocks.21.mlp.fc2.bias": "model-00002-of-00004.safetensors",
581
+ "model.qwen2vl_vision_tower.blocks.21.mlp.fc2.weight": "model-00004-of-00004.safetensors",
582
+ "model.qwen2vl_vision_tower.blocks.21.norm1.bias": "model-00002-of-00004.safetensors",
583
+ "model.qwen2vl_vision_tower.blocks.21.norm1.weight": "model-00004-of-00004.safetensors",
584
+ "model.qwen2vl_vision_tower.blocks.21.norm2.bias": "model-00002-of-00004.safetensors",
585
+ "model.qwen2vl_vision_tower.blocks.21.norm2.weight": "model-00004-of-00004.safetensors",
586
+ "model.qwen2vl_vision_tower.blocks.22.attn.proj.bias": "model-00004-of-00004.safetensors",
587
+ "model.qwen2vl_vision_tower.blocks.22.attn.proj.weight": "model-00003-of-00004.safetensors",
588
+ "model.qwen2vl_vision_tower.blocks.22.attn.qkv.bias": "model-00002-of-00004.safetensors",
589
+ "model.qwen2vl_vision_tower.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
590
+ "model.qwen2vl_vision_tower.blocks.22.mlp.fc1.bias": "model-00002-of-00004.safetensors",
591
+ "model.qwen2vl_vision_tower.blocks.22.mlp.fc1.weight": "model-00002-of-00004.safetensors",
592
+ "model.qwen2vl_vision_tower.blocks.22.mlp.fc2.bias": "model-00002-of-00004.safetensors",
593
+ "model.qwen2vl_vision_tower.blocks.22.mlp.fc2.weight": "model-00002-of-00004.safetensors",
594
+ "model.qwen2vl_vision_tower.blocks.22.norm1.bias": "model-00004-of-00004.safetensors",
595
+ "model.qwen2vl_vision_tower.blocks.22.norm1.weight": "model-00002-of-00004.safetensors",
596
+ "model.qwen2vl_vision_tower.blocks.22.norm2.bias": "model-00003-of-00004.safetensors",
597
+ "model.qwen2vl_vision_tower.blocks.22.norm2.weight": "model-00002-of-00004.safetensors",
598
+ "model.qwen2vl_vision_tower.blocks.23.attn.proj.bias": "model-00003-of-00004.safetensors",
599
+ "model.qwen2vl_vision_tower.blocks.23.attn.proj.weight": "model-00004-of-00004.safetensors",
600
+ "model.qwen2vl_vision_tower.blocks.23.attn.qkv.bias": "model-00003-of-00004.safetensors",
601
+ "model.qwen2vl_vision_tower.blocks.23.attn.qkv.weight": "model-00003-of-00004.safetensors",
602
+ "model.qwen2vl_vision_tower.blocks.23.mlp.fc1.bias": "model-00003-of-00004.safetensors",
603
+ "model.qwen2vl_vision_tower.blocks.23.mlp.fc1.weight": "model-00002-of-00004.safetensors",
604
+ "model.qwen2vl_vision_tower.blocks.23.mlp.fc2.bias": "model-00003-of-00004.safetensors",
605
+ "model.qwen2vl_vision_tower.blocks.23.mlp.fc2.weight": "model-00001-of-00004.safetensors",
606
+ "model.qwen2vl_vision_tower.blocks.23.norm1.bias": "model-00002-of-00004.safetensors",
607
+ "model.qwen2vl_vision_tower.blocks.23.norm1.weight": "model-00002-of-00004.safetensors",
608
+ "model.qwen2vl_vision_tower.blocks.23.norm2.bias": "model-00002-of-00004.safetensors",
609
+ "model.qwen2vl_vision_tower.blocks.23.norm2.weight": "model-00002-of-00004.safetensors",
610
+ "model.qwen2vl_vision_tower.blocks.24.attn.proj.bias": "model-00001-of-00004.safetensors",
611
+ "model.qwen2vl_vision_tower.blocks.24.attn.proj.weight": "model-00004-of-00004.safetensors",
612
+ "model.qwen2vl_vision_tower.blocks.24.attn.qkv.bias": "model-00002-of-00004.safetensors",
613
+ "model.qwen2vl_vision_tower.blocks.24.attn.qkv.weight": "model-00002-of-00004.safetensors",
614
+ "model.qwen2vl_vision_tower.blocks.24.mlp.fc1.bias": "model-00002-of-00004.safetensors",
615
+ "model.qwen2vl_vision_tower.blocks.24.mlp.fc1.weight": "model-00004-of-00004.safetensors",
616
+ "model.qwen2vl_vision_tower.blocks.24.mlp.fc2.bias": "model-00002-of-00004.safetensors",
617
+ "model.qwen2vl_vision_tower.blocks.24.mlp.fc2.weight": "model-00002-of-00004.safetensors",
618
+ "model.qwen2vl_vision_tower.blocks.24.norm1.bias": "model-00001-of-00004.safetensors",
619
+ "model.qwen2vl_vision_tower.blocks.24.norm1.weight": "model-00003-of-00004.safetensors",
620
+ "model.qwen2vl_vision_tower.blocks.24.norm2.bias": "model-00003-of-00004.safetensors",
621
+ "model.qwen2vl_vision_tower.blocks.24.norm2.weight": "model-00002-of-00004.safetensors",
622
+ "model.qwen2vl_vision_tower.blocks.25.attn.proj.bias": "model-00003-of-00004.safetensors",
623
+ "model.qwen2vl_vision_tower.blocks.25.attn.proj.weight": "model-00004-of-00004.safetensors",
624
+ "model.qwen2vl_vision_tower.blocks.25.attn.qkv.bias": "model-00004-of-00004.safetensors",
625
+ "model.qwen2vl_vision_tower.blocks.25.attn.qkv.weight": "model-00002-of-00004.safetensors",
626
+ "model.qwen2vl_vision_tower.blocks.25.mlp.fc1.bias": "model-00003-of-00004.safetensors",
627
+ "model.qwen2vl_vision_tower.blocks.25.mlp.fc1.weight": "model-00003-of-00004.safetensors",
628
+ "model.qwen2vl_vision_tower.blocks.25.mlp.fc2.bias": "model-00004-of-00004.safetensors",
629
+ "model.qwen2vl_vision_tower.blocks.25.mlp.fc2.weight": "model-00002-of-00004.safetensors",
630
+ "model.qwen2vl_vision_tower.blocks.25.norm1.bias": "model-00004-of-00004.safetensors",
631
+ "model.qwen2vl_vision_tower.blocks.25.norm1.weight": "model-00003-of-00004.safetensors",
632
+ "model.qwen2vl_vision_tower.blocks.25.norm2.bias": "model-00004-of-00004.safetensors",
633
+ "model.qwen2vl_vision_tower.blocks.25.norm2.weight": "model-00001-of-00004.safetensors",
634
+ "model.qwen2vl_vision_tower.blocks.26.attn.proj.bias": "model-00001-of-00004.safetensors",
635
+ "model.qwen2vl_vision_tower.blocks.26.attn.proj.weight": "model-00001-of-00004.safetensors",
636
+ "model.qwen2vl_vision_tower.blocks.26.attn.qkv.bias": "model-00001-of-00004.safetensors",
637
+ "model.qwen2vl_vision_tower.blocks.26.attn.qkv.weight": "model-00003-of-00004.safetensors",
638
+ "model.qwen2vl_vision_tower.blocks.26.mlp.fc1.bias": "model-00003-of-00004.safetensors",
639
+ "model.qwen2vl_vision_tower.blocks.26.mlp.fc1.weight": "model-00001-of-00004.safetensors",
640
+ "model.qwen2vl_vision_tower.blocks.26.mlp.fc2.bias": "model-00003-of-00004.safetensors",
641
+ "model.qwen2vl_vision_tower.blocks.26.mlp.fc2.weight": "model-00001-of-00004.safetensors",
642
+ "model.qwen2vl_vision_tower.blocks.26.norm1.bias": "model-00003-of-00004.safetensors",
643
+ "model.qwen2vl_vision_tower.blocks.26.norm1.weight": "model-00002-of-00004.safetensors",
644
+ "model.qwen2vl_vision_tower.blocks.26.norm2.bias": "model-00002-of-00004.safetensors",
645
+ "model.qwen2vl_vision_tower.blocks.26.norm2.weight": "model-00002-of-00004.safetensors",
646
+ "model.qwen2vl_vision_tower.blocks.27.attn.proj.bias": "model-00001-of-00004.safetensors",
647
+ "model.qwen2vl_vision_tower.blocks.27.attn.proj.weight": "model-00001-of-00004.safetensors",
648
+ "model.qwen2vl_vision_tower.blocks.27.attn.qkv.bias": "model-00002-of-00004.safetensors",
649
+ "model.qwen2vl_vision_tower.blocks.27.attn.qkv.weight": "model-00003-of-00004.safetensors",
650
+ "model.qwen2vl_vision_tower.blocks.27.mlp.fc1.bias": "model-00002-of-00004.safetensors",
651
+ "model.qwen2vl_vision_tower.blocks.27.mlp.fc1.weight": "model-00002-of-00004.safetensors",
652
+ "model.qwen2vl_vision_tower.blocks.27.mlp.fc2.bias": "model-00004-of-00004.safetensors",
653
+ "model.qwen2vl_vision_tower.blocks.27.mlp.fc2.weight": "model-00003-of-00004.safetensors",
654
+ "model.qwen2vl_vision_tower.blocks.27.norm1.bias": "model-00002-of-00004.safetensors",
655
+ "model.qwen2vl_vision_tower.blocks.27.norm1.weight": "model-00002-of-00004.safetensors",
656
+ "model.qwen2vl_vision_tower.blocks.27.norm2.bias": "model-00003-of-00004.safetensors",
657
+ "model.qwen2vl_vision_tower.blocks.27.norm2.weight": "model-00003-of-00004.safetensors",
658
+ "model.qwen2vl_vision_tower.blocks.28.attn.proj.bias": "model-00003-of-00004.safetensors",
659
+ "model.qwen2vl_vision_tower.blocks.28.attn.proj.weight": "model-00004-of-00004.safetensors",
660
+ "model.qwen2vl_vision_tower.blocks.28.attn.qkv.bias": "model-00003-of-00004.safetensors",
661
+ "model.qwen2vl_vision_tower.blocks.28.attn.qkv.weight": "model-00004-of-00004.safetensors",
662
+ "model.qwen2vl_vision_tower.blocks.28.mlp.fc1.bias": "model-00004-of-00004.safetensors",
663
+ "model.qwen2vl_vision_tower.blocks.28.mlp.fc1.weight": "model-00002-of-00004.safetensors",
664
+ "model.qwen2vl_vision_tower.blocks.28.mlp.fc2.bias": "model-00003-of-00004.safetensors",
665
+ "model.qwen2vl_vision_tower.blocks.28.mlp.fc2.weight": "model-00002-of-00004.safetensors",
666
+ "model.qwen2vl_vision_tower.blocks.28.norm1.bias": "model-00003-of-00004.safetensors",
667
+ "model.qwen2vl_vision_tower.blocks.28.norm1.weight": "model-00004-of-00004.safetensors",
668
+ "model.qwen2vl_vision_tower.blocks.28.norm2.bias": "model-00001-of-00004.safetensors",
669
+ "model.qwen2vl_vision_tower.blocks.28.norm2.weight": "model-00003-of-00004.safetensors",
670
+ "model.qwen2vl_vision_tower.blocks.29.attn.proj.bias": "model-00002-of-00004.safetensors",
671
+ "model.qwen2vl_vision_tower.blocks.29.attn.proj.weight": "model-00004-of-00004.safetensors",
672
+ "model.qwen2vl_vision_tower.blocks.29.attn.qkv.bias": "model-00003-of-00004.safetensors",
673
+ "model.qwen2vl_vision_tower.blocks.29.attn.qkv.weight": "model-00003-of-00004.safetensors",
674
+ "model.qwen2vl_vision_tower.blocks.29.mlp.fc1.bias": "model-00003-of-00004.safetensors",
675
+ "model.qwen2vl_vision_tower.blocks.29.mlp.fc1.weight": "model-00001-of-00004.safetensors",
676
+ "model.qwen2vl_vision_tower.blocks.29.mlp.fc2.bias": "model-00001-of-00004.safetensors",
677
+ "model.qwen2vl_vision_tower.blocks.29.mlp.fc2.weight": "model-00002-of-00004.safetensors",
678
+ "model.qwen2vl_vision_tower.blocks.29.norm1.bias": "model-00004-of-00004.safetensors",
679
+ "model.qwen2vl_vision_tower.blocks.29.norm1.weight": "model-00003-of-00004.safetensors",
680
+ "model.qwen2vl_vision_tower.blocks.29.norm2.bias": "model-00002-of-00004.safetensors",
681
+ "model.qwen2vl_vision_tower.blocks.29.norm2.weight": "model-00001-of-00004.safetensors",
682
+ "model.qwen2vl_vision_tower.blocks.3.attn.proj.bias": "model-00002-of-00004.safetensors",
683
+ "model.qwen2vl_vision_tower.blocks.3.attn.proj.weight": "model-00003-of-00004.safetensors",
684
+ "model.qwen2vl_vision_tower.blocks.3.attn.qkv.bias": "model-00002-of-00004.safetensors",
685
+ "model.qwen2vl_vision_tower.blocks.3.attn.qkv.weight": "model-00003-of-00004.safetensors",
686
+ "model.qwen2vl_vision_tower.blocks.3.mlp.fc1.bias": "model-00001-of-00004.safetensors",
687
+ "model.qwen2vl_vision_tower.blocks.3.mlp.fc1.weight": "model-00004-of-00004.safetensors",
688
+ "model.qwen2vl_vision_tower.blocks.3.mlp.fc2.bias": "model-00003-of-00004.safetensors",
689
+ "model.qwen2vl_vision_tower.blocks.3.mlp.fc2.weight": "model-00002-of-00004.safetensors",
690
+ "model.qwen2vl_vision_tower.blocks.3.norm1.bias": "model-00003-of-00004.safetensors",
691
+ "model.qwen2vl_vision_tower.blocks.3.norm1.weight": "model-00003-of-00004.safetensors",
692
+ "model.qwen2vl_vision_tower.blocks.3.norm2.bias": "model-00002-of-00004.safetensors",
693
+ "model.qwen2vl_vision_tower.blocks.3.norm2.weight": "model-00001-of-00004.safetensors",
694
+ "model.qwen2vl_vision_tower.blocks.30.attn.proj.bias": "model-00002-of-00004.safetensors",
695
+ "model.qwen2vl_vision_tower.blocks.30.attn.proj.weight": "model-00004-of-00004.safetensors",
696
+ "model.qwen2vl_vision_tower.blocks.30.attn.qkv.bias": "model-00002-of-00004.safetensors",
697
+ "model.qwen2vl_vision_tower.blocks.30.attn.qkv.weight": "model-00002-of-00004.safetensors",
698
+ "model.qwen2vl_vision_tower.blocks.30.mlp.fc1.bias": "model-00002-of-00004.safetensors",
699
+ "model.qwen2vl_vision_tower.blocks.30.mlp.fc1.weight": "model-00003-of-00004.safetensors",
700
+ "model.qwen2vl_vision_tower.blocks.30.mlp.fc2.bias": "model-00001-of-00004.safetensors",
701
+ "model.qwen2vl_vision_tower.blocks.30.mlp.fc2.weight": "model-00004-of-00004.safetensors",
702
+ "model.qwen2vl_vision_tower.blocks.30.norm1.bias": "model-00003-of-00004.safetensors",
703
+ "model.qwen2vl_vision_tower.blocks.30.norm1.weight": "model-00003-of-00004.safetensors",
704
+ "model.qwen2vl_vision_tower.blocks.30.norm2.bias": "model-00004-of-00004.safetensors",
705
+ "model.qwen2vl_vision_tower.blocks.30.norm2.weight": "model-00003-of-00004.safetensors",
706
+ "model.qwen2vl_vision_tower.blocks.31.attn.proj.bias": "model-00003-of-00004.safetensors",
707
+ "model.qwen2vl_vision_tower.blocks.31.attn.proj.weight": "model-00002-of-00004.safetensors",
708
+ "model.qwen2vl_vision_tower.blocks.31.attn.qkv.bias": "model-00004-of-00004.safetensors",
709
+ "model.qwen2vl_vision_tower.blocks.31.attn.qkv.weight": "model-00004-of-00004.safetensors",
710
+ "model.qwen2vl_vision_tower.blocks.31.mlp.fc1.bias": "model-00004-of-00004.safetensors",
711
+ "model.qwen2vl_vision_tower.blocks.31.mlp.fc1.weight": "model-00001-of-00004.safetensors",
712
+ "model.qwen2vl_vision_tower.blocks.31.mlp.fc2.bias": "model-00003-of-00004.safetensors",
713
+ "model.qwen2vl_vision_tower.blocks.31.mlp.fc2.weight": "model-00003-of-00004.safetensors",
714
+ "model.qwen2vl_vision_tower.blocks.31.norm1.bias": "model-00004-of-00004.safetensors",
715
+ "model.qwen2vl_vision_tower.blocks.31.norm1.weight": "model-00004-of-00004.safetensors",
716
+ "model.qwen2vl_vision_tower.blocks.31.norm2.bias": "model-00003-of-00004.safetensors",
717
+ "model.qwen2vl_vision_tower.blocks.31.norm2.weight": "model-00004-of-00004.safetensors",
718
+ "model.qwen2vl_vision_tower.blocks.4.attn.proj.bias": "model-00002-of-00004.safetensors",
719
+ "model.qwen2vl_vision_tower.blocks.4.attn.proj.weight": "model-00004-of-00004.safetensors",
720
+ "model.qwen2vl_vision_tower.blocks.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
721
+ "model.qwen2vl_vision_tower.blocks.4.attn.qkv.weight": "model-00003-of-00004.safetensors",
722
+ "model.qwen2vl_vision_tower.blocks.4.mlp.fc1.bias": "model-00001-of-00004.safetensors",
723
+ "model.qwen2vl_vision_tower.blocks.4.mlp.fc1.weight": "model-00002-of-00004.safetensors",
724
+ "model.qwen2vl_vision_tower.blocks.4.mlp.fc2.bias": "model-00002-of-00004.safetensors",
725
+ "model.qwen2vl_vision_tower.blocks.4.mlp.fc2.weight": "model-00002-of-00004.safetensors",
726
+ "model.qwen2vl_vision_tower.blocks.4.norm1.bias": "model-00003-of-00004.safetensors",
727
+ "model.qwen2vl_vision_tower.blocks.4.norm1.weight": "model-00003-of-00004.safetensors",
728
+ "model.qwen2vl_vision_tower.blocks.4.norm2.bias": "model-00004-of-00004.safetensors",
729
+ "model.qwen2vl_vision_tower.blocks.4.norm2.weight": "model-00002-of-00004.safetensors",
730
+ "model.qwen2vl_vision_tower.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors",
731
+ "model.qwen2vl_vision_tower.blocks.5.attn.proj.weight": "model-00004-of-00004.safetensors",
732
+ "model.qwen2vl_vision_tower.blocks.5.attn.qkv.bias": "model-00002-of-00004.safetensors",
733
+ "model.qwen2vl_vision_tower.blocks.5.attn.qkv.weight": "model-00003-of-00004.safetensors",
734
+ "model.qwen2vl_vision_tower.blocks.5.mlp.fc1.bias": "model-00002-of-00004.safetensors",
735
+ "model.qwen2vl_vision_tower.blocks.5.mlp.fc1.weight": "model-00001-of-00004.safetensors",
736
+ "model.qwen2vl_vision_tower.blocks.5.mlp.fc2.bias": "model-00001-of-00004.safetensors",
737
+ "model.qwen2vl_vision_tower.blocks.5.mlp.fc2.weight": "model-00004-of-00004.safetensors",
738
+ "model.qwen2vl_vision_tower.blocks.5.norm1.bias": "model-00001-of-00004.safetensors",
739
+ "model.qwen2vl_vision_tower.blocks.5.norm1.weight": "model-00001-of-00004.safetensors",
740
+ "model.qwen2vl_vision_tower.blocks.5.norm2.bias": "model-00004-of-00004.safetensors",
741
+ "model.qwen2vl_vision_tower.blocks.5.norm2.weight": "model-00001-of-00004.safetensors",
742
+ "model.qwen2vl_vision_tower.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors",
743
+ "model.qwen2vl_vision_tower.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors",
744
+ "model.qwen2vl_vision_tower.blocks.6.attn.qkv.bias": "model-00002-of-00004.safetensors",
745
+ "model.qwen2vl_vision_tower.blocks.6.attn.qkv.weight": "model-00004-of-00004.safetensors",
746
+ "model.qwen2vl_vision_tower.blocks.6.mlp.fc1.bias": "model-00002-of-00004.safetensors",
747
+ "model.qwen2vl_vision_tower.blocks.6.mlp.fc1.weight": "model-00003-of-00004.safetensors",
748
+ "model.qwen2vl_vision_tower.blocks.6.mlp.fc2.bias": "model-00004-of-00004.safetensors",
749
+ "model.qwen2vl_vision_tower.blocks.6.mlp.fc2.weight": "model-00002-of-00004.safetensors",
750
+ "model.qwen2vl_vision_tower.blocks.6.norm1.bias": "model-00004-of-00004.safetensors",
751
+ "model.qwen2vl_vision_tower.blocks.6.norm1.weight": "model-00002-of-00004.safetensors",
752
+ "model.qwen2vl_vision_tower.blocks.6.norm2.bias": "model-00003-of-00004.safetensors",
753
+ "model.qwen2vl_vision_tower.blocks.6.norm2.weight": "model-00002-of-00004.safetensors",
754
+ "model.qwen2vl_vision_tower.blocks.7.attn.proj.bias": "model-00004-of-00004.safetensors",
755
+ "model.qwen2vl_vision_tower.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors",
756
+ "model.qwen2vl_vision_tower.blocks.7.attn.qkv.bias": "model-00004-of-00004.safetensors",
757
+ "model.qwen2vl_vision_tower.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
758
+ "model.qwen2vl_vision_tower.blocks.7.mlp.fc1.bias": "model-00003-of-00004.safetensors",
759
+ "model.qwen2vl_vision_tower.blocks.7.mlp.fc1.weight": "model-00003-of-00004.safetensors",
760
+ "model.qwen2vl_vision_tower.blocks.7.mlp.fc2.bias": "model-00002-of-00004.safetensors",
761
+ "model.qwen2vl_vision_tower.blocks.7.mlp.fc2.weight": "model-00002-of-00004.safetensors",
762
+ "model.qwen2vl_vision_tower.blocks.7.norm1.bias": "model-00003-of-00004.safetensors",
763
+ "model.qwen2vl_vision_tower.blocks.7.norm1.weight": "model-00002-of-00004.safetensors",
764
+ "model.qwen2vl_vision_tower.blocks.7.norm2.bias": "model-00003-of-00004.safetensors",
765
+ "model.qwen2vl_vision_tower.blocks.7.norm2.weight": "model-00002-of-00004.safetensors",
766
+ "model.qwen2vl_vision_tower.blocks.8.attn.proj.bias": "model-00003-of-00004.safetensors",
767
+ "model.qwen2vl_vision_tower.blocks.8.attn.proj.weight": "model-00003-of-00004.safetensors",
768
+ "model.qwen2vl_vision_tower.blocks.8.attn.qkv.bias": "model-00004-of-00004.safetensors",
769
+ "model.qwen2vl_vision_tower.blocks.8.attn.qkv.weight": "model-00004-of-00004.safetensors",
770
+ "model.qwen2vl_vision_tower.blocks.8.mlp.fc1.bias": "model-00003-of-00004.safetensors",
771
+ "model.qwen2vl_vision_tower.blocks.8.mlp.fc1.weight": "model-00003-of-00004.safetensors",
772
+ "model.qwen2vl_vision_tower.blocks.8.mlp.fc2.bias": "model-00002-of-00004.safetensors",
773
+ "model.qwen2vl_vision_tower.blocks.8.mlp.fc2.weight": "model-00001-of-00004.safetensors",
774
+ "model.qwen2vl_vision_tower.blocks.8.norm1.bias": "model-00004-of-00004.safetensors",
775
+ "model.qwen2vl_vision_tower.blocks.8.norm1.weight": "model-00003-of-00004.safetensors",
776
+ "model.qwen2vl_vision_tower.blocks.8.norm2.bias": "model-00004-of-00004.safetensors",
777
+ "model.qwen2vl_vision_tower.blocks.8.norm2.weight": "model-00003-of-00004.safetensors",
778
+ "model.qwen2vl_vision_tower.blocks.9.attn.proj.bias": "model-00002-of-00004.safetensors",
779
+ "model.qwen2vl_vision_tower.blocks.9.attn.proj.weight": "model-00002-of-00004.safetensors",
780
+ "model.qwen2vl_vision_tower.blocks.9.attn.qkv.bias": "model-00002-of-00004.safetensors",
781
+ "model.qwen2vl_vision_tower.blocks.9.attn.qkv.weight": "model-00002-of-00004.safetensors",
782
+ "model.qwen2vl_vision_tower.blocks.9.mlp.fc1.bias": "model-00001-of-00004.safetensors",
783
+ "model.qwen2vl_vision_tower.blocks.9.mlp.fc1.weight": "model-00001-of-00004.safetensors",
784
+ "model.qwen2vl_vision_tower.blocks.9.mlp.fc2.bias": "model-00002-of-00004.safetensors",
785
+ "model.qwen2vl_vision_tower.blocks.9.mlp.fc2.weight": "model-00001-of-00004.safetensors",
786
+ "model.qwen2vl_vision_tower.blocks.9.norm1.bias": "model-00002-of-00004.safetensors",
787
+ "model.qwen2vl_vision_tower.blocks.9.norm1.weight": "model-00001-of-00004.safetensors",
788
+ "model.qwen2vl_vision_tower.blocks.9.norm2.bias": "model-00002-of-00004.safetensors",
789
+ "model.qwen2vl_vision_tower.blocks.9.norm2.weight": "model-00003-of-00004.safetensors",
790
+ "model.qwen2vl_vision_tower.merger.ln_q.bias": "model-00004-of-00004.safetensors",
791
+ "model.qwen2vl_vision_tower.merger.ln_q.weight": "model-00003-of-00004.safetensors",
792
+ "model.qwen2vl_vision_tower.merger.mlp.0.bias": "model-00001-of-00004.safetensors",
793
+ "model.qwen2vl_vision_tower.merger.mlp.0.weight": "model-00003-of-00004.safetensors",
794
+ "model.qwen2vl_vision_tower.merger.mlp.2.bias": "model-00004-of-00004.safetensors",
795
+ "model.qwen2vl_vision_tower.merger.mlp.2.weight": "model-00001-of-00004.safetensors",
796
+ "model.qwen2vl_vision_tower.patch_embed.proj.weight": "model-00003-of-00004.safetensors"
797
+ }
798
+ }
modeling_projector.py CHANGED
@@ -1,3 +1,308 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad432832007fb6483072944a20a929927548d7b39eb49a9c9a9492fcffae233c
3
- size 13141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ from .utils import IMAGE_INDICATOR_IDS
5
+
6
+
7
def build_vision_projector(config, delay_load=False, **kwargs):
    """Instantiate the multimodal projector named by ``config.mm_projector_type``.

    Args:
        config: model config; must expose ``mm_hidden_size`` and ``hidden_size``
            plus any projector-specific optional attributes.
        delay_load: unused; kept for interface compatibility with callers.
        **kwargs: unused; kept for interface compatibility with callers.

    Returns:
        An ``nn.Module`` mapping vision-tower features to LLM embedding space.

    Raises:
        ValueError: for any unrecognized projector type (including the default
            ``'linear'``, which has no implementation in this module).
    """
    kind = getattr(config, 'mm_projector_type', 'linear')

    if kind == 'conv_adapter':
        return ConvAdapter(
            config.mm_hidden_size,
            config.hidden_size,
            getattr(config, "mlp_hidden_dim", None),
        )
    if kind == 'mlp_pixel_shuffle':
        return MlpPixelShuffle(
            config.mm_hidden_size,
            config.hidden_size,
            config.pixelshuffle_downsample_ratio,
            getattr(config, "mlp_hidden_dim", None),
        )
    if kind == 'ovis_conv_adapter':
        return OvisConvAdapter(
            config.mm_hidden_size,
            config.hidden_size,
            getattr(config, "mlp_hidden_dim", 32000),
            getattr(config, "tokenize_function", "softmax"),
        )
    if kind == 'ovis2_adapter':
        return Ovis2Adapter(
            config.mm_hidden_size,
            config.hidden_size,
            getattr(config, "mlp_hidden_dim", 66536),
            getattr(config, "hidden_stride", 2),
            getattr(config, "pooling_stride", 1),
            getattr(config, "tokenize_function", "softmax"),
        )
    if kind == 'ovis_conv_adapter_navit':
        # NOTE(review): the navit variant hard-codes its input width to 1280 —
        # confirm this matches the navit vision tower's hidden size.
        return OvisConvAdapterNavit(
            1280,
            config.hidden_size,
            getattr(config, "mlp_hidden_dim", 32000),
            getattr(config, "tokenize_function", "softmax"),
        )
    raise ValueError(f'Unknown projector type: {kind}')
24
+
25
+
26
class ConvAdapter(nn.Module):
    """Projector that maps ViT features to the LLM width with a two-layer MLP,
    then halves the spatial token grid with a stride-2 3x3 convolution."""

    def __init__(self, dim_in, dim_out, mlp_hidden_dim=None):
        super().__init__()
        self.mm_projector_type = 'conv_adapter'
        # The MLP's hidden width defaults to the output width when not given.
        hidden = dim_out if mlp_hidden_dim is None else mlp_hidden_dim
        self.mlp = nn.Sequential(
            nn.Linear(dim_in, hidden),
            nn.GELU(),
            nn.Linear(hidden, dim_out),
        )
        self.conv = nn.Conv2d(dim_out, dim_out, kernel_size=(3, 3), stride=(2, 2), padding=1)

    def forward(self, x):
        """Project image features.

        Args:
            x (torch.Tensor): image features of shape (F, v, D).
                NOTE(review): assumes v = s*s + 1 with a leading CLS token —
                confirm against the vision tower.

        Returns:
            torch.Tensor of shape (F, n, dim_out), where n is the reduced
            token count after the stride-2 convolution.
        """
        projected = self.mlp(x)

        frames, tokens, channels = projected.shape
        side = int(math.sqrt(tokens - 1))
        grid = projected[:, 1:, :]  # drop the CLS token
        grid = grid.reshape(frames, side, side, channels).permute([0, 3, 1, 2])
        grid = self.conv(grid)
        return grid.permute([0, 2, 3, 1]).reshape(frames, -1, channels)
61
+
62
+
63
class MlpPixelShuffle(nn.Module):
    """Projector that reduces tokens via pixel shuffle (space-to-depth) and
    then maps the stacked channels to the LLM width with a two-layer MLP."""

    def __init__(self, dim_in, dim_out, pixelshuffle_downsample_ratio, mlp_hidden_dim=None):
        super().__init__()
        self.mm_projector_type = 'mlp_pixel_shuffle'
        # After shuffling, each token carries ratio**2 stacked feature vectors.
        in_features = int(dim_in * (pixelshuffle_downsample_ratio ** 2))
        hidden = dim_out if mlp_hidden_dim is None else mlp_hidden_dim
        self.mlp = nn.Sequential(
            nn.Linear(in_features, hidden),
            nn.GELU(),
            nn.Linear(hidden, dim_out),
        )
        self.scale_factor = pixelshuffle_downsample_ratio

    def pixel_shuffle(self, x, scale_factor=2):
        """Space-to-depth: (N, W, H, C) -> (N, W/s, H/s, C*s*s).

        `scale_factor` is expected to be an integer ratio.
        """
        n, w, h, c = x.size()
        # Fold `scale_factor` columns into channels: (N, W, H/s, C*s).
        x = x.view(n, w, int(h / scale_factor), int(c * scale_factor))
        x = x.permute(0, 2, 1, 3).contiguous()
        # Fold rows the same way: (N, H/s, W/s, C*s*s).
        x = x.view(n, int(h / scale_factor), int(w / scale_factor),
                   int(c * (scale_factor * scale_factor)))
        return x.permute(0, 2, 1, 3).contiguous()

    def forward(self, x):
        """Project image features.

        Args:
            x (torch.Tensor): image features of shape (F, v, D).
                NOTE(review): assumes v = s*s + 1 with a leading CLS token —
                confirm against the vision tower.

        Returns:
            torch.Tensor of shape (F, n, dim_out), where
            n = (v - 1) / scale_factor**2.
        """
        tokens = x[:, 1:, :]  # drop the CLS token
        side = int(tokens.shape[1] ** 0.5)
        grid = tokens.view(tokens.shape[0], side, side, -1)
        shuffled = self.pixel_shuffle(grid, self.scale_factor)
        projected = self.mlp(shuffled)
        return projected.view(projected.shape[0], -1, projected.shape[-1])
112
+
113
+
114
class OvisConvAdapter(nn.Module):
    """Ovis-style projector: a stride-2 conv reduces the token grid, a linear
    head maps features to a "visual vocabulary" distribution, and the output
    embeddings are read out as a soft mixture over that vocabulary."""

    def __init__(self, dim_in, dim_out, vocab_size, tokenize_function="softmax", tau=1.0):
        super().__init__()
        self.mm_projector_type = 'ovis_conv_adapter'
        self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), padding=1)
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(dim_in, vocab_size, bias=False),
            torch.nn.LayerNorm(vocab_size)
        )
        self.embedding = torch.nn.Embedding(vocab_size, dim_out)
        self.tokenize_function = tokenize_function
        # Temperature for the gumbel_argmax tokenizer. BUGFIX: `tokenize`
        # previously read `self.config.tau` / `self.config.tokenize_function`,
        # but this module has no `config` attribute, so those branches raised
        # AttributeError. The temperature is now a real (optional) parameter.
        self.tau = tau

    def tokenize(self, logits):
        """Turn vocabulary logits into a (soft or hard) one-hot distribution."""
        def st_argmax(y_soft, dim):  # straight-through softmax
            index = y_soft.max(dim, keepdim=True)[1]
            y_hard = torch.zeros_like(y_soft, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
            # Forward pass is the hard one-hot; gradients flow through y_soft.
            ret = y_hard - y_soft.detach() + y_soft
            return ret

        if self.tokenize_function == 'softmax':
            tokens = torch.nn.functional.softmax(logits, dim=-1)
        elif self.tokenize_function == 'gumbel_argmax':
            tokens = torch.nn.functional.gumbel_softmax(logits, tau=self.tau, hard=True)
        elif self.tokenize_function == 'st_argmax':
            tokens = st_argmax(logits, dim=-1)
        else:
            raise ValueError(
                'Invalid `tokenize_function`, expected softmax or gumbel_argmax or st_argmax,'
                f' but got {self.tokenize_function}'
            )
        return tokens

    def forward(self, x):
        """Project image features.

        Args:
            x (torch.Tensor): image features of shape (F, v, D).
                NOTE(review): assumes v = s*s + 1 with a leading CLS token —
                confirm against the vision tower.

        Returns:
            torch.Tensor of shape (F, n, dim_out), where n is the reduced
            token count after the stride-2 convolution.
        """
        # conv: reshape to a 2D grid and halve the spatial resolution
        f, v, d = x.shape
        s = int(math.sqrt(v - 1))
        x = x[:, 1:, :]  # remove cls_token
        x = x.reshape(f, s, s, d).permute([0, 3, 1, 2])
        x = self.conv(x)
        x = x.permute([0, 2, 3, 1]).reshape(f, -1, d)

        # tokenize: produce a distribution over the visual vocabulary
        logits = self.mlp(x)
        visual_tokens = self.tokenize(logits)

        # get embeddings: expectation of the embedding table under the tokens
        out = torch.matmul(visual_tokens, self.embedding.weight)

        return out
170
+
171
+
172
class Ovis2Adapter(nn.Module):
    """Ovis2 projector: space-to-depth token merging, a linear head producing
    a visual-vocabulary distribution, a soft embedding lookup, and optional
    average pooling of the resulting token grid.

    The last ``len(IMAGE_INDICATOR_IDS)`` vocabulary slots are reserved for
    indicator tokens; the head does not predict them and they receive zero
    probability in `forward`.
    """

    def __init__(self, dim_in, dim_out, vocab_size, hidden_stride=2, pooling_stride=1,
                 tokenize_function="softmax", tau=1.0):
        super().__init__()
        # The head only predicts the "real" visual tokens, not the indicators.
        head_dim = vocab_size - len(IMAGE_INDICATOR_IDS)
        self.mm_projector_type = 'ovis2_adapter'
        self.hidden_stride = hidden_stride
        self.tokenize_function = tokenize_function
        self.head = torch.nn.Sequential(
            torch.nn.Linear(
                dim_in * self.hidden_stride * self.hidden_stride, head_dim,
                bias=False
            ),
            torch.nn.LayerNorm(head_dim)
        )
        self.embedding = torch.nn.Embedding(vocab_size, dim_out)
        self.pool_s = pooling_stride
        # Temperature for the gumbel_argmax tokenizer. BUGFIX: `tokenize`
        # previously read `self.config.tau` / `self.config.tokenize_function`,
        # but this module has no `config` attribute, so those branches raised
        # AttributeError. The temperature is now a real (optional) parameter.
        self.tau = tau
        print("pooling_stride: ", pooling_stride)

    def encode(self, features):
        """Merge `hidden_stride**2` neighboring hidden states into one token.

        For hidden_stride=2 this reduces the token length 4x (e.g. 1024 -> 256
        for aimv2). Input is (n, l, d) with a leading CLS token; the grid side
        is padded up to a multiple of `hidden_stride` before merging.
        """
        features = features[:, 1:, :]  # remove cls_token
        if self.hidden_stride > 1:
            n, l, d = features.shape  # this `d` may differ from the caller's `d`
            sqrt_l = int(l ** 0.5)
            assert sqrt_l ** 2 == l, "The token sequence length should be a perfect square."
            features = features.reshape(n, sqrt_l, sqrt_l, d)
            # Pad the grid so its side is divisible by hidden_stride.
            pl = (self.hidden_stride - (sqrt_l % self.hidden_stride)) % self.hidden_stride
            features = torch.nn.functional.pad(features, (0, 0, 0, pl, 0, pl), "constant", 0)
            sqrt_l += pl
            features = features.reshape(n, sqrt_l // self.hidden_stride, self.hidden_stride,
                                        sqrt_l // self.hidden_stride, self.hidden_stride, d)
            features = features.permute(0, 1, 3, 2, 4, 5)  # [n, sqrt_l/hs, sqrt_l/hs, hs, hs, d]
            features = features.flatten(3)  # [n, sqrt_l/hs, sqrt_l/hs, hs*hs*d]
            features = features.reshape(
                n, -1, self.hidden_stride * self.hidden_stride * d)

        return features

    def tokenize(self, logits):
        """Turn vocabulary logits into a (soft or hard) one-hot distribution."""
        def st_argmax(y_soft, dim):  # straight-through softmax
            index = y_soft.max(dim, keepdim=True)[1]
            y_hard = torch.zeros_like(y_soft, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
            # Forward pass is the hard one-hot; gradients flow through y_soft.
            ret = y_hard - y_soft.detach() + y_soft
            return ret

        if self.tokenize_function == 'softmax':
            tokens = torch.nn.functional.softmax(logits, dim=-1)
        elif self.tokenize_function == 'gumbel_argmax':
            tokens = torch.nn.functional.gumbel_softmax(logits, tau=self.tau, hard=True)
        elif self.tokenize_function == 'st_argmax':
            tokens = st_argmax(logits, dim=-1)
        else:
            raise ValueError(
                f'Invalid `tokenize_function`, expected softmax or gumbel_argmax or st_argmax, but got {self.tokenize_function}')
        return tokens

    def forward(self, x):
        """Project image features.

        Args:
            x (torch.Tensor): image features of shape (F, v, D).
                NOTE(review): assumes v = s*s + 1 with a leading CLS token —
                confirm against the vision tower.

        Returns:
            torch.Tensor of shape (F, n, dim_out), where n is the reduced
            token count after merging (and optional pooling).
        """
        # space-to-depth token merge
        x = self.encode(x)
        # tokenize: distribution over the "real" visual vocabulary
        logits = self.head(x)
        visual_tokens = self.tokenize(logits)
        # Append zero probability for the reserved indicator-token slots so
        # the distribution matches the full embedding table width.
        batch_size, token_len, _ = visual_tokens.shape
        padding_tensor = torch.zeros(size=(batch_size, token_len, len(IMAGE_INDICATOR_IDS)),
                                     dtype=visual_tokens.dtype,
                                     device=visual_tokens.device,
                                     layout=visual_tokens.layout,
                                     requires_grad=False)
        visual_tokens = torch.cat([visual_tokens, padding_tensor], dim=2)
        # get embeddings: expectation of the embedding table under the tokens
        out = torch.matmul(visual_tokens, self.embedding.weight)

        # Optional average pooling over pool_s x pool_s token neighborhoods.
        if self.pool_s > 1:
            f, v, d = out.shape
            s = int(math.sqrt(v))
            out = out.reshape(f, s, s, d)
            out = out.reshape(f, s // self.pool_s, self.pool_s, s // self.pool_s, self.pool_s, d)
            out = out.permute([0, 1, 3, 5, 2, 4]).reshape(f, s // self.pool_s * s // self.pool_s, d, -1).mean(-1)
        return out
261
+
262
class OvisConvAdapterNavit(nn.Module):
    """Navit variant of the Ovis projector: merges each 2x2 patch group with a
    stride-2 conv, maps features to a visual-vocabulary distribution, and reads
    out embeddings as a soft mixture over that vocabulary."""

    def __init__(self, dim_in, dim_out, vocab_size, tokenize_function="softmax", tau=1.0):
        super().__init__()
        self.mm_projector_type = 'ovis_conv_adapter_navit'
        self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=(2, 2), stride=(2, 2))
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(dim_in, vocab_size, bias=False),
            torch.nn.LayerNorm(vocab_size)
        )
        self.embedding = torch.nn.Embedding(vocab_size, dim_out)
        self.tokenize_function = tokenize_function
        # Temperature for the gumbel_argmax tokenizer. BUGFIX: `tokenize`
        # previously read `self.config.tau` / `self.config.tokenize_function`,
        # but this module has no `config` attribute, so those branches raised
        # AttributeError. The temperature is now a real (optional) parameter.
        self.tau = tau

    def tokenize(self, logits):
        """Turn vocabulary logits into a (soft or hard) one-hot distribution."""
        def st_argmax(y_soft, dim):  # straight-through softmax
            index = y_soft.max(dim, keepdim=True)[1]
            y_hard = torch.zeros_like(y_soft, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
            # Forward pass is the hard one-hot; gradients flow through y_soft.
            ret = y_hard - y_soft.detach() + y_soft
            return ret

        if self.tokenize_function == 'softmax':
            tokens = torch.nn.functional.softmax(logits, dim=-1)
        elif self.tokenize_function == 'gumbel_argmax':
            tokens = torch.nn.functional.gumbel_softmax(logits, tau=self.tau, hard=True)
        elif self.tokenize_function == 'st_argmax':
            tokens = st_argmax(logits, dim=-1)
        else:
            raise ValueError(
                f'Invalid `tokenize_function`, expected softmax or gumbel_argmax or st_argmax, but got {self.tokenize_function}')
        return tokens

    def forward(self, x):
        """Project navit image features.

        Args:
            x (torch.Tensor): image features of navit, shape (v, D).
                NOTE(review): assumes v is a multiple of 4 and that
                consecutive groups of 4 tokens form 2x2 patches — confirm
                against the navit vision tower's token layout.

        Returns:
            torch.Tensor of shape (v / 4, dim_out).
        """
        # conv: merge each 2x2 group of tokens into one
        _, d = x.shape
        x = x.reshape(-1, 2, 2, d).permute([0, 3, 1, 2])
        x = self.conv(x)
        x = x.permute([0, 2, 3, 1]).reshape(-1, d)
        # tokenize: distribution over the visual vocabulary
        logits = self.mlp(x)
        visual_tokens = self.tokenize(logits)
        # get embeddings: expectation of the embedding table under the tokens
        out = torch.matmul(visual_tokens, self.embedding.weight)

        return out
modeling_valley.py CHANGED
@@ -589,6 +589,7 @@ class ValleyQwen3ForCausalLM(Qwen3ForCausalLM, ValleyMetaForCausalLM):
589
  shift_labels = shift_labels.to(shift_logits.device)
590
  loss = torch.stack([loss_fct(shift_logits[i], shift_labels[i]) for i in range(bs)])
591
 
 
592
  if not return_dict:
593
  output = (logits,) + outputs[1:]
594
  return (loss,) + output if loss is not None else output
 
589
  shift_labels = shift_labels.to(shift_logits.device)
590
  loss = torch.stack([loss_fct(shift_logits[i], shift_labels[i]) for i in range(bs)])
591
 
592
+
593
  if not return_dict:
594
  output = (logits,) + outputs[1:]
595
  return (loss,) + output if loss is not None else output
modeling_vision_tower.py CHANGED
@@ -1,3 +1,323 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:73071f97b2ad6a714bd77d272d1df0491ba96eeb33faaef3131748b0dc8e8dd3
3
- size 13063
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VisionTransformerPretrainedModel
4
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VisionTransformerPretrainedModel
5
+ from transformers import PretrainedConfig
6
+
7
# Frozen vision-backbone configs, inlined so loading the checkpoint does not
# require fetching the upstream config files. The values must stay in sync
# with the pretrained weights shipped in this repo — edit with care.

# SigLip SO400M, 384px input / patch size 14.
siglip_config = PretrainedConfig.from_dict(
    {
        "attention_dropout": 0.0,
        "hidden_act": "gelu_pytorch_tanh",
        "hidden_size": 1152,
        "image_size": 384,
        "intermediate_size": 4304,
        "layer_norm_eps": 1e-06,
        "model_type": "siglip_vision_model",
        "num_attention_heads": 16,
        "num_channels": 3,
        "num_hidden_layers": 27,
        "patch_size": 14,
    }
)

# Qwen2-VL vision transformer (NaViT-style, variable resolution).
# hidden_size here is the LLM dim (3584) the merger projects into.
qwen2vl_vit_config = PretrainedConfig.from_dict(
    {
        "depth": 32,
        "embed_dim": 1280,
        "hidden_act": "quick_gelu",
        "hidden_size": 3584,
        "in_channels": 3,
        "in_chans": 3,
        "mlp_ratio": 4,
        "model_type": "qwen2_vl",
        "num_heads": 16,
        "patch_size": 14,
        "spatial_merge_size": 2,
        "spatial_patch_size": 14,
        "temporal_patch_size": 2,
        # Default attention backend; may be overridden at build time via
        # vision_tower_cfg._vit_attn_implementation.
        "_attn_implementation": "flash_attention_2",
        "_attn_implementation_internal": "flash_attention_2"
    }
)

# Qwen2.5-VL vision transformer (windowed attention; full attention only at
# the listed block indexes).
qwen2_5vl_vit_config = PretrainedConfig.from_dict(
    {
        "depth": 32,
        "hidden_act": "silu",
        "hidden_size": 1280,
        "intermediate_size": 3420,
        "num_heads": 16,
        "in_chans": 3,
        "out_hidden_size": 3584,
        "patch_size": 14,
        "spatial_merge_size": 2,
        "spatial_patch_size": 14,
        "window_size": 112,
        "fullatt_block_indexes": [
            7,
            15,
            23,
            31
        ],
        "tokens_per_second": 2,
        "temporal_patch_size": 2
    }
)

# AIMv2 encoder config (remote-code model; auto_map points at the custom
# implementation shipped with the checkpoint).
aimv2_config = PretrainedConfig.from_dict(
    {
        "hidden_size": 1024,
        "image_size": 448,
        "intermediate_size": 2816,
        "model_type": "aimv2",
        "num_attention_heads": 8,
        "num_channels": 3,
        "num_hidden_layers": 24,
        "patch_size": 14,
        "projection_dropout": 0.0,
        "qkv_bias": False,
        "rms_norm_eps": 1e-05,
        "torch_dtype": "float32",
        "transformers_version": "4.46.3",
        "auto_map": {
            "AutoConfig": "configuration_aimv2.AIMv2Config",
            "AutoModel": "modeling_aimv2.AIMv2Model",
        },
    }
)
88
+
89
def wrapped_qwen2vl_vision_tower(vision_tower_cfg, qwen2vl_vision_tower):
    """Adjust the Qwen2-VL tower's patch merger to match this model's config.

    Two cases:
      * only_navit + navit_use_mm_projector: the merger is bypassed entirely
        (replaced by Identity) because projection happens in the external
        mm_projector instead.
      * otherwise: if the stock merger's output dim does not match the LLM
        hidden size, or a custom hidden dim is requested, swap in a freshly
        initialized CustomPatchMerger (random weights; expected to be filled
        from the checkpoint's state_dict afterwards — TODO confirm).

    Returns the (possibly modified) tower.
    """
    if getattr(vision_tower_cfg, "only_navit", False) and \
        getattr(vision_tower_cfg, "navit_use_mm_projector", False):
        # NOTE(review): the message below is misleading — this branch removes
        # the merger rather than initializing a new one.
        qwen2vl_vision_tower.merger = torch.nn.Identity()
        print("navit_use_mm_projector is NOT None, so we need to initialize a new merger...")

    else:
        # Last linear of the stock merger: 5120 -> 3584 (3584 is the LLM dim,
        # 5120 the merger hidden dim).
        old_linear = qwen2vl_vision_tower.merger.mlp[-1]
        navit_merger_hidden_dim = getattr(vision_tower_cfg, "navit_merger_hidden_dim", None)

        # rule1: merger output dim does not match the target LLM hidden size.
        rule1 = old_linear.out_features != vision_tower_cfg.hidden_size
        # rule2: an explicit hidden dim was configured and differs from stock.
        rule2 = navit_merger_hidden_dim is not None and navit_merger_hidden_dim != old_linear.in_features

        if rule1 or rule2:
            del qwen2vl_vision_tower.merger
            qwen2vl_vision_tower.merger = CustomPatchMerger(
                dim=vision_tower_cfg.hidden_size,  # output_dim of merger, also the dim of LLM
                context_dim=1280,  # ViT hidden dim; merger input is 1280*4=5120 (2x2 pixel shuffle)
                hidden_dim=navit_merger_hidden_dim if navit_merger_hidden_dim is not None else old_linear.in_features
            )
            print("output_dim of original merger is not match or navit_merger_hidden_dim is not match, we need to initialize a new merger...")

    return qwen2vl_vision_tower
112
+
113
def build_vision_tower(vision_tower_cfg, **kwargs):
    """Factory for the vision backbone(s) selected by the config.

    Return conventions (callers must handle both):
      * siglip/Oryx name WITH `eagle_vision_tower` set:
          - only_navit: (None, qwen_vit) — only the NaViT tower is used;
          - otherwise: (SigLipVisionTower, qwen_vit) pair.
      * siglip/Oryx name WITHOUT eagle tower: a single SigLipVisionTower.
      * aimv2 / Ovis2 visual names: a single AIMv2VisionTower.

    Raises ValueError for any unrecognized tower name.
    """
    # The config may carry the name under either attribute.
    vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None))
    if "siglip-so400m-patch14-384" in vision_tower or "Oryx-ViT" in vision_tower or "navit" in vision_tower.lower():
        # if 'navit' in vision_tower, vision_tower_cfg.eagle_vision_tower is not None and vision_tower_cfg.only_navit is True
        if "navit" in vision_tower.lower():
            assert getattr(vision_tower_cfg, "only_navit", False) and \
                getattr(vision_tower_cfg, "eagle_vision_tower", None) is not None

        if getattr(vision_tower_cfg, "eagle_vision_tower", None) is not None:
            # Pick the Qwen ViT variant; optionally override the attention
            # backend before instantiation (flash-attn vs sdpa/eager).
            if "Qwen2.5-VL" in vision_tower_cfg.eagle_vision_tower:
                if getattr(vision_tower_cfg, "_vit_attn_implementation", None) is not None:
                    qwen2_5vl_vit_config._attn_implementation = vision_tower_cfg._vit_attn_implementation
                    qwen2_5vl_vit_config._attn_implementation_internal = vision_tower_cfg._vit_attn_implementation
                qwen2vl_vision_tower = Qwen2_5_VisionTransformerPretrainedModel._from_config(qwen2_5vl_vit_config)
            elif "Qwen2-VL" in vision_tower_cfg.eagle_vision_tower:
                if getattr(vision_tower_cfg, "_vit_attn_implementation", None) is not None:
                    qwen2vl_vit_config._attn_implementation = vision_tower_cfg._vit_attn_implementation
                    qwen2vl_vit_config._attn_implementation_internal = vision_tower_cfg._vit_attn_implementation
                qwen2vl_vision_tower = Qwen2VisionTransformerPretrainedModel._from_config(qwen2vl_vit_config)
            else:
                raise ValueError(f"Unknown vision tower: {vision_tower_cfg.eagle_vision_tower}")

            # Adapt the merger to the configured dims, then freeze the tower.
            qwen2vl_vision_tower = wrapped_qwen2vl_vision_tower(vision_tower_cfg, qwen2vl_vision_tower)
            qwen2vl_vision_tower.requires_grad_(False)
            if getattr(vision_tower_cfg, "only_navit", False):
                return None, qwen2vl_vision_tower
            else:
                siglip_vision_tower = SigLipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
                return siglip_vision_tower, qwen2vl_vision_tower

        # only return siglip vision tower if eagle vision tower is None
        else:
            return SigLipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
    elif "aimv2-huge-patch14-448" in vision_tower or "Ovis2-8B-visual" in vision_tower:
        return AIMv2VisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
    elif "aimv2-large-patch14-448" in vision_tower or "Ovis2-2B-visual" in vision_tower:
        return AIMv2VisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
    else:
        raise ValueError(f"Unknown vision tower: {vision_tower}")
152
+
153
class SigLipVisionTower(nn.Module):
    """Wrapper around a SigLip vision encoder built from the inlined
    `siglip_config`. The architecture is instantiated from config only;
    weights are expected to arrive via the enclosing model's state_dict.
    """

    def __init__(self, vision_tower, args, delay_load=False, cache_dir="./cache_dir"):
        super().__init__()
        self.is_loaded = False
        self.image_tower_name = vision_tower
        # NOTE(review): select_layer is stored but never read in this class.
        self.select_layer = args.mm_vision_select_layer
        # NOTE(review): default is "patch", but feature_select() asserts
        # "cls_patch" — configs are presumably expected to set it; confirm.
        self.select_feature = getattr(args, "mm_vision_select_feature", "patch")
        self.cache_dir = cache_dir

        if not delay_load:
            self.load_model()
        else:
            from transformers import SiglipVisionModel
            self.cfg_only = siglip_config
            # delay_load still builds the (randomly initialized) architecture
            # so the module tree exists; is_loaded remains False.
            self.vision_tower = SiglipVisionModel._from_config(siglip_config)  # dummy-load

    def load_model(self):
        """Instantiate the SigLip encoder from config and freeze it."""
        from transformers import SiglipVisionModel
        self.vision_tower = SiglipVisionModel._from_config(siglip_config)
        self.vision_tower.requires_grad_(False)
        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Prepend a copy of the first token to the sequence, producing a
        cls+patch layout from the encoder's last hidden state."""
        assert self.select_feature == "cls_patch"
        image_features = torch.cat([image_forward_outs[:, :1, :], image_forward_outs], dim=1)
        return image_features

    def forward(self, images):
        """Encode a batched tensor, or a list of per-image tensors one by one.

        Returns features on the input dtype; shape (B, 1 + num_patches, C)
        for the batched path, or a list of (1, 1 + num_patches, C).
        """
        if type(images) is list:
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(
                    image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
                    output_hidden_states=True,
                    return_dict=True,
                )
                image_feature = self.feature_select(image_forward_out.last_hidden_state).to(image.dtype)
                image_features.append(image_feature)
        else:
            image_forward_outs = self.vision_tower(
                images.to(device=self.device, dtype=self.dtype),
                output_hidden_states=True,
                return_dict=True,
            )
            image_features = self.feature_select(image_forward_outs.last_hidden_state).to(images.dtype)

        return image_features

    @property
    def dummy_feature(self):
        # Zero feature used as a placeholder for text-only batches.
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Real config once loaded, otherwise the static fallback.
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches(self):
        # (384 / 14)^2 patches per image for the inlined config.
        return (self.config.image_size // self.config.patch_size) ** 2
227
+
228
+
229
class CustomPatchMerger(nn.Module):
    """Replacement patch merger for the Qwen2-VL tower.

    Normalizes per-patch features, concatenates each spatial_merge_size^2
    group into one vector, and projects it to the LLM hidden size through a
    two-layer GELU MLP. Submodule names (ln_q, mlp) match the stock merger so
    checkpoint state_dicts load unchanged.
    """

    def __init__(self, dim: int, context_dim: int, hidden_dim: int, spatial_merge_size: int = 2) -> None:
        super().__init__()
        group = spatial_merge_size ** 2
        self.input_dim = context_dim * group
        self.ln_q = nn.LayerNorm(context_dim, eps=1e-6)
        self.mlp = nn.Sequential(
            nn.Linear(self.input_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, dim),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        normed = self.ln_q(x)
        merged = normed.view(-1, self.input_dim)
        return self.mlp(merged)
243
+
244
+
245
+
246
+
247
class AIMv2VisionTower(nn.Module):
    """Wrapper around an AIMv2 encoder loaded via trust_remote_code.

    Mirrors SigLipVisionTower's interface (forward / feature_select /
    properties) so the two are interchangeable in build_vision_tower.
    """

    def __init__(self, vision_tower, args, delay_load=False, cache_dir='./cache_dir'):
        super().__init__()

        self.is_loaded = False

        self.image_tower_name = vision_tower
        # NOTE(review): select_layer is stored but never read in this class.
        self.select_layer = args.mm_vision_select_layer
        # NOTE(review): default "patch" conflicts with the "cls_patch" assert
        # in feature_select — presumably configs always set it; confirm.
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        self.cache_dir = cache_dir
        if not delay_load:
            self.load_model()
        else:
            from transformers import AutoConfig, AutoModel
            # NOTE(review): AutoModel._from_config on a remote-code config may
            # require the custom class to be importable — TODO confirm this
            # delay-load path is exercised.
            self.cfg_only = aimv2_config
            self.vision_tower = AutoModel._from_config(aimv2_config)  # dummy-load


    def load_model(self):
        """Download processor + weights from the hub (remote code) and freeze."""
        from transformers import AutoConfig, AutoModel, AutoProcessor
        self.image_processor = AutoProcessor.from_pretrained(self.image_tower_name, trust_remote_code=True)
        self.vision_tower = AutoModel.from_pretrained(self.image_tower_name, trust_remote_code=True)
        self.vision_tower.requires_grad_(False)
        # Normalize crop_size to a plain int (the 448 shortest edge).
        self.image_processor.crop_size = self.image_processor.size["shortest_edge"]

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Prepend a copy of the first token, yielding a cls+patch layout."""
        assert self.select_feature == 'cls_patch'
        image_features = torch.cat([image_forward_outs[:, :1, :], image_forward_outs], dim=1)
        return image_features

    def forward(self, images):
        """Encode a batched tensor, or a list of per-image tensors one by one.

        Returns features cast back to the input dtype.
        """
        if type(images) is list:
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True
                                                      ,return_dict=True,)
                image_feature = self.feature_select(image_forward_out.last_hidden_state).to(image.dtype)
                image_features.append(image_feature)
        else:
            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True
                                                   ,return_dict=True,)
            image_features = self.feature_select(image_forward_outs.last_hidden_state).to(images.dtype)

        return image_features

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Real config once loaded, otherwise the static fallback.
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches(self):
        # (448 / 14)^2 patches per image for the inlined config.
        return (self.config.image_size // self.config.patch_size) ** 2
preprocessor_config.json CHANGED
@@ -1,3 +1,6 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:580004e4d551f368ffe2173ef7da9a20c99d5d5fb4145d8f07a02d5ae6ef5ffa
3
- size 131
 
 
 
 
1
+ {
2
+ "processor_class": "ValleyProcessor",
3
+ "auto_map": {
4
+ "AutoProcessor": "processing_valley.ValleyProcessor"
5
+ }
6
+ }
processing_valley.py CHANGED
@@ -1,3 +1,618 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:211d2edfcd4c52e98f5192298409adfa77d6ae03a969b97112f6214d0b538600
3
- size 25837
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import types
3
+ import io
4
+ import torch
5
+ import os
6
+ from PIL import Image
7
+ import argparse
8
+ from qwen_vl_utils import fetch_image
9
+
10
+ from transformers import (
11
+ ProcessorMixin,
12
+ SiglipImageProcessor,
13
+ BatchFeature,
14
+ Qwen2VLImageProcessor,
15
+ PreTrainedTokenizer,
16
+ AutoImageProcessor,
17
+ CLIPImageProcessor,
18
+ )
19
+
20
+ from .utils import (
21
+ process_anyres_image,
22
+ preprocess_image_ovis,
23
+ ovis_template_process,
24
+ BLACK_IMG_ENV,
25
+ DEFAULT_IM_END_TOKEN,
26
+ DEFAULT_IM_START_TOKEN,
27
+ DEFAULT_IMAGE_TOKEN,
28
+ DEFAULT_VI_END_TOKEN,
29
+ DEFAULT_VI_START_TOKEN,
30
+ DEFAULT_VIDEO_TOKEN,
31
+ IMAGE_TOKEN_INDEX,
32
+ SEQ_MAX_LEN,
33
+ IGNORE_INDEX,
34
+ )
35
+
36
# Inlined image-processor configs, so the processor works without fetching
# separate preprocessor_config.json files for each backbone.

# SigLip: fixed 384x384 resize, mean/std 0.5 normalization.
siglip_processor_config = {
    "do_normalize": True,
    "do_rescale": True,
    "do_resize": True,
    "image_mean": [
        0.5,
        0.5,
        0.5
    ],
    "image_processor_type": "SiglipImageProcessor",
    "image_std": [
        0.5,
        0.5,
        0.5
    ],
    "processor_class": "SiglipProcessor",
    "resample": 3,
    "rescale_factor": 0.00392156862745098,  # 1/255
    "size": {
        "height": 384,
        "width": 384
    }
}

# Qwen2-VL: variable resolution between min/max pixel budget, CLIP mean/std.
qwen2vl_processor_config = {
    "min_pixels": 3136,
    "max_pixels": 12845056,
    "patch_size": 14,
    "temporal_patch_size": 2,
    "merge_size": 2,
    "image_mean": [
        0.48145466,
        0.4578275,
        0.40821073
    ],
    "image_std": [
        0.26862954,
        0.26130258,
        0.27577711
    ],
    "image_processor_type": "Qwen2VLImageProcessor",
    "processor_class": "Qwen2VLProcessor"
}

# AIMv2: CLIP-style 448 shortest-edge resize + center crop.
aimv2_processor_config = {
    "crop_size": {
        "height": 448,
        "width": 448
    },
    "do_center_crop": True,
    "do_convert_rgb": True,
    "do_normalize": True,
    "do_rescale": True,
    "do_resize": True,
    "image_mean": [
        0.48145466,
        0.4578275,
        0.40821073
    ],
    "image_processor_type": "CLIPImageProcessor",
    "image_std": [
        0.26862954,
        0.26130258,
        0.27577711
    ],
    "resample": 3,
    "rescale_factor": 0.00392156862745098,  # 1/255
    "size": {
        "shortest_edge": 448
    }
}
107
+
108
+
109
class ValleyProcessor(ProcessorMixin):
    """Processor for the Valley multimodal model: tokenizes conversations and
    preprocesses images for the SigLip, Qwen2-VL (NaViT) and AIMv2 towers.
    """
    attributes = ["tokenizer"]
    # Config knobs persisted with the processor (see __init__ defaults).
    optional_attributes = [
        "max_pixels",
        "min_pixels",
        "anyres",
        "only_crop_single_image",
        "grid_pinpoints",
        "use_special_start_end_token",
        "only_navit",
        "chat_template",
        "process_mode",
    ]
    tokenizer_class = "AutoTokenizer"

    def __init__(self, tokenizer=None, chat_template=None, **kwargs):
        """Build the three backbone-specific image processors from the inlined
        configs and read processing options from kwargs.

        Args:
            tokenizer: text tokenizer (AutoTokenizer-compatible).
            chat_template: optional chat template forwarded to ProcessorMixin.
        """
        super().__init__(tokenizer=tokenizer, chat_template=chat_template, **kwargs)
        self.black_img = BLACK_IMG_ENV
        self.siglip_image_processor = SiglipImageProcessor.from_dict(siglip_processor_config)
        self.qwen2vl_image_processor = Qwen2VLImageProcessor.from_dict(qwen2vl_processor_config)
        self.aimv2_image_processor = CLIPImageProcessor.from_dict(aimv2_processor_config)
        # anyres: tile large images into multiple crops (LLaVA-anyres style).
        self.anyres = kwargs.get("anyres", True)
        self.grid_pinpoints = kwargs.get("grid_pinpoints", "(1x1),...,(3x3)")
        # Only apply anyres tiling when the sample has a single image.
        self.only_crop_single_image = kwargs.get("only_crop_single_image", True)
        self.use_special_start_end_token = kwargs.get("use_special_start_end_token", True)
        self.only_navit = kwargs.get("only_navit", False)
        # Selects the conversation-formatting routine in __call__.
        self.process_mode = kwargs.get("process_mode", "qwen3")

        self.aimv2_crop_size = self.aimv2_image_processor.size["shortest_edge"]
138
+
139
+
140
    def preprocess_images_siglip(self, images) -> torch.FloatTensor:
        """Preprocess images for the SigLip tower.

        Args:
            images: list of file paths, PIL images, or raw bytes (homogeneous;
                the first element's type decides the branch).
        Returns:
            anyres=False: a single stacked tensor (N, C, H, W);
            anyres=True: a list of tensors, one per image, each stacking that
            image's crops along dim 0.
        """
        if isinstance(images[0], str):
            images_pil = [Image.open(img).convert("RGB") for img in images]
        elif isinstance(images[0], Image.Image):
            images_pil = [img.convert("RGB") for img in images]
        elif isinstance(images[0], bytes):
            images_pil = [Image.open(io.BytesIO(img)).convert("RGB") for img in images]
        else:
            raise ValueError("unsupported type")

        processed_images = []
        have_multi_images = len(images_pil) > 1
        for img in images_pil:
            if self.anyres:
                # Tile into multiple crops unless this is a multi-image sample
                # and only_crop_single_image restricts tiling to single images.
                if not self.only_crop_single_image or not have_multi_images:
                    image = process_anyres_image(img, self.siglip_image_processor, self.grid_pinpoints)
                else:
                    image = [self.siglip_image_processor(img, return_tensors="pt")["pixel_values"][0]]
            else:
                image = self.siglip_image_processor(img, return_tensors="pt")["pixel_values"][0]

            processed_images.append(image)

        if not self.anyres:
            return torch.stack(processed_images, dim=0)
        else:
            return [torch.stack(img, dim=0) for img in processed_images]
167
+
168
    def preprocess_images_qwen2vl(self, images) -> dict:
        """Preprocess images for the Qwen2-VL (NaViT) tower.

        Args:
            images: list of file paths, PIL images, or raw bytes.
        Returns:
            The Qwen2VLImageProcessor output dict (pixel_values, grid_thw, ...)
            with an extra "image_sizes" entry: [[(w, h), ...]] of the original
            PIL sizes.
        """
        if isinstance(images[0], str):
            images_pil = [Image.open(img).convert("RGB") for img in images]
        elif isinstance(images[0], Image.Image):
            images_pil = [img.convert("RGB") for img in images]
        elif isinstance(images[0], bytes):
            images_pil = [Image.open(io.BytesIO(img)).convert("RGB") for img in images]
        else:
            raise ValueError("unsupported type")

        image_sizes = [[x.size for x in images_pil]]
        # fetch_image applies qwen_vl_utils' smart resize to the pixel budget.
        data_dict_qwen2vl = self.qwen2vl_image_processor(
            [fetch_image({"image": img}) for img in images_pil],
            return_tensors="pt"
        )

        data_dict_qwen2vl["image_sizes"] = image_sizes

        return data_dict_qwen2vl
187
+
188
+ def preprocess_multimodal(self, conversations):
189
+ for sentence in conversations:
190
+ if sentence["role"] == "system":
191
+ continue
192
+ segs = re.split(DEFAULT_IMAGE_TOKEN, sentence["content"])
193
+ if self.use_special_start_end_token:
194
+ sentence["content"] = (DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN).join(segs)
195
+ else:
196
+ sentence["content"] = DEFAULT_IMAGE_TOKEN.join(segs)
197
+
198
+ return conversations
199
+
200
    def preprocess_images_aimv2(self, images) -> torch.FloatTensor:
        """Preprocess images for the AIMv2 tower (Ovis-style tiling).

        Args:
            images: list of file paths, PIL images, or raw bytes.
        Returns:
            A pair: (list of (tensor, placeholders) per image, [sizes]) where
            placeholders are the Ovis image-placeholder token ids and sizes
            are the original (w, h) PIL sizes. With anyres, each image's crop
            tensors are concatenated along dim 0.
        """
        processed_images = []
        image_sizes_list = []
        have_multi_images = len(images) > 1
        for image_file in images:
            if isinstance(image_file, str):
                img = Image.open(image_file).convert("RGB")
            elif isinstance(image_file, Image.Image):
                img = image_file.convert("RGB")
            elif isinstance(image_file, bytes):
                img = Image.open(io.BytesIO(image_file)).convert("RGB")
            else:
                raise ValueError("unsupported type")
            image_sizes_list.append(img.size)
            # max_partition=9 allows up to 9 tiles; 1 disables tiling. Tiling
            # is restricted to single-image samples when configured so.
            if self.anyres:
                if not self.only_crop_single_image or not have_multi_images:
                    img, ovis_image_placeholders = preprocess_image_ovis(img, image_processor=self.aimv2_image_processor, crop_size=self.aimv2_crop_size, max_partition=9)
                else:
                    img, ovis_image_placeholders = preprocess_image_ovis(img, image_processor=self.aimv2_image_processor, crop_size=self.aimv2_crop_size, max_partition=1)
            else:
                img, ovis_image_placeholders = preprocess_image_ovis(img, image_processor=self.aimv2_image_processor, crop_size=self.aimv2_crop_size, max_partition=1)
            img = (img, ovis_image_placeholders)
            processed_images.append(img)

        if not self.anyres:
            return [(img[0], img[1]) for img in processed_images], [image_sizes_list]
        else:
            return [(torch.cat(img[0], dim=0), img[1]) for img in processed_images], [image_sizes_list]
228
+
229
+
230
    def preprocess_qwen2(
        self,
        conversations,
        tokenizer: PreTrainedTokenizer,
        has_image: bool = False,
        inference: bool = False,
        only_mask_system: bool = False,
    ) -> dict:
        """Tokenize a qwen2-format conversation and build training labels.

        Labels mask (with -100) everything up to each assistant reply; with
        only_mask_system, only the prompt up to the user turn is masked.
        At inference the final (generation) round is left unterminated.

        Args:
            conversations: [{"role": ..., "content": ...}, ...]; element 0
                must be the system turn, then strictly alternating user /
                assistant turns.
            has_image: route tokenization through tokenizer_image_token so
                image placeholders become IMAGE_TOKEN_INDEX.
        Returns:
            {"input_ids": LongTensor, "labels": LongTensor} (flat, unbatched).
        """
        conv = types.SimpleNamespace(
            system="You are a helpful assistant.",
            roles=("user", "assistant"),
            version="qwen2",
            offset=0,
            sep="<|im_start|>",
            sep2="<|im_end|>\n",
        )

        # Check system prompt
        assert conversations[0]["role"] == "system"
        if conversations[0]["content"] == None:
            conversations[0]["content"] = conv.system  # use default system prompt

        # Check conversation sequence (user/assistant must alternate).
        for j, sentence in enumerate(conversations[1:]):
            role = sentence["role"]
            assert role == conv.roles[j % 2], "The conversation sequence is incorrect."

        conversation_str = tokenizer.apply_chat_template(conversations, tokenize=False, add_generation_prompt=inference)

        # Mask targets: split back into rounds on the <|im_end|>\n separator,
        # tokenize each round, and mask its prompt prefix with -100.
        rounds = conversation_str.split(conv.sep2)
        input_ids_ = torch.tensor([], dtype=torch.int64)
        targets_ = torch.tensor([], dtype=torch.int64)
        for i, rou in enumerate(rounds):
            if rou == "":
                continue
            # Re-append the separator except on the trailing generation prompt.
            if (not inference) or (i < (len(rounds) - 1)):
                rou += conv.sep2
            if has_image:
                cur_input_ids_ = self.tokenizer_image_token(rou, tokenizer, return_tensors='pt')
                input_ids_ = torch.cat([input_ids_, cur_input_ids_], dim=0)
                # mask_len = tokens of the round with the reply body stripped
                # (regex truncates at the role header), i.e. the prompt part.
                if only_mask_system:
                    mask_len = len(self.tokenizer_image_token(re.sub(rf'{conv.roles[0]}\n[\s\S]*', f'{conv.roles[0]}:', rou),
                                                              tokenizer))
                else:
                    mask_len = len(self.tokenizer_image_token(re.sub(rf'{conv.roles[1]}\n[\s\S]*', f'{conv.roles[1]}:', rou),
                                                              tokenizer))
                targets_ = torch.cat([targets_, torch.tensor([-100] * mask_len), cur_input_ids_[mask_len:]], dim=0)
            else:
                cur_input_ids_ = tokenizer(rou, return_tensors='pt')["input_ids"][0, :]
                input_ids_ = torch.cat([input_ids_, cur_input_ids_], dim=0)
                mask_len = len(tokenizer(re.sub(rf'{conv.roles[1]}\n[\s\S]*', rf'{conv.roles[1]}:', rou))["input_ids"][:])
                targets_ = torch.cat([targets_, torch.tensor([-100] * mask_len), cur_input_ids_[mask_len:]], dim=0)

        return {"input_ids": input_ids_, "labels": targets_}
285
+
286
+
287
    def preprocess_qwen3(
        self,
        conversations,
        tokenizer: PreTrainedTokenizer,
        has_image: bool = False,
        inference: bool = False,
        only_mask_system: bool = False,
        enable_thinking: bool = False,  # forwarded to apply_chat_template (Qwen3 thinking mode)
    ) -> dict:
        """Qwen3 variant of preprocess_qwen2.

        NOTE(review): identical to preprocess_qwen2 except for the
        `enable_thinking` flag passed to apply_chat_template — the two should
        eventually share one implementation.

        Returns {"input_ids": LongTensor, "labels": LongTensor}; labels mask
        prompt tokens with -100 (see preprocess_qwen2 for details).
        """
        conv = types.SimpleNamespace(
            system="You are a helpful assistant.",
            roles=("user", "assistant"),
            version="qwen3",
            offset=0,
            sep="<|im_start|>",
            sep2="<|im_end|>\n",
        )

        # Check system prompt
        assert conversations[0]["role"] == "system"
        if conversations[0]["content"] == None:
            conversations[0]["content"] = conv.system  # use default system prompt

        # Check conversation sequence (user/assistant must alternate).
        for j, sentence in enumerate(conversations[1:]):
            role = sentence["role"]
            assert role == conv.roles[j % 2], "The conversation sequence is incorrect."

        conversation_str = tokenizer.apply_chat_template(conversations, tokenize=False, add_generation_prompt=inference, enable_thinking=enable_thinking)

        # Mask targets (same scheme as preprocess_qwen2).
        rounds = conversation_str.split(conv.sep2)
        input_ids_ = torch.tensor([], dtype=torch.int64)
        targets_ = torch.tensor([], dtype=torch.int64)
        for i, rou in enumerate(rounds):
            if rou == "":
                continue
            if (not inference) or (i < (len(rounds) - 1)):
                rou += conv.sep2
            if has_image:
                cur_input_ids_ = self.tokenizer_image_token(rou, tokenizer, return_tensors='pt')
                input_ids_ = torch.cat([input_ids_, cur_input_ids_], dim=0)
                if only_mask_system:
                    mask_len = len(self.tokenizer_image_token(re.sub(rf'{conv.roles[0]}\n[\s\S]*', f'{conv.roles[0]}:', rou),
                                                              tokenizer))
                else:
                    mask_len = len(self.tokenizer_image_token(re.sub(rf'{conv.roles[1]}\n[\s\S]*', f'{conv.roles[1]}:', rou),
                                                              tokenizer))
                targets_ = torch.cat([targets_, torch.tensor([-100] * mask_len), cur_input_ids_[mask_len:]], dim=0)
            else:
                cur_input_ids_ = tokenizer(rou, return_tensors='pt')["input_ids"][0, :]
                input_ids_ = torch.cat([input_ids_, cur_input_ids_], dim=0)
                mask_len = len(tokenizer(re.sub(rf'{conv.roles[1]}\n[\s\S]*', rf'{conv.roles[1]}:', rou))["input_ids"][:])
                targets_ = torch.cat([targets_, torch.tensor([-100] * mask_len), cur_input_ids_[mask_len:]], dim=0)

        return {"input_ids": input_ids_, "labels": targets_}
348
+
349
+
350
    def preprocess_ovis2(
        self,
        source,  # conversation turns; must NOT include a system turn
        tokenizer: PreTrainedTokenizer,
        has_image: bool = False,
        inference: bool = False,
        only_mask_system: bool = False,
        video_len: int = 0,
    ):
        """Tokenize a conversation in the Ovis2 chat format.

        Handles two turn schemas: ShareGPT-style ({"from": "human"/"gpt",
        "value": ...}) and role-style ({"role": ..., "value": ...}). Image
        placeholders become IMAGE_TOKEN_INDEX; a <video> placeholder expands
        to `video_len` <image> lines. Only the LAST assistant header onward is
        labeled; everything before it is masked with IGNORE_INDEX.

        NOTE(review): the two branches are near-duplicates and the role-style
        branch reads message["value"] (not "content") — confirm callers use
        that key.

        Returns {"input_ids": LongTensor, "labels": LongTensor}.
        """
        judge_format = "from" in source[0].keys()  # ShareGPT vs role schema

        if judge_format:
            # Drop a trailing assistant turn (it would be regenerated).
            if source[-1]["from"] == "gpt":
                source = source[:-1]

            roles = {"human": 'user', "gpt": 'assistant'}
            input_ids = []
            labels = []
            messages = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            for message in source:
                if message["from"] == "human":
                    user = message["value"]
                    if '<image>' not in user and '<video>' not in user:
                        messages += f"<|im_start|>{roles['human']}\n" + user + "<|im_end|>\n"

                    if '<image>' in user:
                        messages += f"<|im_start|>{roles['human']}\n" + user + "<|im_end|>\n"

                    if '<video>' in user:
                        # Expand the video placeholder into one <image> per frame.
                        user = user.replace('<video>', '\n'.join(['<image>'] * video_len) + '\n')
                        messages += f"<|im_start|>{roles['human']}\n" + user + "<|im_end|>\n"


                elif message["from"] == "gpt":
                    assistant = message["value"]
                    messages += f"<|im_start|>{roles['gpt']}\n" + assistant + "<|im_end|>\n"
            if inference:
                # Open an assistant header for generation.
                messages += f"<|im_start|>{roles['gpt']}\n"
            else:
                messages = messages[:-1]  # remove the final '\n', keep <|im_end|> as the end

            # Tokenize around <image> placeholders, splicing in the image id.
            messages = messages.split('<image>')
            messages = [tokenizer.encode(m) for m in messages]
            for m in messages[:-1]:
                input_ids += m
                input_ids += [IMAGE_TOKEN_INDEX]
            input_ids += messages[-1]

            # mask last assistant: find the LAST assistant header and label
            # only the tokens after it.
            head_id = tokenizer.encode(f'<|im_start|>{roles["gpt"]}\n')
            last_id = None
            for i, id in enumerate(input_ids):
                if input_ids[i:i+len(head_id)] == head_id:
                    last_id = i+len(head_id)
                if i+len(head_id) > len(input_ids):
                    break

            assert last_id != None
            labels = len(input_ids) * [IGNORE_INDEX]
            labels[last_id:] = input_ids[last_id:]
            return {"input_ids": torch.tensor(input_ids), "labels": torch.tensor(labels)}

        else:
            # Role-style schema: same flow with "role" keys.
            if source[-1]["role"] == "assistant":
                source = source[:-1]

            input_ids = []
            labels = []
            messages = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            for message in source:
                if message["role"] == "user":
                    user = message["value"]
                    if '<image>' not in user and '<video>' not in user:
                        messages += f"<|im_start|>user\n" + user + "<|im_end|>\n"

                    if '<image>' in user:
                        messages += f"<|im_start|>user\n" + user + "<|im_end|>\n"

                    if '<video>' in user:
                        # Expand the video placeholder into one <image> per frame.
                        user = user.replace('<video>', '\n'.join(['<image>'] * video_len) + '\n')
                        messages += f"<|im_start|>user\n" + user + "<|im_end|>\n"

                elif message["role"] == "assistant":
                    assistant = message["value"]
                    messages += f"<|im_start|>assistant\n" + assistant + "<|im_end|>\n"
            if inference:
                messages += f"<|im_start|>assistant\n"
            else:
                messages = messages[:-1]  # remove the final '\n', keep <|im_end|> as the end

            messages = messages.split('<image>')
            messages = [tokenizer.encode(m) for m in messages]
            for m in messages[:-1]:
                input_ids += m
                input_ids += [IMAGE_TOKEN_INDEX]
            input_ids += messages[-1]

            # mask last assistant
            head_id = tokenizer.encode(f'<|im_start|>assistant\n')
            last_id = None
            for i, id in enumerate(input_ids):
                if input_ids[i:i+len(head_id)] == head_id:
                    last_id = i+len(head_id)
                if i+len(head_id) > len(input_ids):
                    break

            assert last_id != None
            labels = len(input_ids) * [IGNORE_INDEX]
            labels[last_id:] = input_ids[last_id:]
            return {"input_ids": torch.tensor(input_ids), "labels": torch.tensor(labels)}
469
+
470
+
471
+ def tokenizer_image_token(
472
+ self,
473
+ prompt,
474
+ tokenizer,
475
+ image_token_index=IMAGE_TOKEN_INDEX,
476
+ return_tensors=None,
477
+ ):
478
+ def split_with_token(string, token):
479
+ result = string.split(token)
480
+ for i in range(len(result) - 1):
481
+ result.insert(i * 2 + 1, token)
482
+ return result
483
+
484
+ if len(prompt) > SEQ_MAX_LEN:
485
+ raise ValueError("sequence is too long !!!")
486
+
487
+ prompt_chunks = split_with_token(prompt, DEFAULT_IMAGE_TOKEN)
488
+ input_ids, offset = ([tokenizer.bos_token_id], 1) if getattr(tokenizer,'bos_token',None) else ([], 0)
489
+ token2index = {DEFAULT_IMAGE_TOKEN: image_token_index}
490
+ for chunk in prompt_chunks:
491
+ if chunk in token2index:
492
+ input_ids.append(token2index[chunk])
493
+ else:
494
+ chunk_ids = tokenizer(chunk).input_ids
495
+ if chunk_ids[0] != getattr(tokenizer,'bos_token_id', None):
496
+ offset = 0
497
+ input_ids.extend(chunk_ids[offset:])
498
+
499
+ if return_tensors is not None:
500
+ if return_tensors == "pt":
501
+ return torch.tensor(input_ids, dtype=torch.long)
502
+ raise ValueError(f"Unsupported tensor type: {return_tensors}")
503
+ return input_ids
504
+
505
+
506
+
507
    def __call__(self, messages, inference=True, **kwargs) -> BatchFeature:
        """Preprocess one multimodal sample into a batch-of-1 ``BatchFeature``.

        Args:
            messages: dict with an optional "images" entry (a path/str, a list,
                or empty — an all-black placeholder image is used when absent)
                and a "conversations" list, in either role/content or
                from/value form.
            inference: when True, the sample is prepared for generation (in the
                qwen branches the last message must come from the user).
            **kwargs: per-call overrides — ``video_len`` (ovis2), ``max_pixels``
                / ``min_pixels`` (qwen2/qwen3), ``enable_thinking`` (qwen3).

        Returns:
            BatchFeature containing batched ``input_ids``/``labels``/``images``
            (batch size 1), plus qwen2-vl pixel data in the qwen branches.

        Raises:
            ValueError: if ``self.process_mode`` is not "ovis2"/"qwen2"/"qwen3".
        """
        process_mode = self.process_mode
        if process_mode == "ovis2":
            video_len = kwargs.get('video_len', 0)

            # Fall back to the built-in black placeholder when no image is given;
            # normalize a single image to a one-element list.
            if "images" not in messages or not messages["images"] or not messages["images"][0]:
                images = [self.black_img]
            elif type(messages["images"]) == str:
                images = [messages["images"]]
            else:
                images = messages["images"]

            conversations = messages["conversations"]

            # Adapt role/content (user-assistant) format to from/value (human-gpt).
            if "role" in conversations[0]:
                new_conversations = []
                for conversation in conversations:
                    if conversation["role"] == "system":
                        new_conversations.append({"from": "system", "value": conversation["content"]})
                    elif conversation["role"] == "user":
                        new_conversations.append({"from": "human", "value": conversation["content"]})
                    elif conversation["role"] == "assistant":
                        new_conversations.append({"from": "gpt", "value": conversation["content"]})
                conversations = new_conversations

            # Prepend one <image> marker per image to the first non-system turn,
            # unless the prompt already carries markers.
            first_conv = conversations[1] if conversations[0]["from"] == "system" else conversations[0]
            if images and "<image>" not in first_conv["value"]:
                image_token = "\n".join(["<image>"] * len(images))
                first_conv["value"] = f"{image_token}\n{first_conv['value']}"

            data_dict = self.preprocess_ovis2(conversations, self.tokenizer, has_image=True, only_mask_system=False, inference=inference, video_len=video_len)
            data_dict['images'], data_dict['image_sizes'] = self.preprocess_images_aimv2(images)
            # Splice per-image placeholder ids into input_ids/labels.
            data_dict = ovis_template_process(data_dict)
            # Wrap as a batch of size 1.
            data_dict['images'] = [data_dict['images']]
            data_dict['input_ids'] = data_dict['input_ids'].unsqueeze(0)
            return BatchFeature(data={**data_dict})

        elif process_mode == "qwen2" or process_mode == "qwen3":
            # NOTE(review): these overrides mutate the shared image processor,
            # so they persist across calls — confirm this is intended.
            max_pixels=kwargs.get("max_pixels", self.max_pixels)
            min_pixels=kwargs.get("min_pixels", self.min_pixels)
            if max_pixels is not None:
                self.qwen2vl_image_processor.max_pixels = max_pixels
            if min_pixels is not None:
                self.qwen2vl_image_processor.min_pixels = min_pixels

            # Deal with images (same placeholder/normalization as above).
            if "images" not in messages or not messages["images"] or not messages["images"][0]:
                images = [self.black_img]
            elif type(messages["images"]) == str:
                images = [messages["images"]]
            else:
                images = messages["images"]

            # Deal with conversations: guarantee a leading system turn.
            conversations = messages["conversations"]
            if conversations[0]["role"] != "system":
                conversations = [{"role":"system", "content": None}] + conversations # dummy system prompt

            # Insert special token `<image>` into the first user turn.
            assert conversations[1]["role"] == "user"
            if images and "<image>" not in conversations[1]["content"]:
                image_token = " ".join(["<image>"] * len(images))
                conversations[1]["content"] = f"{image_token}\n{conversations[1]['content']}"

            # NOTE(review): the assertion checks role == "user" but its message
            # says "assistant" — the message text looks wrong; verify intent.
            if inference:
                assert conversations[-1]["role"] == "user", "the last message should be assistant if inference=True"

            # Image preprocess: siglip features are skipped in navit-only mode.
            if self.only_navit:
                precessed_images_siglip = None
            else:
                precessed_images_siglip = self.preprocess_images_siglip(images)
            processed_data_dict_qwen2vl = self.preprocess_images_qwen2vl(images)
            source = self.preprocess_multimodal(conversations)
            if process_mode == "qwen2":
                data_dict = self.preprocess_qwen2(source, self.tokenizer, has_image=True, only_mask_system=False, inference=inference)
            if process_mode == "qwen3":
                # ZYF Modify to support thinking
                enable_thinking = kwargs.get("enable_thinking", True) # enabled by default
                data_dict = self.preprocess_qwen3(source, self.tokenizer, has_image=True, only_mask_system=False, inference=inference, enable_thinking=enable_thinking)
            # Construct batch data (batch_size = 1).
            data_dict["input_ids"] = data_dict["input_ids"].unsqueeze(0)
            data_dict["labels"] = data_dict["labels"].unsqueeze(0)
            data_dict["images"] = [precessed_images_siglip]

            return BatchFeature(data={**data_dict, **processed_data_dict_qwen2vl})
        else:
            raise ValueError(f"Unsupported process mode: {process_mode}")
603
+
604
+ def batch_decode(self, *args, **kwargs):
605
+ """
606
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
607
+ refer to the docstring of this method for more information.
608
+ """
609
+ return self.tokenizer.batch_decode(*args, **kwargs)
610
+
611
+
612
+ def decode(self, *args, **kwargs):
613
+ """
614
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
615
+ the docstring of this method for more information.
616
+ """
617
+ return self.tokenizer.decode(*args, **kwargs)
618
+
special_tokens_map.json CHANGED
@@ -1,3 +1,37 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8311bb5d0985329c5f9af218ea4b325fa39ab2b5a4655181ec65fc1a9f2702c7
3
- size 709
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>",
16
+ "<im_start>",
17
+ "<im_end>",
18
+ "<vi_start>",
19
+ "<vi_end>",
20
+ "<cor>",
21
+ "<\\cor>"
22
+ ],
23
+ "eos_token": {
24
+ "content": "<|im_end|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<|endoftext|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer_config.json CHANGED
@@ -1,3 +1,298 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:eeb1cc1135873700e48cddc62037d02509513ddd52affce419806f26edbdf5f6
3
- size 6828
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "151669": {
214
+ "content": "<im_start>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<im_end>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "151671": {
230
+ "content": "<vi_start>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "151672": {
238
+ "content": "<vi_end>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "151673": {
246
+ "content": "<cor>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "151674": {
254
+ "content": "<\\cor>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ }
261
+ },
262
+ "additional_special_tokens": [
263
+ "<|im_start|>",
264
+ "<|im_end|>",
265
+ "<|object_ref_start|>",
266
+ "<|object_ref_end|>",
267
+ "<|box_start|>",
268
+ "<|box_end|>",
269
+ "<|quad_start|>",
270
+ "<|quad_end|>",
271
+ "<|vision_start|>",
272
+ "<|vision_end|>",
273
+ "<|vision_pad|>",
274
+ "<|image_pad|>",
275
+ "<|video_pad|>",
276
+ "<im_start>",
277
+ "<im_end>",
278
+ "<vi_start>",
279
+ "<vi_end>",
280
+ "<cor>",
281
+ "<\\cor>"
282
+ ],
283
+ "auto_map": {
284
+ "AutoProcessor": "/mnt/bn/ecomcommonnas/zhangshuo/easyguard/checkpoints/VALLEY_B8_V1_GTHINKER_ENABLE_THINKING_COLD_START_V0908_MERGED_V1/checkpoint-400--processing_valley.ValleyProcessor"
285
+ },
286
+ "bos_token": null,
287
+ "clean_up_tokenization_spaces": false,
288
+ "eos_token": "<|im_end|>",
289
+ "errors": "replace",
290
+ "extra_special_tokens": {},
291
+ "model_max_length": 4096,
292
+ "pad_token": "<|endoftext|>",
293
+ "padding_side": "right",
294
+ "processor_class": "ValleyProcessor",
295
+ "split_special_tokens": false,
296
+ "tokenizer_class": "Qwen2Tokenizer",
297
+ "unk_token": null
298
+ }
utils.py CHANGED
@@ -1,3 +1,409 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b00b4131a3462f2ac50dd86d99aaafd0d21e98f24bf843f349d622cbe0d2bcb2
3
- size 16377
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ from io import BytesIO
3
+ import base64
4
+ import math
5
+ import ast
6
+ import re
7
+ import torch
8
+ from transformers import StoppingCriteria
9
+
10
+ IGNORE_INDEX = -100
11
+ IMAGE_TOKEN_INDEX = -200
12
+ GANDALF_TOKEN_INDEX = -300
13
+ DEFAULT_PAD_TOKEN = "[PAD]"
14
+ DEFAULT_EOS_TOKEN = "</s>"
15
+ DEFAULT_BOS_TOKEN = "</s>"
16
+ DEFAULT_UNK_TOKEN = "<unk>"
17
+ DEFAULT_IMAGE_TOKEN = "<image>"
18
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
19
+ DEFAULT_IM_START_TOKEN = "<im_start>"
20
+ DEFAULT_IM_END_TOKEN = "<im_end>"
21
+ DEFAULT_VIDEO_TOKEN = "<video>"
22
+ DEFAULT_VIDEO_FRAME_TOKEN = "<vi_frame>"
23
+ DEFAULT_VI_START_TOKEN = "<vi_start>"
24
+ DEFAULT_VI_END_TOKEN = "<vi_end>"
25
+ DEFAULT_EOC_TOKEN = "<eoc>"
26
+ COR_START_TOKEN = "<cor>"
27
+ COR_END_TOKEN = "<\cor>"
28
+ SEQ_MAX_LEN = 50000
29
+ BLACK_IMG_ENV = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x03\x00\x00\x00\x03\x08\x02\x00\x00\x00\xd9J"\xe8\x00\x00\x00\x12IDAT\x08\x1dcd\x80\x01F\x06\x18`d\x80\x01\x00\x00Z\x00\x04we\x03N\x00\x00\x00\x00IEND\xaeB`\x82'
30
+
31
+
32
def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
    """
    Calculate the shape of the image patch grid after any-resolution preprocessing.
    Args:
        image_size (tuple): The size of the input image in the format (width, height).
        grid_pinpoints (str | list): Possible resolutions — either a literal list
            (or its string repr) or a range string such as "(1x1),...,(3x3)".
        patch_size (int): The size of each image patch.
    Returns:
        tuple: The shape of the image patch grid in the format (width, height).
    """
    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
        assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
        # Expand the "(AxB),...,(CxD)" range notation into an inclusive grid of
        # (rows, cols) pairs, then scale each pair to pixel resolutions.
        found = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
        lo = tuple(map(int, found[0]))
        hi = tuple(map(int, found[-1]))
        grid_pinpoints = [
            [rows * patch_size, cols * patch_size]
            for rows in range(lo[0], hi[0] + 1)
            for cols in range(lo[1], hi[1] + 1)
        ]
    possible_resolutions = grid_pinpoints if type(grid_pinpoints) is list else ast.literal_eval(grid_pinpoints)
    width, height = select_best_resolution(image_size, possible_resolutions)
    return width // patch_size, height // patch_size
62
+
63
def select_best_resolution(original_size, possible_resolutions):
    """
    Select the best-fitting resolution for an image from a list of candidates.

    A candidate is preferred if it preserves more of the original pixels after
    an aspect-ratio-preserving downscale ("effective resolution"); ties are
    broken by wasting the least canvas area.

    Args:
        original_size (tuple): Original image size as (width, height).
        possible_resolutions (list): Candidates as [(w1, h1), (w2, h2), ...].
    Returns:
        tuple: The best fit resolution in the format (width, height).
    """
    orig_w, orig_h = original_size
    orig_area = orig_w * orig_h
    best_fit = None
    best_effective = 0
    best_wasted = float("inf")

    for cand_w, cand_h in possible_resolutions:
        # Aspect-ratio-preserving downscale into the candidate canvas.
        scale = min(cand_w / orig_w, cand_h / orig_h)
        down_w = int(orig_w * scale)
        down_h = int(orig_h * scale)

        # Pixels actually carrying image content, capped at the original area.
        effective = min(down_w * down_h, orig_area)
        wasted = (cand_w * cand_h) - effective

        if effective > best_effective or (effective == best_effective and wasted < best_wasted):
            best_effective = effective
            best_wasted = wasted
            best_fit = (cand_w, cand_h)

    return best_fit
94
+
95
+
96
def unpad_image(tensor, original_size):
    """
    Remove the symmetric padding from a padded-and-resized image tensor.

    Args:
        tensor (torch.Tensor): Image tensor in CxHxW layout.
        original_size (tuple): Pre-padding image size as (width, height).
    Returns:
        torch.Tensor: The tensor with the padded rows or columns cropped away.
    """
    orig_w, orig_h = original_size
    cur_h, cur_w = tensor.shape[1:]

    # Compare aspect ratios to decide which axis received the padding.
    if orig_w / orig_h > cur_w / cur_h:
        # Original is wider: padding was added above and below.
        scale = cur_w / orig_w
        content_h = int(orig_h * scale)
        margin = (cur_h - content_h) // 2
        return tensor[:, margin: cur_h - margin, :]

    # Original is taller (or equal): padding was added left and right.
    scale = cur_h / orig_h
    content_w = int(orig_w * scale)
    margin = (cur_w - content_w) // 2
    return tensor[:, :, margin: cur_w - margin]
127
+
128
+
129
def process_anyres_image(image, processor, grid_pinpoints):
    """
    Process an image with variable resolutions into a list of patch tensors.

    The image is resized/padded to the best candidate resolution, cut into
    patches of the processor's patch size, and a full-image resize is
    prepended; every piece is then run through ``processor.preprocess``.

    Args:
        image (PIL.Image.Image): The input image to be processed.
        processor: The image processor object (must expose ``size`` and
            ``preprocess``).
        grid_pinpoints (str | list): Possible resolutions — either a literal
            list (or its string repr) or a range string such as "(1x1),...,(3x3)".
    Returns:
        list[torch.Tensor]: The processed image patches (overview first).
    """
    # Convert grid_pinpoints from string to list
    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
        try:
            patch_size = processor.size["height"]
        except Exception:
            patch_size = processor.size["shortest_edge"]
        assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
        # Use regex to extract the range from the input string
        matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
        range_start = tuple(map(int, matches[0]))
        range_end = tuple(map(int, matches[-1]))
        # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
        grid_pinpoints = [
            (i, j)
            for i in range(range_start[0], range_end[0] + 1)
            for j in range(range_start[1], range_end[1] + 1)
        ]
        # Multiply all elements by patch_size
        grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]

    if type(grid_pinpoints) is list:
        possible_resolutions = grid_pinpoints
    else:
        possible_resolutions = ast.literal_eval(grid_pinpoints)
    best_resolution = select_best_resolution(image.size, possible_resolutions)
    image_padded = resize_and_pad_image(image, best_resolution)

    patches = divide_to_patches(image_padded, processor.size["height"])

    # FIXME: this seems to be a bug that it resizes instead of pad.
    # but to keep it consistent with previous, i will keep it as it is
    # TODO: uncomment below to ablate with the padding
    if isinstance(processor.size, dict):
        shortest_edge = processor.size["height"]
    else:
        shortest_edge = min(processor.size)
    # Overview image: a plain (aspect-distorting) square resize of the original.
    image_original_resize = image.resize((shortest_edge, shortest_edge))
    # image_padded_square = expand2square(image, tuple(int(x*255) for x in processor.image_mean))

    image_patches = [image_original_resize] + patches
    image_patches = [
        processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0]
        for image_patch in image_patches
    ]
    # return torch.stack(image_patches, dim=0)
    return image_patches
185
+
186
def resize_and_pad_image(image, target_resolution):
    """
    Resize an image to fit inside a target resolution (preserving aspect
    ratio) and center it on a black canvas of exactly that resolution.

    Args:
        image (PIL.Image.Image): The input image.
        target_resolution (tuple): Target (width, height) of the output.
    Returns:
        PIL.Image.Image: The resized and padded image.
    """
    src_w, src_h = image.size
    tgt_w, tgt_h = target_resolution

    # The limiting axis determines the uniform scale factor; the other axis is
    # rounded up and clamped so the content never exceeds the canvas.
    scale = min(tgt_w / src_w, tgt_h / src_h)
    new_w = min(math.ceil(src_w * scale), tgt_w)
    new_h = min(math.ceil(src_h * scale), tgt_h)

    resized = image.resize((new_w, new_h))

    # Center the resized content on a black target-sized canvas.
    canvas = Image.new("RGB", (tgt_w, tgt_h), (0, 0, 0))
    canvas.paste(resized, ((tgt_w - new_w) // 2, (tgt_h - new_h) // 2))
    return canvas
221
+
222
def divide_to_patches(image, patch_size):
    """
    Divide an image into square patches, scanning left-to-right, top-to-bottom.

    Args:
        image (PIL.Image.Image): The input image.
        patch_size (int): The side length of each patch.
    Returns:
        list: PIL.Image.Image patches (edge patches may extend past the image;
        PIL crops them as-is).
    """
    width, height = image.size
    boxes = [
        (left, top, left + patch_size, top + patch_size)
        for top in range(0, height, patch_size)
        for left in range(0, width, patch_size)
    ]
    return [image.crop(box) for box in boxes]
240
+
241
+
242
+ from typing import List
243
+ import PIL.Image
244
+ import torch
245
+ import transformers
246
+ IGNORE_ID = -100
247
+ IMAGE_TOKEN_ID = -200
248
+ IMAGE_TOKEN = "<image>"
249
+ IMAGE_ATOM_ID = -300
250
+ IMAGE_INDICATOR_IDS = [-301, -302, -303, -304, -305]
251
+
252
+
253
def construct_image_placeholders(grid):
    """Build the placeholder-id sequence for one image split into a (rows, cols) grid.

    The sequence always starts with begin/atom/separator indicator ids for the
    overview image; when the grid has more than one cell, one atom id per cell
    follows, with column separators inside a row and row separators between
    rows. A terminating indicator id closes the sequence.
    """
    rows, cols = grid
    placeholders = [IMAGE_INDICATOR_IDS[0], IMAGE_ATOM_ID, IMAGE_INDICATOR_IDS[1]]
    if rows * cols > 1:
        for r in range(rows):
            for c in range(cols):
                placeholders.append(IMAGE_ATOM_ID)
                if c < cols - 1:
                    placeholders.append(IMAGE_INDICATOR_IDS[2])
            if r < rows - 1:
                placeholders.append(IMAGE_INDICATOR_IDS[3])
    placeholders.append(IMAGE_INDICATOR_IDS[4])
    return placeholders
265
+
266
+
267
def preprocess_image_ovis(image: PIL.Image.Image, image_processor, crop_size, max_partition=9, covering_threshold=0.9, convert_to_rgb=True):
    """Preprocess one image Ovis-style: pick the best tiling grid, crop tiles,
    and preprocess each tile to a square of side ``crop_size``.

    Args:
        image: input PIL image.
        image_processor: HF-style processor exposing ``preprocess(..., size=...)``.
        crop_size: side length of each square tile.
        max_partition: maximum number of tiles (rows * cols <= max_partition).
        covering_threshold: minimum covering ratio for a grid to count as "good".
        convert_to_rgb: convert non-RGB inputs to RGB first.

    Returns:
        tuple: (list of per-tile pixel-value tensors, placeholder id sequence
        from ``construct_image_placeholders``).

    Raises:
        ValueError: if the derived tile size is non-square (cannot happen with
            the current ``[crop_size, crop_size]`` construction).
    """
    def _preprocess(img: PIL.Image.Image, side):
        # Resize so the longer edge equals `side`, preprocess, then center-pad
        # the result into a `side` x `side` square of zeros.
        # first resize and preprocess
        w, h = img.size
        if w == h:
            new_width = new_height = side
        elif w > h:
            new_width = side
            new_height = int(h / w * new_width)
        else:
            new_height = side
            new_width = int(w / h * new_height)
        new_size = dict(height=new_height, width=new_width)
        pixel_values = image_processor.preprocess(img, size=new_size, return_tensors='pt')['pixel_values']

        # then pad to square
        square_values = torch.zeros([1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device)
        new_height, new_width = pixel_values.shape[2:]
        if new_height == new_width:
            square_values[:, :, :, :] = pixel_values
        elif new_height > new_width:
            from_index = (side - new_width) // 2
            square_values[:, :, :, from_index:from_index + new_width] = pixel_values
        else:
            from_index = (side - new_height) // 2
            square_values[:, :, from_index:from_index + new_height, :] = pixel_values

        return square_values

    def _partition(img, grid):
        # Split the image into grid[0] x grid[1] crop boxes; the last row/column
        # absorbs any remainder so the whole image is covered.
        w, h = img.size
        row_height = h // grid[0]
        col_width = w // grid[1]

        partition = []
        for row in range(grid[0]):
            for col in range(grid[1]):
                left = col * col_width
                upper = row * row_height
                right = w if col == grid[1] - 1 else (col + 1) * col_width
                lower = h if row == grid[0] - 1 else (row + 1) * row_height
                partition.append((left, upper, right, lower))

        return partition

    def _covering_area(left, upper, right, lower, side):
        # Area of a crop after its longer edge is capped at `side`
        # (aspect-ratio-preserving shrink).
        w = right - left
        h = lower - upper
        w, h = max(w, h), min(w, h)
        if w > side:
            h = h / w * side
            w = side
        return w * h

    def _get_best_grid(img, side):
        # Score every (rows, cols) grid with rows*cols <= max_partition by how
        # much of the image its tiles cover at the target side length.
        img_area = img.size[0] * img.size[1]

        candidate_grids = []
        for i in range(1, max_partition + 1):
            for j in range(1, max_partition + 1):
                if i * j <= max_partition:
                    candidate_grids.append((i, j))

        all_grids = []
        good_grids = []
        for grid in candidate_grids:
            partition = _partition(img, grid)
            covering_ratio = sum([_covering_area(*p, side) for p in partition]) / img_area
            assert covering_ratio <= 1.0
            all_grids.append((grid, covering_ratio))
            if covering_ratio > covering_threshold:
                good_grids.append((grid, covering_ratio))

        if len(good_grids) > 0:
            # pick the good partition with minimum #sub_images and break the tie using covering_ratio
            return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][0]
        else:
            # pick the partition with maximum covering_ratio and break the tie using #sub_images
            return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]

    if convert_to_rgb and image.mode != 'RGB':
        image = image.convert('RGB')

    # sides = self.get_image_size()
    sides = [crop_size, crop_size]
    if sides[0] != sides[1]:
        raise ValueError('get_image_size() returns non-square size')
    side = sides[0]
    grid = _get_best_grid(image, side)
    partition = _partition(image, grid)
    crops = [image.crop(p) for p in partition]
    # When tiled, prepend the full image as an overview crop.
    if len(crops) > 1:
        crops.insert(0, image)
    # pixel_values = torch.cat([_preprocess(crop, side) for crop in crops], dim=0)
    pixel_values = [_preprocess(crop, side) for crop in crops] # cat in the outer function
    image_placeholders = construct_image_placeholders(grid)
    return pixel_values, image_placeholders
364
+
365
+
366
+
367
def ovis_template_process(data_dict):
    """Expand each IMAGE_TOKEN_ID in ``input_ids`` into its image's placeholder run.

    ``data_dict['images']`` is expected to be a list of
    (pixel_values, placeholder_ids) pairs as returned by
    ``preprocess_image_ovis``. After this call, ``data_dict['images']`` holds
    only the pixel values, and ``input_ids``/``labels`` are rebuilt with the
    placeholder ids spliced in; placeholder positions are masked with -100 in
    the labels so they do not contribute to the loss.
    """
    image = data_dict['images']
    input_ids = data_dict['input_ids']
    labels = data_dict['labels']
    placeholder = []
    new_input_ids = []
    new_labels = []
    # One placeholder-id sequence per image.
    for img in image:
        placeholder.append(img[1])

    # There must be exactly one image-token slot per image.
    indices = torch.nonzero(input_ids==IMAGE_TOKEN_ID).squeeze(1)
    assert len(placeholder) == len(indices)

    cnt = 0  # number of image slots expanded so far
    idx = 0  # read cursor into the original input_ids/labels
    for ids in input_ids:
        if ids == IMAGE_TOKEN_ID:
            # Replace the single image token with the full placeholder run,
            # masked out of the loss.
            for i in placeholder[cnt]:
                new_input_ids.append(i)
                new_labels.append(-100)
            cnt += 1
            idx += 1
        else:
            new_input_ids.append(input_ids[idx])
            new_labels.append(labels[idx])
            idx += 1

    assert len(new_input_ids) == len(new_labels)
    assert len(placeholder) == cnt

    data_dict['images'] = [img[0] for img in data_dict['images']] # (3,3,448,448)
    data_dict['input_ids'] = torch.tensor(new_input_ids)
    data_dict['labels'] = torch.tensor(new_labels)
    return data_dict
401
+
402
+
403
def pad_truncate_sequence(multimodal_max_length, sequences: List[torch.Tensor], batch_first: bool = True, padding_value: float = 0.0, left_padding: bool = False) -> torch.Tensor:
    """Pad a batch of 1-D sequences to equal length, then truncate to ``multimodal_max_length``.

    Args:
        multimodal_max_length: maximum sequence length to keep.
        sequences: list of 1-D tensors of (possibly) different lengths.
        batch_first: if True the result is (batch, seq); otherwise (seq, batch).
        padding_value: fill value for padded positions.
        left_padding: if True, pad on the left and keep the LAST
            ``multimodal_max_length`` positions of each sequence; otherwise pad
            on the right and keep the first ``multimodal_max_length`` positions.

    Returns:
        torch.Tensor: the padded and truncated batch.
    """
    if not left_padding:
        padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=batch_first, padding_value=padding_value)
        # Keep the head of each sequence, slicing the correct axis.
        return padded[:, :multimodal_max_length] if batch_first else padded[:multimodal_max_length]

    # Left padding: flip each sequence, right-pad, flip back so the padding
    # lands on the left, then keep the tail of each sequence.
    flipped = [seq.flip(dims=[0]) for seq in sequences]
    padded = torch.nn.utils.rnn.pad_sequence(flipped, batch_first=batch_first, padding_value=padding_value)
    if batch_first:
        # BUG FIX: the original sliced `[:, multimodal_max_length:]`, which
        # drops the head and yields an EMPTY tensor whenever the padded length
        # is <= multimodal_max_length; the intent is to keep the last
        # `multimodal_max_length` positions.
        return padded.flip(dims=[1])[:, -multimodal_max_length:]
    # BUG FIX: the original hard-coded batch_first=True in this branch,
    # ignoring the caller's `batch_first` argument.
    return padded.flip(dims=[0])[-multimodal_max_length:]