feat: modify file type of *.py, *.txt, etc. to change storage method
Browse files- .gitattributes +1 -0
- added_tokens.json +34 -3
- chat_template.jinja +85 -3
- chat_template.json +3 -3
- generation_config.json +7 -3
- merges.txt +0 -0
- model.safetensors.index.json +798 -3
- modeling_projector.py +308 -3
- modeling_valley.py +1 -0
- modeling_vision_tower.py +323 -3
- preprocessor_config.json +6 -3
- processing_valley.py +618 -3
- special_tokens_map.json +37 -3
- tokenizer_config.json +298 -3
- utils.py +409 -3
.gitattributes
CHANGED
|
@@ -38,3 +38,4 @@ model-00004-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
|
|
| 38 |
model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 39 |
model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 40 |
valley_structure.png filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 38 |
model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 39 |
model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 40 |
valley_structure.png filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
added_tokens.json
CHANGED
|
@@ -1,3 +1,34 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</think>": 151668,
|
| 3 |
+
"</tool_call>": 151658,
|
| 4 |
+
"</tool_response>": 151666,
|
| 5 |
+
"<\\cor>": 151674,
|
| 6 |
+
"<cor>": 151673,
|
| 7 |
+
"<im_end>": 151670,
|
| 8 |
+
"<im_start>": 151669,
|
| 9 |
+
"<think>": 151667,
|
| 10 |
+
"<tool_call>": 151657,
|
| 11 |
+
"<tool_response>": 151665,
|
| 12 |
+
"<vi_end>": 151672,
|
| 13 |
+
"<vi_start>": 151671,
|
| 14 |
+
"<|box_end|>": 151649,
|
| 15 |
+
"<|box_start|>": 151648,
|
| 16 |
+
"<|endoftext|>": 151643,
|
| 17 |
+
"<|file_sep|>": 151664,
|
| 18 |
+
"<|fim_middle|>": 151660,
|
| 19 |
+
"<|fim_pad|>": 151662,
|
| 20 |
+
"<|fim_prefix|>": 151659,
|
| 21 |
+
"<|fim_suffix|>": 151661,
|
| 22 |
+
"<|im_end|>": 151645,
|
| 23 |
+
"<|im_start|>": 151644,
|
| 24 |
+
"<|image_pad|>": 151655,
|
| 25 |
+
"<|object_ref_end|>": 151647,
|
| 26 |
+
"<|object_ref_start|>": 151646,
|
| 27 |
+
"<|quad_end|>": 151651,
|
| 28 |
+
"<|quad_start|>": 151650,
|
| 29 |
+
"<|repo_name|>": 151663,
|
| 30 |
+
"<|video_pad|>": 151656,
|
| 31 |
+
"<|vision_end|>": 151653,
|
| 32 |
+
"<|vision_pad|>": 151654,
|
| 33 |
+
"<|vision_start|>": 151652
|
| 34 |
+
}
|
chat_template.jinja
CHANGED
|
@@ -1,3 +1,85 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0].role == 'system' %}
|
| 4 |
+
{{- messages[0].content + '\n\n' }}
|
| 5 |
+
{%- endif %}
|
| 6 |
+
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 7 |
+
{%- for tool in tools %}
|
| 8 |
+
{{- "\n" }}
|
| 9 |
+
{{- tool | tojson }}
|
| 10 |
+
{%- endfor %}
|
| 11 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 12 |
+
{%- else %}
|
| 13 |
+
{%- if messages[0].role == 'system' %}
|
| 14 |
+
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
| 15 |
+
{%- endif %}
|
| 16 |
+
{%- endif %}
|
| 17 |
+
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
| 18 |
+
{%- for message in messages[::-1] %}
|
| 19 |
+
{%- set index = (messages|length - 1) - loop.index0 %}
|
| 20 |
+
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
| 21 |
+
{%- set ns.multi_step_tool = false %}
|
| 22 |
+
{%- set ns.last_query_index = index %}
|
| 23 |
+
{%- endif %}
|
| 24 |
+
{%- endfor %}
|
| 25 |
+
{%- for message in messages %}
|
| 26 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
| 27 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
| 28 |
+
{%- elif message.role == "assistant" %}
|
| 29 |
+
{%- set content = message.content %}
|
| 30 |
+
{%- set reasoning_content = '' %}
|
| 31 |
+
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
|
| 32 |
+
{%- set reasoning_content = message.reasoning_content %}
|
| 33 |
+
{%- else %}
|
| 34 |
+
{%- if '</think>' in message.content %}
|
| 35 |
+
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
|
| 36 |
+
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
| 37 |
+
{%- endif %}
|
| 38 |
+
{%- endif %}
|
| 39 |
+
{%- if loop.index0 > ns.last_query_index %}
|
| 40 |
+
{%- if loop.last or (not loop.last and reasoning_content) %}
|
| 41 |
+
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
| 42 |
+
{%- else %}
|
| 43 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 44 |
+
{%- endif %}
|
| 45 |
+
{%- else %}
|
| 46 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 47 |
+
{%- endif %}
|
| 48 |
+
{%- if message.tool_calls %}
|
| 49 |
+
{%- for tool_call in message.tool_calls %}
|
| 50 |
+
{%- if (loop.first and content) or (not loop.first) %}
|
| 51 |
+
{{- '\n' }}
|
| 52 |
+
{%- endif %}
|
| 53 |
+
{%- if tool_call.function %}
|
| 54 |
+
{%- set tool_call = tool_call.function %}
|
| 55 |
+
{%- endif %}
|
| 56 |
+
{{- '<tool_call>\n{"name": "' }}
|
| 57 |
+
{{- tool_call.name }}
|
| 58 |
+
{{- '", "arguments": ' }}
|
| 59 |
+
{%- if tool_call.arguments is string %}
|
| 60 |
+
{{- tool_call.arguments }}
|
| 61 |
+
{%- else %}
|
| 62 |
+
{{- tool_call.arguments | tojson }}
|
| 63 |
+
{%- endif %}
|
| 64 |
+
{{- '}\n</tool_call>' }}
|
| 65 |
+
{%- endfor %}
|
| 66 |
+
{%- endif %}
|
| 67 |
+
{{- '<|im_end|>\n' }}
|
| 68 |
+
{%- elif message.role == "tool" %}
|
| 69 |
+
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
| 70 |
+
{{- '<|im_start|>user' }}
|
| 71 |
+
{%- endif %}
|
| 72 |
+
{{- '\n<tool_response>\n' }}
|
| 73 |
+
{{- message.content }}
|
| 74 |
+
{{- '\n</tool_response>' }}
|
| 75 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 76 |
+
{{- '<|im_end|>\n' }}
|
| 77 |
+
{%- endif %}
|
| 78 |
+
{%- endif %}
|
| 79 |
+
{%- endfor %}
|
| 80 |
+
{%- if add_generation_prompt %}
|
| 81 |
+
{{- '<|im_start|>assistant\n' }}
|
| 82 |
+
{%- if enable_thinking is defined and enable_thinking is false %}
|
| 83 |
+
{{- '<think>\n\n</think>\n\n' }}
|
| 84 |
+
{%- endif %}
|
| 85 |
+
{%- endif %}
|
chat_template.json
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = 
message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}"
|
| 3 |
+
}
|
generation_config.json
CHANGED
|
@@ -1,3 +1,7 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"eos_token_id": 151645,
|
| 4 |
+
"pad_token_id": 151643,
|
| 5 |
+
"transformers_version": "4.54.0",
|
| 6 |
+
"use_cache": true
|
| 7 |
+
}
|
merges.txt
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
model.safetensors.index.json
CHANGED
|
@@ -1,3 +1,798 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"total_parameters": 9423832576,
|
| 4 |
+
"total_size": 18847665152
|
| 5 |
+
},
|
| 6 |
+
"weight_map": {
|
| 7 |
+
"lm_head.weight": "model-00004-of-00004.safetensors",
|
| 8 |
+
"model.embed_tokens.weight": "model-00001-of-00004.safetensors",
|
| 9 |
+
"model.layers.0.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 10 |
+
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 11 |
+
"model.layers.0.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 12 |
+
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 13 |
+
"model.layers.0.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 14 |
+
"model.layers.0.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
|
| 15 |
+
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 16 |
+
"model.layers.0.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 17 |
+
"model.layers.0.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
|
| 18 |
+
"model.layers.0.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
| 19 |
+
"model.layers.0.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
|
| 20 |
+
"model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 21 |
+
"model.layers.1.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 22 |
+
"model.layers.1.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 23 |
+
"model.layers.1.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 24 |
+
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 25 |
+
"model.layers.1.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
| 26 |
+
"model.layers.1.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 27 |
+
"model.layers.1.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 28 |
+
"model.layers.1.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
|
| 29 |
+
"model.layers.1.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
| 30 |
+
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 31 |
+
"model.layers.10.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 32 |
+
"model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 33 |
+
"model.layers.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 34 |
+
"model.layers.10.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
|
| 35 |
+
"model.layers.10.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
| 36 |
+
"model.layers.10.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
| 37 |
+
"model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 38 |
+
"model.layers.10.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 39 |
+
"model.layers.10.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
|
| 40 |
+
"model.layers.10.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 41 |
+
"model.layers.10.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
|
| 42 |
+
"model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 43 |
+
"model.layers.11.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
| 44 |
+
"model.layers.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 45 |
+
"model.layers.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 46 |
+
"model.layers.11.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 47 |
+
"model.layers.11.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
| 48 |
+
"model.layers.11.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 49 |
+
"model.layers.11.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 50 |
+
"model.layers.11.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
| 51 |
+
"model.layers.11.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
| 52 |
+
"model.layers.11.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 53 |
+
"model.layers.12.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
| 54 |
+
"model.layers.12.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 55 |
+
"model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 56 |
+
"model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 57 |
+
"model.layers.12.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 58 |
+
"model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
| 59 |
+
"model.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 60 |
+
"model.layers.12.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
|
| 61 |
+
"model.layers.12.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
|
| 62 |
+
"model.layers.12.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 63 |
+
"model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 64 |
+
"model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 65 |
+
"model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 66 |
+
"model.layers.13.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 67 |
+
"model.layers.13.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 68 |
+
"model.layers.13.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
| 69 |
+
"model.layers.13.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
| 70 |
+
"model.layers.13.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
|
| 71 |
+
"model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 72 |
+
"model.layers.13.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
|
| 73 |
+
"model.layers.13.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
| 74 |
+
"model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 75 |
+
"model.layers.14.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 76 |
+
"model.layers.14.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 77 |
+
"model.layers.14.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 78 |
+
"model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 79 |
+
"model.layers.14.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 80 |
+
"model.layers.14.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
|
| 81 |
+
"model.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 82 |
+
"model.layers.14.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
|
| 83 |
+
"model.layers.14.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
| 84 |
+
"model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 85 |
+
"model.layers.14.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
|
| 86 |
+
"model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 87 |
+
"model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 88 |
+
"model.layers.15.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
|
| 89 |
+
"model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 90 |
+
"model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 91 |
+
"model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
| 92 |
+
"model.layers.15.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 93 |
+
"model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 94 |
+
"model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
| 95 |
+
"model.layers.15.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 96 |
+
"model.layers.15.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 97 |
+
"model.layers.16.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 98 |
+
"model.layers.16.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
| 99 |
+
"model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 100 |
+
"model.layers.16.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 101 |
+
"model.layers.16.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
| 102 |
+
"model.layers.16.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
| 103 |
+
"model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 104 |
+
"model.layers.16.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
|
| 105 |
+
"model.layers.16.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
| 106 |
+
"model.layers.16.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
| 107 |
+
"model.layers.16.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 108 |
+
"model.layers.17.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
| 109 |
+
"model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 110 |
+
"model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 111 |
+
"model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 112 |
+
"model.layers.17.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 113 |
+
"model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
| 114 |
+
"model.layers.17.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 115 |
+
"model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 116 |
+
"model.layers.17.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
| 117 |
+
"model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 118 |
+
"model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 119 |
+
"model.layers.18.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 120 |
+
"model.layers.18.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 121 |
+
"model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 122 |
+
"model.layers.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 123 |
+
"model.layers.18.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 124 |
+
"model.layers.18.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
| 125 |
+
"model.layers.18.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 126 |
+
"model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 127 |
+
"model.layers.18.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
| 128 |
+
"model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 129 |
+
"model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 130 |
+
"model.layers.19.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
| 131 |
+
"model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 132 |
+
"model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 133 |
+
"model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 134 |
+
"model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 135 |
+
"model.layers.19.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
|
| 136 |
+
"model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 137 |
+
"model.layers.19.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 138 |
+
"model.layers.19.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
| 139 |
+
"model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 140 |
+
"model.layers.19.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
|
| 141 |
+
"model.layers.2.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 142 |
+
"model.layers.2.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
| 143 |
+
"model.layers.2.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
|
| 144 |
+
"model.layers.2.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
|
| 145 |
+
"model.layers.2.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 146 |
+
"model.layers.2.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
| 147 |
+
"model.layers.2.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 148 |
+
"model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 149 |
+
"model.layers.2.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
|
| 150 |
+
"model.layers.2.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
| 151 |
+
"model.layers.2.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 152 |
+
"model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 153 |
+
"model.layers.20.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
| 154 |
+
"model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 155 |
+
"model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 156 |
+
"model.layers.20.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
| 157 |
+
"model.layers.20.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
| 158 |
+
"model.layers.20.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
|
| 159 |
+
"model.layers.20.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 160 |
+
"model.layers.20.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
| 161 |
+
"model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 162 |
+
"model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 163 |
+
"model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 164 |
+
"model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 165 |
+
"model.layers.21.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
|
| 166 |
+
"model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 167 |
+
"model.layers.21.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
| 168 |
+
"model.layers.21.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
| 169 |
+
"model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 170 |
+
"model.layers.21.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
|
| 171 |
+
"model.layers.21.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
| 172 |
+
"model.layers.21.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
| 173 |
+
"model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 174 |
+
"model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 175 |
+
"model.layers.22.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 176 |
+
"model.layers.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 177 |
+
"model.layers.22.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 178 |
+
"model.layers.22.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 179 |
+
"model.layers.22.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
| 180 |
+
"model.layers.22.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
|
| 181 |
+
"model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 182 |
+
"model.layers.22.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
| 183 |
+
"model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 184 |
+
"model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 185 |
+
"model.layers.23.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 186 |
+
"model.layers.23.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 187 |
+
"model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 188 |
+
"model.layers.23.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 189 |
+
"model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 190 |
+
"model.layers.23.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
|
| 191 |
+
"model.layers.23.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
|
| 192 |
+
"model.layers.23.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 193 |
+
"model.layers.23.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
| 194 |
+
"model.layers.23.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 195 |
+
"model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 196 |
+
"model.layers.24.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 197 |
+
"model.layers.24.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 198 |
+
"model.layers.24.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 199 |
+
"model.layers.24.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 200 |
+
"model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 201 |
+
"model.layers.24.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
| 202 |
+
"model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 203 |
+
"model.layers.24.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
|
| 204 |
+
"model.layers.24.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
| 205 |
+
"model.layers.24.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
| 206 |
+
"model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 207 |
+
"model.layers.25.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
| 208 |
+
"model.layers.25.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
| 209 |
+
"model.layers.25.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 210 |
+
"model.layers.25.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
|
| 211 |
+
"model.layers.25.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
| 212 |
+
"model.layers.25.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
| 213 |
+
"model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 214 |
+
"model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 215 |
+
"model.layers.25.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
| 216 |
+
"model.layers.25.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 217 |
+
"model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 218 |
+
"model.layers.26.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
| 219 |
+
"model.layers.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 220 |
+
"model.layers.26.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 221 |
+
"model.layers.26.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
|
| 222 |
+
"model.layers.26.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 223 |
+
"model.layers.26.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
|
| 224 |
+
"model.layers.26.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 225 |
+
"model.layers.26.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 226 |
+
"model.layers.26.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
| 227 |
+
"model.layers.26.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
| 228 |
+
"model.layers.26.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
|
| 229 |
+
"model.layers.27.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 230 |
+
"model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 231 |
+
"model.layers.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 232 |
+
"model.layers.27.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 233 |
+
"model.layers.27.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 234 |
+
"model.layers.27.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
| 235 |
+
"model.layers.27.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 236 |
+
"model.layers.27.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 237 |
+
"model.layers.27.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
| 238 |
+
"model.layers.27.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 239 |
+
"model.layers.27.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 240 |
+
"model.layers.28.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 241 |
+
"model.layers.28.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
| 242 |
+
"model.layers.28.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 243 |
+
"model.layers.28.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 244 |
+
"model.layers.28.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 245 |
+
"model.layers.28.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
|
| 246 |
+
"model.layers.28.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
|
| 247 |
+
"model.layers.28.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 248 |
+
"model.layers.28.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
| 249 |
+
"model.layers.28.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 250 |
+
"model.layers.28.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 251 |
+
"model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 252 |
+
"model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 253 |
+
"model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 254 |
+
"model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 255 |
+
"model.layers.29.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
| 256 |
+
"model.layers.29.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
| 257 |
+
"model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 258 |
+
"model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 259 |
+
"model.layers.29.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
| 260 |
+
"model.layers.29.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 261 |
+
"model.layers.29.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 262 |
+
"model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 263 |
+
"model.layers.3.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 264 |
+
"model.layers.3.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 265 |
+
"model.layers.3.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
|
| 266 |
+
"model.layers.3.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
| 267 |
+
"model.layers.3.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
| 268 |
+
"model.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 269 |
+
"model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 270 |
+
"model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
| 271 |
+
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 272 |
+
"model.layers.3.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
|
| 273 |
+
"model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 274 |
+
"model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 275 |
+
"model.layers.30.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 276 |
+
"model.layers.30.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
|
| 277 |
+
"model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 278 |
+
"model.layers.30.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
| 279 |
+
"model.layers.30.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 280 |
+
"model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 281 |
+
"model.layers.30.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
| 282 |
+
"model.layers.30.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 283 |
+
"model.layers.30.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
|
| 284 |
+
"model.layers.31.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 285 |
+
"model.layers.31.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 286 |
+
"model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 287 |
+
"model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 288 |
+
"model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 289 |
+
"model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
| 290 |
+
"model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 291 |
+
"model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 292 |
+
"model.layers.31.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
|
| 293 |
+
"model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 294 |
+
"model.layers.31.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 295 |
+
"model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 296 |
+
"model.layers.32.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 297 |
+
"model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 298 |
+
"model.layers.32.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 299 |
+
"model.layers.32.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 300 |
+
"model.layers.32.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
| 301 |
+
"model.layers.32.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 302 |
+
"model.layers.32.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 303 |
+
"model.layers.32.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
|
| 304 |
+
"model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 305 |
+
"model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 306 |
+
"model.layers.33.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
| 307 |
+
"model.layers.33.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 308 |
+
"model.layers.33.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 309 |
+
"model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 310 |
+
"model.layers.33.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 311 |
+
"model.layers.33.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
|
| 312 |
+
"model.layers.33.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 313 |
+
"model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 314 |
+
"model.layers.33.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
|
| 315 |
+
"model.layers.33.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
| 316 |
+
"model.layers.33.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
|
| 317 |
+
"model.layers.34.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 318 |
+
"model.layers.34.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 319 |
+
"model.layers.34.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 320 |
+
"model.layers.34.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 321 |
+
"model.layers.34.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
| 322 |
+
"model.layers.34.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
|
| 323 |
+
"model.layers.34.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 324 |
+
"model.layers.34.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
|
| 325 |
+
"model.layers.34.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
| 326 |
+
"model.layers.34.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 327 |
+
"model.layers.34.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 328 |
+
"model.layers.35.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 329 |
+
"model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
| 330 |
+
"model.layers.35.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 331 |
+
"model.layers.35.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 332 |
+
"model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
| 333 |
+
"model.layers.35.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
| 334 |
+
"model.layers.35.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 335 |
+
"model.layers.35.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 336 |
+
"model.layers.35.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
| 337 |
+
"model.layers.35.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 338 |
+
"model.layers.35.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 339 |
+
"model.layers.4.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 340 |
+
"model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 341 |
+
"model.layers.4.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 342 |
+
"model.layers.4.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
|
| 343 |
+
"model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 344 |
+
"model.layers.4.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
| 345 |
+
"model.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 346 |
+
"model.layers.4.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
|
| 347 |
+
"model.layers.4.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
| 348 |
+
"model.layers.4.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 349 |
+
"model.layers.4.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 350 |
+
"model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 351 |
+
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 352 |
+
"model.layers.5.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 353 |
+
"model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 354 |
+
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 355 |
+
"model.layers.5.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
|
| 356 |
+
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 357 |
+
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 358 |
+
"model.layers.5.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
| 359 |
+
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 360 |
+
"model.layers.5.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 361 |
+
"model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 362 |
+
"model.layers.6.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
| 363 |
+
"model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 364 |
+
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 365 |
+
"model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 366 |
+
"model.layers.6.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
| 367 |
+
"model.layers.6.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
|
| 368 |
+
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 369 |
+
"model.layers.6.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
|
| 370 |
+
"model.layers.6.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
|
| 371 |
+
"model.layers.6.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
|
| 372 |
+
"model.layers.7.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 373 |
+
"model.layers.7.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 374 |
+
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 375 |
+
"model.layers.7.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
|
| 376 |
+
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 377 |
+
"model.layers.7.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
|
| 378 |
+
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 379 |
+
"model.layers.7.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 380 |
+
"model.layers.7.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
|
| 381 |
+
"model.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 382 |
+
"model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 383 |
+
"model.layers.8.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
| 384 |
+
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 385 |
+
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 386 |
+
"model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 387 |
+
"model.layers.8.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 388 |
+
"model.layers.8.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
| 389 |
+
"model.layers.8.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
|
| 390 |
+
"model.layers.8.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 391 |
+
"model.layers.8.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
| 392 |
+
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 393 |
+
"model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 394 |
+
"model.layers.9.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 395 |
+
"model.layers.9.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
| 396 |
+
"model.layers.9.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
|
| 397 |
+
"model.layers.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 398 |
+
"model.layers.9.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
| 399 |
+
"model.layers.9.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
| 400 |
+
"model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 401 |
+
"model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 402 |
+
"model.layers.9.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
| 403 |
+
"model.layers.9.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 404 |
+
"model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 405 |
+
"model.norm.weight": "model-00002-of-00004.safetensors",
|
| 406 |
+
"model.qwen2vl_vision_tower.blocks.0.attn.proj.bias": "model-00004-of-00004.safetensors",
|
| 407 |
+
"model.qwen2vl_vision_tower.blocks.0.attn.proj.weight": "model-00003-of-00004.safetensors",
|
| 408 |
+
"model.qwen2vl_vision_tower.blocks.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
|
| 409 |
+
"model.qwen2vl_vision_tower.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
|
| 410 |
+
"model.qwen2vl_vision_tower.blocks.0.mlp.fc1.bias": "model-00001-of-00004.safetensors",
|
| 411 |
+
"model.qwen2vl_vision_tower.blocks.0.mlp.fc1.weight": "model-00002-of-00004.safetensors",
|
| 412 |
+
"model.qwen2vl_vision_tower.blocks.0.mlp.fc2.bias": "model-00004-of-00004.safetensors",
|
| 413 |
+
"model.qwen2vl_vision_tower.blocks.0.mlp.fc2.weight": "model-00002-of-00004.safetensors",
|
| 414 |
+
"model.qwen2vl_vision_tower.blocks.0.norm1.bias": "model-00003-of-00004.safetensors",
|
| 415 |
+
"model.qwen2vl_vision_tower.blocks.0.norm1.weight": "model-00003-of-00004.safetensors",
|
| 416 |
+
"model.qwen2vl_vision_tower.blocks.0.norm2.bias": "model-00001-of-00004.safetensors",
|
| 417 |
+
"model.qwen2vl_vision_tower.blocks.0.norm2.weight": "model-00004-of-00004.safetensors",
|
| 418 |
+
"model.qwen2vl_vision_tower.blocks.1.attn.proj.bias": "model-00002-of-00004.safetensors",
|
| 419 |
+
"model.qwen2vl_vision_tower.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors",
|
| 420 |
+
"model.qwen2vl_vision_tower.blocks.1.attn.qkv.bias": "model-00002-of-00004.safetensors",
|
| 421 |
+
"model.qwen2vl_vision_tower.blocks.1.attn.qkv.weight": "model-00004-of-00004.safetensors",
|
| 422 |
+
"model.qwen2vl_vision_tower.blocks.1.mlp.fc1.bias": "model-00001-of-00004.safetensors",
|
| 423 |
+
"model.qwen2vl_vision_tower.blocks.1.mlp.fc1.weight": "model-00001-of-00004.safetensors",
|
| 424 |
+
"model.qwen2vl_vision_tower.blocks.1.mlp.fc2.bias": "model-00003-of-00004.safetensors",
|
| 425 |
+
"model.qwen2vl_vision_tower.blocks.1.mlp.fc2.weight": "model-00002-of-00004.safetensors",
|
| 426 |
+
"model.qwen2vl_vision_tower.blocks.1.norm1.bias": "model-00001-of-00004.safetensors",
|
| 427 |
+
"model.qwen2vl_vision_tower.blocks.1.norm1.weight": "model-00001-of-00004.safetensors",
|
| 428 |
+
"model.qwen2vl_vision_tower.blocks.1.norm2.bias": "model-00003-of-00004.safetensors",
|
| 429 |
+
"model.qwen2vl_vision_tower.blocks.1.norm2.weight": "model-00002-of-00004.safetensors",
|
| 430 |
+
"model.qwen2vl_vision_tower.blocks.10.attn.proj.bias": "model-00004-of-00004.safetensors",
|
| 431 |
+
"model.qwen2vl_vision_tower.blocks.10.attn.proj.weight": "model-00002-of-00004.safetensors",
|
| 432 |
+
"model.qwen2vl_vision_tower.blocks.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
|
| 433 |
+
"model.qwen2vl_vision_tower.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
|
| 434 |
+
"model.qwen2vl_vision_tower.blocks.10.mlp.fc1.bias": "model-00004-of-00004.safetensors",
|
| 435 |
+
"model.qwen2vl_vision_tower.blocks.10.mlp.fc1.weight": "model-00001-of-00004.safetensors",
|
| 436 |
+
"model.qwen2vl_vision_tower.blocks.10.mlp.fc2.bias": "model-00004-of-00004.safetensors",
|
| 437 |
+
"model.qwen2vl_vision_tower.blocks.10.mlp.fc2.weight": "model-00004-of-00004.safetensors",
|
| 438 |
+
"model.qwen2vl_vision_tower.blocks.10.norm1.bias": "model-00002-of-00004.safetensors",
|
| 439 |
+
"model.qwen2vl_vision_tower.blocks.10.norm1.weight": "model-00004-of-00004.safetensors",
|
| 440 |
+
"model.qwen2vl_vision_tower.blocks.10.norm2.bias": "model-00002-of-00004.safetensors",
|
| 441 |
+
"model.qwen2vl_vision_tower.blocks.10.norm2.weight": "model-00001-of-00004.safetensors",
|
| 442 |
+
"model.qwen2vl_vision_tower.blocks.11.attn.proj.bias": "model-00003-of-00004.safetensors",
|
| 443 |
+
"model.qwen2vl_vision_tower.blocks.11.attn.proj.weight": "model-00003-of-00004.safetensors",
|
| 444 |
+
"model.qwen2vl_vision_tower.blocks.11.attn.qkv.bias": "model-00004-of-00004.safetensors",
|
| 445 |
+
"model.qwen2vl_vision_tower.blocks.11.attn.qkv.weight": "model-00002-of-00004.safetensors",
|
| 446 |
+
"model.qwen2vl_vision_tower.blocks.11.mlp.fc1.bias": "model-00004-of-00004.safetensors",
|
| 447 |
+
"model.qwen2vl_vision_tower.blocks.11.mlp.fc1.weight": "model-00001-of-00004.safetensors",
|
| 448 |
+
"model.qwen2vl_vision_tower.blocks.11.mlp.fc2.bias": "model-00002-of-00004.safetensors",
|
| 449 |
+
"model.qwen2vl_vision_tower.blocks.11.mlp.fc2.weight": "model-00002-of-00004.safetensors",
|
| 450 |
+
"model.qwen2vl_vision_tower.blocks.11.norm1.bias": "model-00002-of-00004.safetensors",
|
| 451 |
+
"model.qwen2vl_vision_tower.blocks.11.norm1.weight": "model-00003-of-00004.safetensors",
|
| 452 |
+
"model.qwen2vl_vision_tower.blocks.11.norm2.bias": "model-00001-of-00004.safetensors",
|
| 453 |
+
"model.qwen2vl_vision_tower.blocks.11.norm2.weight": "model-00001-of-00004.safetensors",
|
| 454 |
+
"model.qwen2vl_vision_tower.blocks.12.attn.proj.bias": "model-00002-of-00004.safetensors",
|
| 455 |
+
"model.qwen2vl_vision_tower.blocks.12.attn.proj.weight": "model-00002-of-00004.safetensors",
|
| 456 |
+
"model.qwen2vl_vision_tower.blocks.12.attn.qkv.bias": "model-00002-of-00004.safetensors",
|
| 457 |
+
"model.qwen2vl_vision_tower.blocks.12.attn.qkv.weight": "model-00004-of-00004.safetensors",
|
| 458 |
+
"model.qwen2vl_vision_tower.blocks.12.mlp.fc1.bias": "model-00002-of-00004.safetensors",
|
| 459 |
+
"model.qwen2vl_vision_tower.blocks.12.mlp.fc1.weight": "model-00003-of-00004.safetensors",
|
| 460 |
+
"model.qwen2vl_vision_tower.blocks.12.mlp.fc2.bias": "model-00002-of-00004.safetensors",
|
| 461 |
+
"model.qwen2vl_vision_tower.blocks.12.mlp.fc2.weight": "model-00004-of-00004.safetensors",
|
| 462 |
+
"model.qwen2vl_vision_tower.blocks.12.norm1.bias": "model-00003-of-00004.safetensors",
|
| 463 |
+
"model.qwen2vl_vision_tower.blocks.12.norm1.weight": "model-00001-of-00004.safetensors",
|
| 464 |
+
"model.qwen2vl_vision_tower.blocks.12.norm2.bias": "model-00001-of-00004.safetensors",
|
| 465 |
+
"model.qwen2vl_vision_tower.blocks.12.norm2.weight": "model-00002-of-00004.safetensors",
|
| 466 |
+
"model.qwen2vl_vision_tower.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors",
|
| 467 |
+
"model.qwen2vl_vision_tower.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors",
|
| 468 |
+
"model.qwen2vl_vision_tower.blocks.13.attn.qkv.bias": "model-00002-of-00004.safetensors",
|
| 469 |
+
"model.qwen2vl_vision_tower.blocks.13.attn.qkv.weight": "model-00002-of-00004.safetensors",
|
| 470 |
+
"model.qwen2vl_vision_tower.blocks.13.mlp.fc1.bias": "model-00002-of-00004.safetensors",
|
| 471 |
+
"model.qwen2vl_vision_tower.blocks.13.mlp.fc1.weight": "model-00002-of-00004.safetensors",
|
| 472 |
+
"model.qwen2vl_vision_tower.blocks.13.mlp.fc2.bias": "model-00003-of-00004.safetensors",
|
| 473 |
+
"model.qwen2vl_vision_tower.blocks.13.mlp.fc2.weight": "model-00004-of-00004.safetensors",
|
| 474 |
+
"model.qwen2vl_vision_tower.blocks.13.norm1.bias": "model-00004-of-00004.safetensors",
|
| 475 |
+
"model.qwen2vl_vision_tower.blocks.13.norm1.weight": "model-00004-of-00004.safetensors",
|
| 476 |
+
"model.qwen2vl_vision_tower.blocks.13.norm2.bias": "model-00004-of-00004.safetensors",
|
| 477 |
+
"model.qwen2vl_vision_tower.blocks.13.norm2.weight": "model-00001-of-00004.safetensors",
|
| 478 |
+
"model.qwen2vl_vision_tower.blocks.14.attn.proj.bias": "model-00002-of-00004.safetensors",
|
| 479 |
+
"model.qwen2vl_vision_tower.blocks.14.attn.proj.weight": "model-00003-of-00004.safetensors",
|
| 480 |
+
"model.qwen2vl_vision_tower.blocks.14.attn.qkv.bias": "model-00002-of-00004.safetensors",
|
| 481 |
+
"model.qwen2vl_vision_tower.blocks.14.attn.qkv.weight": "model-00004-of-00004.safetensors",
|
| 482 |
+
"model.qwen2vl_vision_tower.blocks.14.mlp.fc1.bias": "model-00004-of-00004.safetensors",
|
| 483 |
+
"model.qwen2vl_vision_tower.blocks.14.mlp.fc1.weight": "model-00001-of-00004.safetensors",
|
| 484 |
+
"model.qwen2vl_vision_tower.blocks.14.mlp.fc2.bias": "model-00001-of-00004.safetensors",
|
| 485 |
+
"model.qwen2vl_vision_tower.blocks.14.mlp.fc2.weight": "model-00004-of-00004.safetensors",
|
| 486 |
+
"model.qwen2vl_vision_tower.blocks.14.norm1.bias": "model-00004-of-00004.safetensors",
|
| 487 |
+
"model.qwen2vl_vision_tower.blocks.14.norm1.weight": "model-00002-of-00004.safetensors",
|
| 488 |
+
"model.qwen2vl_vision_tower.blocks.14.norm2.bias": "model-00003-of-00004.safetensors",
|
| 489 |
+
"model.qwen2vl_vision_tower.blocks.14.norm2.weight": "model-00001-of-00004.safetensors",
|
| 490 |
+
"model.qwen2vl_vision_tower.blocks.15.attn.proj.bias": "model-00004-of-00004.safetensors",
|
| 491 |
+
"model.qwen2vl_vision_tower.blocks.15.attn.proj.weight": "model-00002-of-00004.safetensors",
|
| 492 |
+
"model.qwen2vl_vision_tower.blocks.15.attn.qkv.bias": "model-00003-of-00004.safetensors",
|
| 493 |
+
"model.qwen2vl_vision_tower.blocks.15.attn.qkv.weight": "model-00002-of-00004.safetensors",
|
| 494 |
+
"model.qwen2vl_vision_tower.blocks.15.mlp.fc1.bias": "model-00001-of-00004.safetensors",
|
| 495 |
+
"model.qwen2vl_vision_tower.blocks.15.mlp.fc1.weight": "model-00004-of-00004.safetensors",
|
| 496 |
+
"model.qwen2vl_vision_tower.blocks.15.mlp.fc2.bias": "model-00004-of-00004.safetensors",
|
| 497 |
+
"model.qwen2vl_vision_tower.blocks.15.mlp.fc2.weight": "model-00003-of-00004.safetensors",
|
| 498 |
+
"model.qwen2vl_vision_tower.blocks.15.norm1.bias": "model-00002-of-00004.safetensors",
|
| 499 |
+
"model.qwen2vl_vision_tower.blocks.15.norm1.weight": "model-00003-of-00004.safetensors",
|
| 500 |
+
"model.qwen2vl_vision_tower.blocks.15.norm2.bias": "model-00004-of-00004.safetensors",
|
| 501 |
+
"model.qwen2vl_vision_tower.blocks.15.norm2.weight": "model-00002-of-00004.safetensors",
|
| 502 |
+
"model.qwen2vl_vision_tower.blocks.16.attn.proj.bias": "model-00002-of-00004.safetensors",
|
| 503 |
+
"model.qwen2vl_vision_tower.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors",
|
| 504 |
+
"model.qwen2vl_vision_tower.blocks.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
|
| 505 |
+
"model.qwen2vl_vision_tower.blocks.16.attn.qkv.weight": "model-00003-of-00004.safetensors",
|
| 506 |
+
"model.qwen2vl_vision_tower.blocks.16.mlp.fc1.bias": "model-00004-of-00004.safetensors",
|
| 507 |
+
"model.qwen2vl_vision_tower.blocks.16.mlp.fc1.weight": "model-00004-of-00004.safetensors",
|
| 508 |
+
"model.qwen2vl_vision_tower.blocks.16.mlp.fc2.bias": "model-00001-of-00004.safetensors",
|
| 509 |
+
"model.qwen2vl_vision_tower.blocks.16.mlp.fc2.weight": "model-00003-of-00004.safetensors",
|
| 510 |
+
"model.qwen2vl_vision_tower.blocks.16.norm1.bias": "model-00003-of-00004.safetensors",
|
| 511 |
+
"model.qwen2vl_vision_tower.blocks.16.norm1.weight": "model-00002-of-00004.safetensors",
|
| 512 |
+
"model.qwen2vl_vision_tower.blocks.16.norm2.bias": "model-00001-of-00004.safetensors",
|
| 513 |
+
"model.qwen2vl_vision_tower.blocks.16.norm2.weight": "model-00003-of-00004.safetensors",
|
| 514 |
+
"model.qwen2vl_vision_tower.blocks.17.attn.proj.bias": "model-00002-of-00004.safetensors",
|
| 515 |
+
"model.qwen2vl_vision_tower.blocks.17.attn.proj.weight": "model-00004-of-00004.safetensors",
|
| 516 |
+
"model.qwen2vl_vision_tower.blocks.17.attn.qkv.bias": "model-00003-of-00004.safetensors",
|
| 517 |
+
"model.qwen2vl_vision_tower.blocks.17.attn.qkv.weight": "model-00003-of-00004.safetensors",
|
| 518 |
+
"model.qwen2vl_vision_tower.blocks.17.mlp.fc1.bias": "model-00004-of-00004.safetensors",
|
| 519 |
+
"model.qwen2vl_vision_tower.blocks.17.mlp.fc1.weight": "model-00002-of-00004.safetensors",
|
| 520 |
+
"model.qwen2vl_vision_tower.blocks.17.mlp.fc2.bias": "model-00003-of-00004.safetensors",
|
| 521 |
+
"model.qwen2vl_vision_tower.blocks.17.mlp.fc2.weight": "model-00003-of-00004.safetensors",
|
| 522 |
+
"model.qwen2vl_vision_tower.blocks.17.norm1.bias": "model-00004-of-00004.safetensors",
|
| 523 |
+
"model.qwen2vl_vision_tower.blocks.17.norm1.weight": "model-00003-of-00004.safetensors",
|
| 524 |
+
"model.qwen2vl_vision_tower.blocks.17.norm2.bias": "model-00001-of-00004.safetensors",
|
| 525 |
+
"model.qwen2vl_vision_tower.blocks.17.norm2.weight": "model-00003-of-00004.safetensors",
|
| 526 |
+
"model.qwen2vl_vision_tower.blocks.18.attn.proj.bias": "model-00002-of-00004.safetensors",
|
| 527 |
+
"model.qwen2vl_vision_tower.blocks.18.attn.proj.weight": "model-00002-of-00004.safetensors",
|
| 528 |
+
"model.qwen2vl_vision_tower.blocks.18.attn.qkv.bias": "model-00003-of-00004.safetensors",
|
| 529 |
+
"model.qwen2vl_vision_tower.blocks.18.attn.qkv.weight": "model-00002-of-00004.safetensors",
|
| 530 |
+
"model.qwen2vl_vision_tower.blocks.18.mlp.fc1.bias": "model-00004-of-00004.safetensors",
|
| 531 |
+
"model.qwen2vl_vision_tower.blocks.18.mlp.fc1.weight": "model-00002-of-00004.safetensors",
|
| 532 |
+
"model.qwen2vl_vision_tower.blocks.18.mlp.fc2.bias": "model-00004-of-00004.safetensors",
|
| 533 |
+
"model.qwen2vl_vision_tower.blocks.18.mlp.fc2.weight": "model-00002-of-00004.safetensors",
|
| 534 |
+
"model.qwen2vl_vision_tower.blocks.18.norm1.bias": "model-00001-of-00004.safetensors",
|
| 535 |
+
"model.qwen2vl_vision_tower.blocks.18.norm1.weight": "model-00002-of-00004.safetensors",
|
| 536 |
+
"model.qwen2vl_vision_tower.blocks.18.norm2.bias": "model-00004-of-00004.safetensors",
|
| 537 |
+
"model.qwen2vl_vision_tower.blocks.18.norm2.weight": "model-00004-of-00004.safetensors",
|
| 538 |
+
"model.qwen2vl_vision_tower.blocks.19.attn.proj.bias": "model-00004-of-00004.safetensors",
|
| 539 |
+
"model.qwen2vl_vision_tower.blocks.19.attn.proj.weight": "model-00002-of-00004.safetensors",
|
| 540 |
+
"model.qwen2vl_vision_tower.blocks.19.attn.qkv.bias": "model-00004-of-00004.safetensors",
|
| 541 |
+
"model.qwen2vl_vision_tower.blocks.19.attn.qkv.weight": "model-00004-of-00004.safetensors",
|
| 542 |
+
"model.qwen2vl_vision_tower.blocks.19.mlp.fc1.bias": "model-00001-of-00004.safetensors",
|
| 543 |
+
"model.qwen2vl_vision_tower.blocks.19.mlp.fc1.weight": "model-00003-of-00004.safetensors",
|
| 544 |
+
"model.qwen2vl_vision_tower.blocks.19.mlp.fc2.bias": "model-00003-of-00004.safetensors",
|
| 545 |
+
"model.qwen2vl_vision_tower.blocks.19.mlp.fc2.weight": "model-00003-of-00004.safetensors",
|
| 546 |
+
"model.qwen2vl_vision_tower.blocks.19.norm1.bias": "model-00003-of-00004.safetensors",
|
| 547 |
+
"model.qwen2vl_vision_tower.blocks.19.norm1.weight": "model-00004-of-00004.safetensors",
|
| 548 |
+
"model.qwen2vl_vision_tower.blocks.19.norm2.bias": "model-00004-of-00004.safetensors",
|
| 549 |
+
"model.qwen2vl_vision_tower.blocks.19.norm2.weight": "model-00002-of-00004.safetensors",
|
| 550 |
+
"model.qwen2vl_vision_tower.blocks.2.attn.proj.bias": "model-00002-of-00004.safetensors",
|
| 551 |
+
"model.qwen2vl_vision_tower.blocks.2.attn.proj.weight": "model-00003-of-00004.safetensors",
|
| 552 |
+
"model.qwen2vl_vision_tower.blocks.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
|
| 553 |
+
"model.qwen2vl_vision_tower.blocks.2.attn.qkv.weight": "model-00002-of-00004.safetensors",
|
| 554 |
+
"model.qwen2vl_vision_tower.blocks.2.mlp.fc1.bias": "model-00002-of-00004.safetensors",
|
| 555 |
+
"model.qwen2vl_vision_tower.blocks.2.mlp.fc1.weight": "model-00004-of-00004.safetensors",
|
| 556 |
+
"model.qwen2vl_vision_tower.blocks.2.mlp.fc2.bias": "model-00002-of-00004.safetensors",
|
| 557 |
+
"model.qwen2vl_vision_tower.blocks.2.mlp.fc2.weight": "model-00004-of-00004.safetensors",
|
| 558 |
+
"model.qwen2vl_vision_tower.blocks.2.norm1.bias": "model-00001-of-00004.safetensors",
|
| 559 |
+
"model.qwen2vl_vision_tower.blocks.2.norm1.weight": "model-00002-of-00004.safetensors",
|
| 560 |
+
"model.qwen2vl_vision_tower.blocks.2.norm2.bias": "model-00002-of-00004.safetensors",
|
| 561 |
+
"model.qwen2vl_vision_tower.blocks.2.norm2.weight": "model-00001-of-00004.safetensors",
|
| 562 |
+
"model.qwen2vl_vision_tower.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors",
|
| 563 |
+
"model.qwen2vl_vision_tower.blocks.20.attn.proj.weight": "model-00003-of-00004.safetensors",
|
| 564 |
+
"model.qwen2vl_vision_tower.blocks.20.attn.qkv.bias": "model-00003-of-00004.safetensors",
|
| 565 |
+
"model.qwen2vl_vision_tower.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
|
| 566 |
+
"model.qwen2vl_vision_tower.blocks.20.mlp.fc1.bias": "model-00004-of-00004.safetensors",
|
| 567 |
+
"model.qwen2vl_vision_tower.blocks.20.mlp.fc1.weight": "model-00001-of-00004.safetensors",
|
| 568 |
+
"model.qwen2vl_vision_tower.blocks.20.mlp.fc2.bias": "model-00004-of-00004.safetensors",
|
| 569 |
+
"model.qwen2vl_vision_tower.blocks.20.mlp.fc2.weight": "model-00003-of-00004.safetensors",
|
| 570 |
+
"model.qwen2vl_vision_tower.blocks.20.norm1.bias": "model-00001-of-00004.safetensors",
|
| 571 |
+
"model.qwen2vl_vision_tower.blocks.20.norm1.weight": "model-00003-of-00004.safetensors",
|
| 572 |
+
"model.qwen2vl_vision_tower.blocks.20.norm2.bias": "model-00003-of-00004.safetensors",
|
| 573 |
+
"model.qwen2vl_vision_tower.blocks.20.norm2.weight": "model-00002-of-00004.safetensors",
|
| 574 |
+
"model.qwen2vl_vision_tower.blocks.21.attn.proj.bias": "model-00003-of-00004.safetensors",
|
| 575 |
+
"model.qwen2vl_vision_tower.blocks.21.attn.proj.weight": "model-00003-of-00004.safetensors",
|
| 576 |
+
"model.qwen2vl_vision_tower.blocks.21.attn.qkv.bias": "model-00003-of-00004.safetensors",
|
| 577 |
+
"model.qwen2vl_vision_tower.blocks.21.attn.qkv.weight": "model-00002-of-00004.safetensors",
|
| 578 |
+
"model.qwen2vl_vision_tower.blocks.21.mlp.fc1.bias": "model-00001-of-00004.safetensors",
|
| 579 |
+
"model.qwen2vl_vision_tower.blocks.21.mlp.fc1.weight": "model-00003-of-00004.safetensors",
|
| 580 |
+
"model.qwen2vl_vision_tower.blocks.21.mlp.fc2.bias": "model-00002-of-00004.safetensors",
|
| 581 |
+
"model.qwen2vl_vision_tower.blocks.21.mlp.fc2.weight": "model-00004-of-00004.safetensors",
|
| 582 |
+
"model.qwen2vl_vision_tower.blocks.21.norm1.bias": "model-00002-of-00004.safetensors",
|
| 583 |
+
"model.qwen2vl_vision_tower.blocks.21.norm1.weight": "model-00004-of-00004.safetensors",
|
| 584 |
+
"model.qwen2vl_vision_tower.blocks.21.norm2.bias": "model-00002-of-00004.safetensors",
|
| 585 |
+
"model.qwen2vl_vision_tower.blocks.21.norm2.weight": "model-00004-of-00004.safetensors",
|
| 586 |
+
"model.qwen2vl_vision_tower.blocks.22.attn.proj.bias": "model-00004-of-00004.safetensors",
|
| 587 |
+
"model.qwen2vl_vision_tower.blocks.22.attn.proj.weight": "model-00003-of-00004.safetensors",
|
| 588 |
+
"model.qwen2vl_vision_tower.blocks.22.attn.qkv.bias": "model-00002-of-00004.safetensors",
|
| 589 |
+
"model.qwen2vl_vision_tower.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
|
| 590 |
+
"model.qwen2vl_vision_tower.blocks.22.mlp.fc1.bias": "model-00002-of-00004.safetensors",
|
| 591 |
+
"model.qwen2vl_vision_tower.blocks.22.mlp.fc1.weight": "model-00002-of-00004.safetensors",
|
| 592 |
+
"model.qwen2vl_vision_tower.blocks.22.mlp.fc2.bias": "model-00002-of-00004.safetensors",
|
| 593 |
+
"model.qwen2vl_vision_tower.blocks.22.mlp.fc2.weight": "model-00002-of-00004.safetensors",
|
| 594 |
+
"model.qwen2vl_vision_tower.blocks.22.norm1.bias": "model-00004-of-00004.safetensors",
|
| 595 |
+
"model.qwen2vl_vision_tower.blocks.22.norm1.weight": "model-00002-of-00004.safetensors",
|
| 596 |
+
"model.qwen2vl_vision_tower.blocks.22.norm2.bias": "model-00003-of-00004.safetensors",
|
| 597 |
+
"model.qwen2vl_vision_tower.blocks.22.norm2.weight": "model-00002-of-00004.safetensors",
|
| 598 |
+
"model.qwen2vl_vision_tower.blocks.23.attn.proj.bias": "model-00003-of-00004.safetensors",
|
| 599 |
+
"model.qwen2vl_vision_tower.blocks.23.attn.proj.weight": "model-00004-of-00004.safetensors",
|
| 600 |
+
"model.qwen2vl_vision_tower.blocks.23.attn.qkv.bias": "model-00003-of-00004.safetensors",
|
| 601 |
+
"model.qwen2vl_vision_tower.blocks.23.attn.qkv.weight": "model-00003-of-00004.safetensors",
|
| 602 |
+
"model.qwen2vl_vision_tower.blocks.23.mlp.fc1.bias": "model-00003-of-00004.safetensors",
|
| 603 |
+
"model.qwen2vl_vision_tower.blocks.23.mlp.fc1.weight": "model-00002-of-00004.safetensors",
|
| 604 |
+
"model.qwen2vl_vision_tower.blocks.23.mlp.fc2.bias": "model-00003-of-00004.safetensors",
|
| 605 |
+
"model.qwen2vl_vision_tower.blocks.23.mlp.fc2.weight": "model-00001-of-00004.safetensors",
|
| 606 |
+
"model.qwen2vl_vision_tower.blocks.23.norm1.bias": "model-00002-of-00004.safetensors",
|
| 607 |
+
"model.qwen2vl_vision_tower.blocks.23.norm1.weight": "model-00002-of-00004.safetensors",
|
| 608 |
+
"model.qwen2vl_vision_tower.blocks.23.norm2.bias": "model-00002-of-00004.safetensors",
|
| 609 |
+
"model.qwen2vl_vision_tower.blocks.23.norm2.weight": "model-00002-of-00004.safetensors",
|
| 610 |
+
"model.qwen2vl_vision_tower.blocks.24.attn.proj.bias": "model-00001-of-00004.safetensors",
|
| 611 |
+
"model.qwen2vl_vision_tower.blocks.24.attn.proj.weight": "model-00004-of-00004.safetensors",
|
| 612 |
+
"model.qwen2vl_vision_tower.blocks.24.attn.qkv.bias": "model-00002-of-00004.safetensors",
|
| 613 |
+
"model.qwen2vl_vision_tower.blocks.24.attn.qkv.weight": "model-00002-of-00004.safetensors",
|
| 614 |
+
"model.qwen2vl_vision_tower.blocks.24.mlp.fc1.bias": "model-00002-of-00004.safetensors",
|
| 615 |
+
"model.qwen2vl_vision_tower.blocks.24.mlp.fc1.weight": "model-00004-of-00004.safetensors",
|
| 616 |
+
"model.qwen2vl_vision_tower.blocks.24.mlp.fc2.bias": "model-00002-of-00004.safetensors",
|
| 617 |
+
"model.qwen2vl_vision_tower.blocks.24.mlp.fc2.weight": "model-00002-of-00004.safetensors",
|
| 618 |
+
"model.qwen2vl_vision_tower.blocks.24.norm1.bias": "model-00001-of-00004.safetensors",
|
| 619 |
+
"model.qwen2vl_vision_tower.blocks.24.norm1.weight": "model-00003-of-00004.safetensors",
|
| 620 |
+
"model.qwen2vl_vision_tower.blocks.24.norm2.bias": "model-00003-of-00004.safetensors",
|
| 621 |
+
"model.qwen2vl_vision_tower.blocks.24.norm2.weight": "model-00002-of-00004.safetensors",
|
| 622 |
+
"model.qwen2vl_vision_tower.blocks.25.attn.proj.bias": "model-00003-of-00004.safetensors",
|
| 623 |
+
"model.qwen2vl_vision_tower.blocks.25.attn.proj.weight": "model-00004-of-00004.safetensors",
|
| 624 |
+
"model.qwen2vl_vision_tower.blocks.25.attn.qkv.bias": "model-00004-of-00004.safetensors",
|
| 625 |
+
"model.qwen2vl_vision_tower.blocks.25.attn.qkv.weight": "model-00002-of-00004.safetensors",
|
| 626 |
+
"model.qwen2vl_vision_tower.blocks.25.mlp.fc1.bias": "model-00003-of-00004.safetensors",
|
| 627 |
+
"model.qwen2vl_vision_tower.blocks.25.mlp.fc1.weight": "model-00003-of-00004.safetensors",
|
| 628 |
+
"model.qwen2vl_vision_tower.blocks.25.mlp.fc2.bias": "model-00004-of-00004.safetensors",
|
| 629 |
+
"model.qwen2vl_vision_tower.blocks.25.mlp.fc2.weight": "model-00002-of-00004.safetensors",
|
| 630 |
+
"model.qwen2vl_vision_tower.blocks.25.norm1.bias": "model-00004-of-00004.safetensors",
|
| 631 |
+
"model.qwen2vl_vision_tower.blocks.25.norm1.weight": "model-00003-of-00004.safetensors",
|
| 632 |
+
"model.qwen2vl_vision_tower.blocks.25.norm2.bias": "model-00004-of-00004.safetensors",
|
| 633 |
+
"model.qwen2vl_vision_tower.blocks.25.norm2.weight": "model-00001-of-00004.safetensors",
|
| 634 |
+
"model.qwen2vl_vision_tower.blocks.26.attn.proj.bias": "model-00001-of-00004.safetensors",
|
| 635 |
+
"model.qwen2vl_vision_tower.blocks.26.attn.proj.weight": "model-00001-of-00004.safetensors",
|
| 636 |
+
"model.qwen2vl_vision_tower.blocks.26.attn.qkv.bias": "model-00001-of-00004.safetensors",
|
| 637 |
+
"model.qwen2vl_vision_tower.blocks.26.attn.qkv.weight": "model-00003-of-00004.safetensors",
|
| 638 |
+
"model.qwen2vl_vision_tower.blocks.26.mlp.fc1.bias": "model-00003-of-00004.safetensors",
|
| 639 |
+
"model.qwen2vl_vision_tower.blocks.26.mlp.fc1.weight": "model-00001-of-00004.safetensors",
|
| 640 |
+
"model.qwen2vl_vision_tower.blocks.26.mlp.fc2.bias": "model-00003-of-00004.safetensors",
|
| 641 |
+
"model.qwen2vl_vision_tower.blocks.26.mlp.fc2.weight": "model-00001-of-00004.safetensors",
|
| 642 |
+
"model.qwen2vl_vision_tower.blocks.26.norm1.bias": "model-00003-of-00004.safetensors",
|
| 643 |
+
"model.qwen2vl_vision_tower.blocks.26.norm1.weight": "model-00002-of-00004.safetensors",
|
| 644 |
+
"model.qwen2vl_vision_tower.blocks.26.norm2.bias": "model-00002-of-00004.safetensors",
|
| 645 |
+
"model.qwen2vl_vision_tower.blocks.26.norm2.weight": "model-00002-of-00004.safetensors",
|
| 646 |
+
"model.qwen2vl_vision_tower.blocks.27.attn.proj.bias": "model-00001-of-00004.safetensors",
|
| 647 |
+
"model.qwen2vl_vision_tower.blocks.27.attn.proj.weight": "model-00001-of-00004.safetensors",
|
| 648 |
+
"model.qwen2vl_vision_tower.blocks.27.attn.qkv.bias": "model-00002-of-00004.safetensors",
|
| 649 |
+
"model.qwen2vl_vision_tower.blocks.27.attn.qkv.weight": "model-00003-of-00004.safetensors",
|
| 650 |
+
"model.qwen2vl_vision_tower.blocks.27.mlp.fc1.bias": "model-00002-of-00004.safetensors",
|
| 651 |
+
"model.qwen2vl_vision_tower.blocks.27.mlp.fc1.weight": "model-00002-of-00004.safetensors",
|
| 652 |
+
"model.qwen2vl_vision_tower.blocks.27.mlp.fc2.bias": "model-00004-of-00004.safetensors",
|
| 653 |
+
"model.qwen2vl_vision_tower.blocks.27.mlp.fc2.weight": "model-00003-of-00004.safetensors",
|
| 654 |
+
"model.qwen2vl_vision_tower.blocks.27.norm1.bias": "model-00002-of-00004.safetensors",
|
| 655 |
+
"model.qwen2vl_vision_tower.blocks.27.norm1.weight": "model-00002-of-00004.safetensors",
|
| 656 |
+
"model.qwen2vl_vision_tower.blocks.27.norm2.bias": "model-00003-of-00004.safetensors",
|
| 657 |
+
"model.qwen2vl_vision_tower.blocks.27.norm2.weight": "model-00003-of-00004.safetensors",
|
| 658 |
+
"model.qwen2vl_vision_tower.blocks.28.attn.proj.bias": "model-00003-of-00004.safetensors",
|
| 659 |
+
"model.qwen2vl_vision_tower.blocks.28.attn.proj.weight": "model-00004-of-00004.safetensors",
|
| 660 |
+
"model.qwen2vl_vision_tower.blocks.28.attn.qkv.bias": "model-00003-of-00004.safetensors",
|
| 661 |
+
"model.qwen2vl_vision_tower.blocks.28.attn.qkv.weight": "model-00004-of-00004.safetensors",
|
| 662 |
+
"model.qwen2vl_vision_tower.blocks.28.mlp.fc1.bias": "model-00004-of-00004.safetensors",
|
| 663 |
+
"model.qwen2vl_vision_tower.blocks.28.mlp.fc1.weight": "model-00002-of-00004.safetensors",
|
| 664 |
+
"model.qwen2vl_vision_tower.blocks.28.mlp.fc2.bias": "model-00003-of-00004.safetensors",
|
| 665 |
+
"model.qwen2vl_vision_tower.blocks.28.mlp.fc2.weight": "model-00002-of-00004.safetensors",
|
| 666 |
+
"model.qwen2vl_vision_tower.blocks.28.norm1.bias": "model-00003-of-00004.safetensors",
|
| 667 |
+
"model.qwen2vl_vision_tower.blocks.28.norm1.weight": "model-00004-of-00004.safetensors",
|
| 668 |
+
"model.qwen2vl_vision_tower.blocks.28.norm2.bias": "model-00001-of-00004.safetensors",
|
| 669 |
+
"model.qwen2vl_vision_tower.blocks.28.norm2.weight": "model-00003-of-00004.safetensors",
|
| 670 |
+
"model.qwen2vl_vision_tower.blocks.29.attn.proj.bias": "model-00002-of-00004.safetensors",
|
| 671 |
+
"model.qwen2vl_vision_tower.blocks.29.attn.proj.weight": "model-00004-of-00004.safetensors",
|
| 672 |
+
"model.qwen2vl_vision_tower.blocks.29.attn.qkv.bias": "model-00003-of-00004.safetensors",
|
| 673 |
+
"model.qwen2vl_vision_tower.blocks.29.attn.qkv.weight": "model-00003-of-00004.safetensors",
|
| 674 |
+
"model.qwen2vl_vision_tower.blocks.29.mlp.fc1.bias": "model-00003-of-00004.safetensors",
|
| 675 |
+
"model.qwen2vl_vision_tower.blocks.29.mlp.fc1.weight": "model-00001-of-00004.safetensors",
|
| 676 |
+
"model.qwen2vl_vision_tower.blocks.29.mlp.fc2.bias": "model-00001-of-00004.safetensors",
|
| 677 |
+
"model.qwen2vl_vision_tower.blocks.29.mlp.fc2.weight": "model-00002-of-00004.safetensors",
|
| 678 |
+
"model.qwen2vl_vision_tower.blocks.29.norm1.bias": "model-00004-of-00004.safetensors",
|
| 679 |
+
"model.qwen2vl_vision_tower.blocks.29.norm1.weight": "model-00003-of-00004.safetensors",
|
| 680 |
+
"model.qwen2vl_vision_tower.blocks.29.norm2.bias": "model-00002-of-00004.safetensors",
|
| 681 |
+
"model.qwen2vl_vision_tower.blocks.29.norm2.weight": "model-00001-of-00004.safetensors",
|
| 682 |
+
"model.qwen2vl_vision_tower.blocks.3.attn.proj.bias": "model-00002-of-00004.safetensors",
|
| 683 |
+
"model.qwen2vl_vision_tower.blocks.3.attn.proj.weight": "model-00003-of-00004.safetensors",
|
| 684 |
+
"model.qwen2vl_vision_tower.blocks.3.attn.qkv.bias": "model-00002-of-00004.safetensors",
|
| 685 |
+
"model.qwen2vl_vision_tower.blocks.3.attn.qkv.weight": "model-00003-of-00004.safetensors",
|
| 686 |
+
"model.qwen2vl_vision_tower.blocks.3.mlp.fc1.bias": "model-00001-of-00004.safetensors",
|
| 687 |
+
"model.qwen2vl_vision_tower.blocks.3.mlp.fc1.weight": "model-00004-of-00004.safetensors",
|
| 688 |
+
"model.qwen2vl_vision_tower.blocks.3.mlp.fc2.bias": "model-00003-of-00004.safetensors",
|
| 689 |
+
"model.qwen2vl_vision_tower.blocks.3.mlp.fc2.weight": "model-00002-of-00004.safetensors",
|
| 690 |
+
"model.qwen2vl_vision_tower.blocks.3.norm1.bias": "model-00003-of-00004.safetensors",
|
| 691 |
+
"model.qwen2vl_vision_tower.blocks.3.norm1.weight": "model-00003-of-00004.safetensors",
|
| 692 |
+
"model.qwen2vl_vision_tower.blocks.3.norm2.bias": "model-00002-of-00004.safetensors",
|
| 693 |
+
"model.qwen2vl_vision_tower.blocks.3.norm2.weight": "model-00001-of-00004.safetensors",
|
| 694 |
+
"model.qwen2vl_vision_tower.blocks.30.attn.proj.bias": "model-00002-of-00004.safetensors",
|
| 695 |
+
"model.qwen2vl_vision_tower.blocks.30.attn.proj.weight": "model-00004-of-00004.safetensors",
|
| 696 |
+
"model.qwen2vl_vision_tower.blocks.30.attn.qkv.bias": "model-00002-of-00004.safetensors",
|
| 697 |
+
"model.qwen2vl_vision_tower.blocks.30.attn.qkv.weight": "model-00002-of-00004.safetensors",
|
| 698 |
+
"model.qwen2vl_vision_tower.blocks.30.mlp.fc1.bias": "model-00002-of-00004.safetensors",
|
| 699 |
+
"model.qwen2vl_vision_tower.blocks.30.mlp.fc1.weight": "model-00003-of-00004.safetensors",
|
| 700 |
+
"model.qwen2vl_vision_tower.blocks.30.mlp.fc2.bias": "model-00001-of-00004.safetensors",
|
| 701 |
+
"model.qwen2vl_vision_tower.blocks.30.mlp.fc2.weight": "model-00004-of-00004.safetensors",
|
| 702 |
+
"model.qwen2vl_vision_tower.blocks.30.norm1.bias": "model-00003-of-00004.safetensors",
|
| 703 |
+
"model.qwen2vl_vision_tower.blocks.30.norm1.weight": "model-00003-of-00004.safetensors",
|
| 704 |
+
"model.qwen2vl_vision_tower.blocks.30.norm2.bias": "model-00004-of-00004.safetensors",
|
| 705 |
+
"model.qwen2vl_vision_tower.blocks.30.norm2.weight": "model-00003-of-00004.safetensors",
|
| 706 |
+
"model.qwen2vl_vision_tower.blocks.31.attn.proj.bias": "model-00003-of-00004.safetensors",
|
| 707 |
+
"model.qwen2vl_vision_tower.blocks.31.attn.proj.weight": "model-00002-of-00004.safetensors",
|
| 708 |
+
"model.qwen2vl_vision_tower.blocks.31.attn.qkv.bias": "model-00004-of-00004.safetensors",
|
| 709 |
+
"model.qwen2vl_vision_tower.blocks.31.attn.qkv.weight": "model-00004-of-00004.safetensors",
|
| 710 |
+
"model.qwen2vl_vision_tower.blocks.31.mlp.fc1.bias": "model-00004-of-00004.safetensors",
|
| 711 |
+
"model.qwen2vl_vision_tower.blocks.31.mlp.fc1.weight": "model-00001-of-00004.safetensors",
|
| 712 |
+
"model.qwen2vl_vision_tower.blocks.31.mlp.fc2.bias": "model-00003-of-00004.safetensors",
|
| 713 |
+
"model.qwen2vl_vision_tower.blocks.31.mlp.fc2.weight": "model-00003-of-00004.safetensors",
|
| 714 |
+
"model.qwen2vl_vision_tower.blocks.31.norm1.bias": "model-00004-of-00004.safetensors",
|
| 715 |
+
"model.qwen2vl_vision_tower.blocks.31.norm1.weight": "model-00004-of-00004.safetensors",
|
| 716 |
+
"model.qwen2vl_vision_tower.blocks.31.norm2.bias": "model-00003-of-00004.safetensors",
|
| 717 |
+
"model.qwen2vl_vision_tower.blocks.31.norm2.weight": "model-00004-of-00004.safetensors",
|
| 718 |
+
"model.qwen2vl_vision_tower.blocks.4.attn.proj.bias": "model-00002-of-00004.safetensors",
|
| 719 |
+
"model.qwen2vl_vision_tower.blocks.4.attn.proj.weight": "model-00004-of-00004.safetensors",
|
| 720 |
+
"model.qwen2vl_vision_tower.blocks.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
|
| 721 |
+
"model.qwen2vl_vision_tower.blocks.4.attn.qkv.weight": "model-00003-of-00004.safetensors",
|
| 722 |
+
"model.qwen2vl_vision_tower.blocks.4.mlp.fc1.bias": "model-00001-of-00004.safetensors",
|
| 723 |
+
"model.qwen2vl_vision_tower.blocks.4.mlp.fc1.weight": "model-00002-of-00004.safetensors",
|
| 724 |
+
"model.qwen2vl_vision_tower.blocks.4.mlp.fc2.bias": "model-00002-of-00004.safetensors",
|
| 725 |
+
"model.qwen2vl_vision_tower.blocks.4.mlp.fc2.weight": "model-00002-of-00004.safetensors",
|
| 726 |
+
"model.qwen2vl_vision_tower.blocks.4.norm1.bias": "model-00003-of-00004.safetensors",
|
| 727 |
+
"model.qwen2vl_vision_tower.blocks.4.norm1.weight": "model-00003-of-00004.safetensors",
|
| 728 |
+
"model.qwen2vl_vision_tower.blocks.4.norm2.bias": "model-00004-of-00004.safetensors",
|
| 729 |
+
"model.qwen2vl_vision_tower.blocks.4.norm2.weight": "model-00002-of-00004.safetensors",
|
| 730 |
+
"model.qwen2vl_vision_tower.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors",
|
| 731 |
+
"model.qwen2vl_vision_tower.blocks.5.attn.proj.weight": "model-00004-of-00004.safetensors",
|
| 732 |
+
"model.qwen2vl_vision_tower.blocks.5.attn.qkv.bias": "model-00002-of-00004.safetensors",
|
| 733 |
+
"model.qwen2vl_vision_tower.blocks.5.attn.qkv.weight": "model-00003-of-00004.safetensors",
|
| 734 |
+
"model.qwen2vl_vision_tower.blocks.5.mlp.fc1.bias": "model-00002-of-00004.safetensors",
|
| 735 |
+
"model.qwen2vl_vision_tower.blocks.5.mlp.fc1.weight": "model-00001-of-00004.safetensors",
|
| 736 |
+
"model.qwen2vl_vision_tower.blocks.5.mlp.fc2.bias": "model-00001-of-00004.safetensors",
|
| 737 |
+
"model.qwen2vl_vision_tower.blocks.5.mlp.fc2.weight": "model-00004-of-00004.safetensors",
|
| 738 |
+
"model.qwen2vl_vision_tower.blocks.5.norm1.bias": "model-00001-of-00004.safetensors",
|
| 739 |
+
"model.qwen2vl_vision_tower.blocks.5.norm1.weight": "model-00001-of-00004.safetensors",
|
| 740 |
+
"model.qwen2vl_vision_tower.blocks.5.norm2.bias": "model-00004-of-00004.safetensors",
|
| 741 |
+
"model.qwen2vl_vision_tower.blocks.5.norm2.weight": "model-00001-of-00004.safetensors",
|
| 742 |
+
"model.qwen2vl_vision_tower.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors",
|
| 743 |
+
"model.qwen2vl_vision_tower.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors",
|
| 744 |
+
"model.qwen2vl_vision_tower.blocks.6.attn.qkv.bias": "model-00002-of-00004.safetensors",
|
| 745 |
+
"model.qwen2vl_vision_tower.blocks.6.attn.qkv.weight": "model-00004-of-00004.safetensors",
|
| 746 |
+
"model.qwen2vl_vision_tower.blocks.6.mlp.fc1.bias": "model-00002-of-00004.safetensors",
|
| 747 |
+
"model.qwen2vl_vision_tower.blocks.6.mlp.fc1.weight": "model-00003-of-00004.safetensors",
|
| 748 |
+
"model.qwen2vl_vision_tower.blocks.6.mlp.fc2.bias": "model-00004-of-00004.safetensors",
|
| 749 |
+
"model.qwen2vl_vision_tower.blocks.6.mlp.fc2.weight": "model-00002-of-00004.safetensors",
|
| 750 |
+
"model.qwen2vl_vision_tower.blocks.6.norm1.bias": "model-00004-of-00004.safetensors",
|
| 751 |
+
"model.qwen2vl_vision_tower.blocks.6.norm1.weight": "model-00002-of-00004.safetensors",
|
| 752 |
+
"model.qwen2vl_vision_tower.blocks.6.norm2.bias": "model-00003-of-00004.safetensors",
|
| 753 |
+
"model.qwen2vl_vision_tower.blocks.6.norm2.weight": "model-00002-of-00004.safetensors",
|
| 754 |
+
"model.qwen2vl_vision_tower.blocks.7.attn.proj.bias": "model-00004-of-00004.safetensors",
|
| 755 |
+
"model.qwen2vl_vision_tower.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors",
|
| 756 |
+
"model.qwen2vl_vision_tower.blocks.7.attn.qkv.bias": "model-00004-of-00004.safetensors",
|
| 757 |
+
"model.qwen2vl_vision_tower.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
|
| 758 |
+
"model.qwen2vl_vision_tower.blocks.7.mlp.fc1.bias": "model-00003-of-00004.safetensors",
|
| 759 |
+
"model.qwen2vl_vision_tower.blocks.7.mlp.fc1.weight": "model-00003-of-00004.safetensors",
|
| 760 |
+
"model.qwen2vl_vision_tower.blocks.7.mlp.fc2.bias": "model-00002-of-00004.safetensors",
|
| 761 |
+
"model.qwen2vl_vision_tower.blocks.7.mlp.fc2.weight": "model-00002-of-00004.safetensors",
|
| 762 |
+
"model.qwen2vl_vision_tower.blocks.7.norm1.bias": "model-00003-of-00004.safetensors",
|
| 763 |
+
"model.qwen2vl_vision_tower.blocks.7.norm1.weight": "model-00002-of-00004.safetensors",
|
| 764 |
+
"model.qwen2vl_vision_tower.blocks.7.norm2.bias": "model-00003-of-00004.safetensors",
|
| 765 |
+
"model.qwen2vl_vision_tower.blocks.7.norm2.weight": "model-00002-of-00004.safetensors",
|
| 766 |
+
"model.qwen2vl_vision_tower.blocks.8.attn.proj.bias": "model-00003-of-00004.safetensors",
|
| 767 |
+
"model.qwen2vl_vision_tower.blocks.8.attn.proj.weight": "model-00003-of-00004.safetensors",
|
| 768 |
+
"model.qwen2vl_vision_tower.blocks.8.attn.qkv.bias": "model-00004-of-00004.safetensors",
|
| 769 |
+
"model.qwen2vl_vision_tower.blocks.8.attn.qkv.weight": "model-00004-of-00004.safetensors",
|
| 770 |
+
"model.qwen2vl_vision_tower.blocks.8.mlp.fc1.bias": "model-00003-of-00004.safetensors",
|
| 771 |
+
"model.qwen2vl_vision_tower.blocks.8.mlp.fc1.weight": "model-00003-of-00004.safetensors",
|
| 772 |
+
"model.qwen2vl_vision_tower.blocks.8.mlp.fc2.bias": "model-00002-of-00004.safetensors",
|
| 773 |
+
"model.qwen2vl_vision_tower.blocks.8.mlp.fc2.weight": "model-00001-of-00004.safetensors",
|
| 774 |
+
"model.qwen2vl_vision_tower.blocks.8.norm1.bias": "model-00004-of-00004.safetensors",
|
| 775 |
+
"model.qwen2vl_vision_tower.blocks.8.norm1.weight": "model-00003-of-00004.safetensors",
|
| 776 |
+
"model.qwen2vl_vision_tower.blocks.8.norm2.bias": "model-00004-of-00004.safetensors",
|
| 777 |
+
"model.qwen2vl_vision_tower.blocks.8.norm2.weight": "model-00003-of-00004.safetensors",
|
| 778 |
+
"model.qwen2vl_vision_tower.blocks.9.attn.proj.bias": "model-00002-of-00004.safetensors",
|
| 779 |
+
"model.qwen2vl_vision_tower.blocks.9.attn.proj.weight": "model-00002-of-00004.safetensors",
|
| 780 |
+
"model.qwen2vl_vision_tower.blocks.9.attn.qkv.bias": "model-00002-of-00004.safetensors",
|
| 781 |
+
"model.qwen2vl_vision_tower.blocks.9.attn.qkv.weight": "model-00002-of-00004.safetensors",
|
| 782 |
+
"model.qwen2vl_vision_tower.blocks.9.mlp.fc1.bias": "model-00001-of-00004.safetensors",
|
| 783 |
+
"model.qwen2vl_vision_tower.blocks.9.mlp.fc1.weight": "model-00001-of-00004.safetensors",
|
| 784 |
+
"model.qwen2vl_vision_tower.blocks.9.mlp.fc2.bias": "model-00002-of-00004.safetensors",
|
| 785 |
+
"model.qwen2vl_vision_tower.blocks.9.mlp.fc2.weight": "model-00001-of-00004.safetensors",
|
| 786 |
+
"model.qwen2vl_vision_tower.blocks.9.norm1.bias": "model-00002-of-00004.safetensors",
|
| 787 |
+
"model.qwen2vl_vision_tower.blocks.9.norm1.weight": "model-00001-of-00004.safetensors",
|
| 788 |
+
"model.qwen2vl_vision_tower.blocks.9.norm2.bias": "model-00002-of-00004.safetensors",
|
| 789 |
+
"model.qwen2vl_vision_tower.blocks.9.norm2.weight": "model-00003-of-00004.safetensors",
|
| 790 |
+
"model.qwen2vl_vision_tower.merger.ln_q.bias": "model-00004-of-00004.safetensors",
|
| 791 |
+
"model.qwen2vl_vision_tower.merger.ln_q.weight": "model-00003-of-00004.safetensors",
|
| 792 |
+
"model.qwen2vl_vision_tower.merger.mlp.0.bias": "model-00001-of-00004.safetensors",
|
| 793 |
+
"model.qwen2vl_vision_tower.merger.mlp.0.weight": "model-00003-of-00004.safetensors",
|
| 794 |
+
"model.qwen2vl_vision_tower.merger.mlp.2.bias": "model-00004-of-00004.safetensors",
|
| 795 |
+
"model.qwen2vl_vision_tower.merger.mlp.2.weight": "model-00001-of-00004.safetensors",
|
| 796 |
+
"model.qwen2vl_vision_tower.patch_embed.proj.weight": "model-00003-of-00004.safetensors"
|
| 797 |
+
}
|
| 798 |
+
}
|
modeling_projector.py
CHANGED
|
@@ -1,3 +1,308 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from .utils import IMAGE_INDICATOR_IDS
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def build_vision_projector(config, delay_load=False, **kwargs):
    """Instantiate the vision-to-language projector selected by ``config``.

    The projector kind is read from ``config.mm_projector_type``. Recognised
    values are ``'conv_adapter'``, ``'mlp_pixel_shuffle'``,
    ``'ovis_conv_adapter'``, ``'ovis2_adapter'`` and
    ``'ovis_conv_adapter_navit'``.

    Args:
        config: Object carrying the projector settings as attributes. Which
            attributes are read depends on the selected projector kind;
            optional ones fall back to the defaults spelled out below.
        delay_load: Accepted for call-site compatibility; not used here.
        **kwargs: Accepted for call-site compatibility; not used here.

    Returns:
        The constructed projector module.

    Raises:
        ValueError: If the projector kind is not one of the recognised
            values. Note that the fallback kind ``'linear'`` (used when the
            attribute is absent) has no branch here, so it raises as well.
    """
    kind = getattr(config, 'mm_projector_type', 'linear')

    if kind == 'conv_adapter':
        return ConvAdapter(
            config.mm_hidden_size,
            config.hidden_size,
            getattr(config, "mlp_hidden_dim", None),
        )

    if kind == 'mlp_pixel_shuffle':
        return MlpPixelShuffle(
            config.mm_hidden_size,
            config.hidden_size,
            config.pixelshuffle_downsample_ratio,
            getattr(config, "mlp_hidden_dim", None),
        )

    if kind == 'ovis_conv_adapter':
        return OvisConvAdapter(
            config.mm_hidden_size,
            config.hidden_size,
            getattr(config, "mlp_hidden_dim", 32000),
            getattr(config, "tokenize_function", "softmax"),
        )

    if kind == 'ovis2_adapter':
        return Ovis2Adapter(
            config.mm_hidden_size,
            config.hidden_size,
            # NOTE(review): 66536 may be a typo for 65536 -- confirm before changing,
            # since existing checkpoints may depend on this default.
            getattr(config, "mlp_hidden_dim", 66536),
            getattr(config, "hidden_stride", 2),
            getattr(config, "pooling_stride", 1),
            getattr(config, "tokenize_function", "softmax"),
        )

    if kind == 'ovis_conv_adapter_navit':
        # The input width is fixed at 1280 here rather than read from config.
        return OvisConvAdapterNavit(
            1280,
            config.hidden_size,
            getattr(config, "mlp_hidden_dim", 32000),
            getattr(config, "tokenize_function", "softmax"),
        )

    raise ValueError(f'Unknown projector type: {kind}')
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class ConvAdapter(nn.Module):
    """Two-layer MLP projector followed by a stride-2 3x3 convolution.

    The MLP maps ``dim_in`` features to ``dim_out``; the convolution then
    downsamples the square grid of patch tokens by a factor of two per side.

    Args:
        dim_in: Feature width of the incoming image tokens.
        dim_out: Feature width of the produced tokens.
        mlp_hidden_dim: Width of the MLP's hidden layer. When ``None`` the
            hidden layer is ``dim_out`` wide.
    """

    def __init__(self, dim_in, dim_out, mlp_hidden_dim=None):
        super().__init__()
        self.mm_projector_type = 'conv_adapter'

        # A missing hidden width means the MLP is dim_in -> dim_out -> dim_out.
        hidden_dim = dim_out if mlp_hidden_dim is None else mlp_hidden_dim
        self.mlp = nn.Sequential(
            nn.Linear(dim_in, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, dim_out),
        )
        self.conv = nn.Conv2d(dim_out, dim_out, kernel_size=(3, 3), stride=(2, 2), padding=1)

    def forward(self, x):
        """Project image features and reduce the number of tokens.

        Args:
            x (torch.Tensor): image features of shape (F, v, D). The first
                token along ``v`` is dropped (treated as the cls token) and
                the remaining ``v - 1`` tokens are laid out as a square grid.

        Returns:
            torch.Tensor of shape (F, n, D) where n is the reduced token count
            after the strided convolution.
        """
        projected = self.mlp(x)

        num_frames, num_tokens, channels = projected.shape
        side = int(math.sqrt(num_tokens - 1))

        # Drop the leading cls token, then go channels-first for the conv.
        patches = projected[:, 1:, :]
        grid = patches.reshape(num_frames, side, side, channels).permute(0, 3, 1, 2)

        pooled = self.conv(grid)

        # Back to channels-last and flatten the spatial grid into a token axis.
        return pooled.permute(0, 2, 3, 1).reshape(num_frames, -1, channels)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class MlpPixelShuffle(nn.Module):
    """Folds neighbouring tokens into the channel axis, then applies a GELU MLP."""

    def __init__(self, dim_in, dim_out, pixelshuffle_downsample_ratio, mlp_hidden_dim=None):
        """Build the projector.

        Args:
            dim_in: width of the incoming image features.
            dim_out: width of the produced features.
            pixelshuffle_downsample_ratio: factor passed to ``pixel_shuffle``;
                the MLP input width is ``dim_in * ratio ** 2``.
            mlp_hidden_dim: hidden width of the MLP; ``dim_out`` is used when None.
        """
        super().__init__()
        self.mm_projector_type = 'mlp_pixel_shuffle'
        merged_width = int(dim_in * (pixelshuffle_downsample_ratio ** 2))
        hidden_width = dim_out if mlp_hidden_dim is None else mlp_hidden_dim
        self.mlp = nn.Sequential(
            nn.Linear(merged_width, hidden_width),
            nn.GELU(),
            nn.Linear(hidden_width, dim_out),
        )
        self.scale_factor = pixelshuffle_downsample_ratio

    def pixel_shuffle(self, x, scale_factor=2):
        """Trade spatial resolution for channel width.

        Args:
            x: tensor of shape (N, W, H, C).
            scale_factor: each spatial axis is divided by it and the channel
                axis multiplied by its square.

        Returns:
            Contiguous tensor of shape (N, W / scale, H / scale, C * scale ** 2).
        """
        n, w, h, c = x.size()
        h_out = int(h / scale_factor)
        w_out = int(w / scale_factor)
        # N, W, H, C --> N, W, H / scale, C * scale
        x = x.view(n, w, h_out, int(c * scale_factor))
        # N, W, H / scale, C * scale --> N, H / scale, W, C * scale
        x = x.permute(0, 2, 1, 3).contiguous()
        # N, H / scale, W, C * scale --> N, H / scale, W / scale, C * (scale ** 2)
        x = x.view(n, h_out, w_out, int(c * (scale_factor * scale_factor)))
        return x.permute(0, 2, 1, 3).contiguous()

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): image features
                shape (F, v, D)
        Returns:
            shape (F, n, D) where n is token_num that has been reduced
        """
        patches = x[:, 1:, :]  # remove cls_token
        h = w = int(patches.shape[1] ** 0.5)
        grid = patches.view(patches.shape[0], h, w, -1)
        merged = self.mlp(self.pixel_shuffle(grid, self.scale_factor))
        return merged.view(merged.shape[0], -1, merged.shape[-1])
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
class OvisConvAdapter(nn.Module):
    """Stride-2 convolution, then a soft "visual tokenizer" over a learned embedding table.

    Each reduced patch is mapped to a distribution over ``vocab_size`` visual
    tokens and the output is the correspondingly weighted mix of embedding rows.
    """

    def __init__(self, dim_in, dim_out, vocab_size, tokenize_function="softmax", tau=1.0):
        """Build the adapter.

        Args:
            dim_in: width of the incoming image features.
            dim_out: width of the embedding rows (and of the output).
            vocab_size: number of visual tokens.
            tokenize_function: ``'softmax'``, ``'gumbel_argmax'`` or ``'st_argmax'``.
            tau: temperature used by the ``'gumbel_argmax'`` mode.
        """
        super().__init__()
        self.mm_projector_type = 'ovis_conv_adapter'
        self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), padding=1)
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(dim_in, vocab_size, bias=False),
            torch.nn.LayerNorm(vocab_size)
        )
        self.embedding = torch.nn.Embedding(vocab_size, dim_out)
        self.tokenize_function = tokenize_function
        # Kept on the module: this class has no ``config`` attribute to read it from.
        self.tau = tau

    def tokenize(self, logits):
        """Turn logits into (soft or one-hot) token weights along the last axis.

        Raises:
            ValueError: if ``self.tokenize_function`` is not a supported mode.
        """
        def st_argmax(y_soft, dim):  # straight-through softmax
            index = y_soft.max(dim, keepdim=True)[1]
            y_hard = torch.zeros_like(y_soft, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
            # One-hot in the forward pass, gradient of ``y_soft`` in the backward pass.
            return y_hard - y_soft.detach() + y_soft

        if self.tokenize_function == 'softmax':
            tokens = torch.nn.functional.softmax(logits, dim=-1)
        elif self.tokenize_function == 'gumbel_argmax':
            tokens = torch.nn.functional.gumbel_softmax(logits, tau=self.tau, hard=True)
        elif self.tokenize_function == 'st_argmax':
            tokens = st_argmax(logits, dim=-1)
        else:
            raise ValueError(
                'Invalid `max_type`, expected softmax or gumbel_argmax or st_argmax,'
                f' but got {self.tokenize_function}'
            )
        return tokens

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): image features
                shape (F, v, D)
        Returns:
            shape (F, n, D) where n is token_num that has been reduced
        """
        # conv
        f, v, d = x.shape
        s = int(math.sqrt(v - 1))
        x = x[:, 1:, :]  # remove cls_token
        x = x.reshape(f, s, s, d).permute([0, 3, 1, 2])
        x = self.conv(x)
        x = x.permute([0, 2, 3, 1]).reshape(f, -1, d)

        # tokenize
        logits = self.mlp(x)
        visual_tokens = self.tokenize(logits)

        # get embeddings: weighted sum of embedding rows
        out = torch.matmul(visual_tokens, self.embedding.weight)

        return out
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
class Ovis2Adapter(nn.Module):
    """Ovis2-style visual tokenizer: merge neighbouring patches, predict token weights, mix embeddings.

    The head predicts ``vocab_size - len(IMAGE_INDICATOR_IDS)`` weights; the
    remaining embedding rows receive zero weight in ``forward``.
    """

    def __init__(self, dim_in, dim_out, vocab_size, hidden_stride=2, pooling_stride=1,
                 tokenize_function="softmax", tau=1.0):
        """Build the adapter.

        Args:
            dim_in: width of the incoming image features.
            dim_out: width of the embedding rows (and of the output).
            vocab_size: number of rows of the embedding table.
            hidden_stride: side of the square patch neighbourhood merged in ``encode``.
            pooling_stride: side of the square window averaged at the end of ``forward``.
            tokenize_function: ``'softmax'``, ``'gumbel_argmax'`` or ``'st_argmax'``.
            tau: temperature used by the ``'gumbel_argmax'`` mode.
        """
        super().__init__()
        head_dim = vocab_size - len(IMAGE_INDICATOR_IDS)
        self.mm_projector_type = 'ovis2_adapter'
        self.hidden_stride = hidden_stride
        self.tokenize_function = tokenize_function
        # Kept on the module: this class has no ``config`` attribute to read it from.
        self.tau = tau
        self.head = torch.nn.Sequential(
            torch.nn.Linear(
                dim_in * self.hidden_stride * self.hidden_stride, head_dim,
                bias=False
            ),
            torch.nn.LayerNorm(head_dim)
        )
        self.embedding = torch.nn.Embedding(vocab_size, dim_out)
        self.pool_s = pooling_stride
        print("pooling_stride: ", pooling_stride)

    def encode(self, features):
        """Drop the first token and merge ``hidden_stride x hidden_stride`` neighbours.

        Args:
            features: tensor of shape (n, 1 + l, d) where ``l`` is a perfect square.

        Returns:
            Tensor of shape (n, l', hidden_stride ** 2 * d) when
            ``hidden_stride > 1``; otherwise the features without their first token.
        """
        # merge number of `hidden_stride * hidden_stride` hidden states together to reduce token sequence length
        # e.g., for hidden_stride=2, this leads to a token length reduction: 1024 -> 256 for aimv2
        features = features[:, 1:, :]
        if self.hidden_stride > 1:
            n, l, d = features.shape
            sqrt_l = int(l ** 0.5)
            assert sqrt_l ** 2 == l, "The token sequence length should be a perfect square."
            features = features.reshape(n, sqrt_l, sqrt_l, d)
            # Zero-pad the bottom/right so the grid side is a multiple of the stride.
            pl = (self.hidden_stride - (sqrt_l % self.hidden_stride)) % self.hidden_stride
            features = torch.nn.functional.pad(features, (0, 0, 0, pl, 0, pl), "constant", 0)
            sqrt_l += pl
            features = features.reshape(n, sqrt_l // self.hidden_stride, self.hidden_stride,
                                        sqrt_l // self.hidden_stride, self.hidden_stride, d)
            features = features.permute(0, 1, 3, 2, 4, 5)  # [n, sqrt_l/hs, sqrt_l/hs, hs, hs, d]
            features = features.flatten(3)  # [n, sqrt_l/hs, sqrt_l/hs, hs*hs*d]
            features = features.reshape(
                n, -1, self.hidden_stride * self.hidden_stride * d)

        return features

    def tokenize(self, logits):
        """Turn logits into (soft or one-hot) token weights along the last axis.

        Raises:
            ValueError: if ``self.tokenize_function`` is not a supported mode.
        """
        def st_argmax(y_soft, dim):  # straight-through softmax
            index = y_soft.max(dim, keepdim=True)[1]
            y_hard = torch.zeros_like(y_soft, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
            # One-hot in the forward pass, gradient of ``y_soft`` in the backward pass.
            return y_hard - y_soft.detach() + y_soft

        if self.tokenize_function == 'softmax':
            tokens = torch.nn.functional.softmax(logits, dim=-1)
        elif self.tokenize_function == 'gumbel_argmax':
            tokens = torch.nn.functional.gumbel_softmax(logits, tau=self.tau, hard=True)
        elif self.tokenize_function == 'st_argmax':
            tokens = st_argmax(logits, dim=-1)
        else:
            raise ValueError(
                f'Invalid `max_type`, expected softmax or gumbel_argmax or st_argmax, but got {self.tokenize_function}')
        return tokens

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): image features
                shape (F, v, D)
        Returns:
            shape (F, n, D) where n is token_num that has been reduced
        """
        x = self.encode(x)
        # tokenize
        logits = self.head(x)
        visual_tokens = self.tokenize(logits)
        batch_size, token_len, _ = visual_tokens.shape
        # Zero weights for the trailing embedding rows not predicted by the head.
        padding_tensor = torch.zeros(size=(batch_size, token_len, len(IMAGE_INDICATOR_IDS)),
                                     dtype=visual_tokens.dtype,
                                     device=visual_tokens.device,
                                     layout=visual_tokens.layout,
                                     requires_grad=False)
        visual_tokens = torch.cat([visual_tokens, padding_tensor], dim=2)
        # get embeddings: weighted sum of embedding rows
        out = torch.matmul(visual_tokens, self.embedding.weight)

        if self.pool_s > 1:
            f, v, d = out.shape
            s = int(math.sqrt(v))
            cells = s // self.pool_s
            out = out.reshape(f, s, s, d)
            out = out.reshape(f, cells, self.pool_s, cells, self.pool_s, d)
            # Average every pool_s x pool_s window of tokens.
            out = out.permute([0, 1, 3, 5, 2, 4]).reshape(f, cells * cells, d, -1).mean(-1)
        return out
|
| 261 |
+
|
| 262 |
+
class OvisConvAdapterNavit(nn.Module):
    """Variant of the Ovis conv adapter for flat (unbatched) NaViT token sequences.

    Every run of four consecutive tokens is treated as a 2x2 patch and merged
    by a 2x2 stride-2 convolution before visual tokenization.
    """

    def __init__(self, dim_in, dim_out, vocab_size, tokenize_function="softmax", tau=1.0):
        """Build the adapter.

        Args:
            dim_in: width of the incoming image features.
            dim_out: width of the embedding rows (and of the output).
            vocab_size: number of visual tokens.
            tokenize_function: ``'softmax'``, ``'gumbel_argmax'`` or ``'st_argmax'``.
            tau: temperature used by the ``'gumbel_argmax'`` mode.
        """
        super().__init__()
        self.mm_projector_type = 'ovis_conv_adapter_navit'
        self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=(2, 2), stride=(2, 2))
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(dim_in, vocab_size, bias=False),
            torch.nn.LayerNorm(vocab_size)
        )
        self.embedding = torch.nn.Embedding(vocab_size, dim_out)
        self.tokenize_function = tokenize_function
        # Kept on the module: this class has no ``config`` attribute to read it from.
        self.tau = tau

    def tokenize(self, logits):
        """Turn logits into (soft or one-hot) token weights along the last axis.

        Raises:
            ValueError: if ``self.tokenize_function`` is not a supported mode.
        """
        def st_argmax(y_soft, dim):  # straight-through softmax
            index = y_soft.max(dim, keepdim=True)[1]
            y_hard = torch.zeros_like(y_soft, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
            # One-hot in the forward pass, gradient of ``y_soft`` in the backward pass.
            return y_hard - y_soft.detach() + y_soft

        if self.tokenize_function == 'softmax':
            tokens = torch.nn.functional.softmax(logits, dim=-1)
        elif self.tokenize_function == 'gumbel_argmax':
            tokens = torch.nn.functional.gumbel_softmax(logits, tau=self.tau, hard=True)
        elif self.tokenize_function == 'st_argmax':
            tokens = st_argmax(logits, dim=-1)
        else:
            raise ValueError(
                f'Invalid `max_type`, expected softmax or gumbel_argmax or st_argmax, but got {self.tokenize_function}')
        return tokens

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): image features of navit
                shape (v, D)
        Returns:
            shape (n, D) where n is token_num that has been reduced
        """
        # conv: group tokens four at a time into 2x2 patches, channels-first
        _, d = x.shape
        x = x.reshape(-1, 2, 2, d).permute([0, 3, 1, 2])
        x = self.conv(x)
        x = x.permute([0, 2, 3, 1]).reshape(-1, d)
        # tokenize
        logits = self.mlp(x)
        visual_tokens = self.tokenize(logits)
        # get embeddings: weighted sum of embedding rows
        out = torch.matmul(visual_tokens, self.embedding.weight)

        return out
|
modeling_valley.py
CHANGED
|
@@ -589,6 +589,7 @@ class ValleyQwen3ForCausalLM(Qwen3ForCausalLM, ValleyMetaForCausalLM):
|
|
| 589 |
shift_labels = shift_labels.to(shift_logits.device)
|
| 590 |
loss = torch.stack([loss_fct(shift_logits[i], shift_labels[i]) for i in range(bs)])
|
| 591 |
|
|
|
|
| 592 |
if not return_dict:
|
| 593 |
output = (logits,) + outputs[1:]
|
| 594 |
return (loss,) + output if loss is not None else output
|
|
|
|
| 589 |
shift_labels = shift_labels.to(shift_logits.device)
|
| 590 |
loss = torch.stack([loss_fct(shift_logits[i], shift_labels[i]) for i in range(bs)])
|
| 591 |
|
| 592 |
+
|
| 593 |
if not return_dict:
|
| 594 |
output = (logits,) + outputs[1:]
|
| 595 |
return (loss,) + output if loss is not None else output
|
modeling_vision_tower.py
CHANGED
|
@@ -1,3 +1,323 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VisionTransformerPretrainedModel
|
| 4 |
+
from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VisionTransformerPretrainedModel
|
| 5 |
+
from transformers import PretrainedConfig
|
| 6 |
+
|
| 7 |
+
# Architecture settings used by SigLipVisionTower to build its
# SiglipVisionModel via ``_from_config``.
siglip_config = PretrainedConfig.from_dict(
    {
        "attention_dropout": 0.0,
        "hidden_act": "gelu_pytorch_tanh",
        "hidden_size": 1152,
        "image_size": 384,
        "intermediate_size": 4304,
        "layer_norm_eps": 1e-06,
        "model_type": "siglip_vision_model",
        "num_attention_heads": 16,
        "num_channels": 3,
        "num_hidden_layers": 27,
        "patch_size": 14,
    }
)

# Settings for the Qwen2-VL vision transformer built in build_vision_tower.
# NOTE: build_vision_tower may overwrite the two ``_attn_implementation*``
# entries on this shared module-level object.
qwen2vl_vit_config = PretrainedConfig.from_dict(
    {
        "depth": 32,
        "embed_dim": 1280,
        "hidden_act": "quick_gelu",
        "hidden_size": 3584,
        "in_channels": 3,
        "in_chans": 3,
        "mlp_ratio": 4,
        "model_type": "qwen2_vl",
        "num_heads": 16,
        "patch_size": 14,
        "spatial_merge_size": 2,
        "spatial_patch_size": 14,
        "temporal_patch_size": 2,
        "_attn_implementation": "flash_attention_2",
        "_attn_implementation_internal": "flash_attention_2"
    }
)

# Settings for the Qwen2.5-VL vision transformer built in build_vision_tower.
# NOTE: build_vision_tower may set ``_attn_implementation*`` attributes on
# this shared module-level object.
qwen2_5vl_vit_config = PretrainedConfig.from_dict(
    {
        "depth": 32,
        "hidden_act": "silu",
        "hidden_size": 1280,
        "intermediate_size": 3420,
        "num_heads": 16,
        "in_chans": 3,
        "out_hidden_size": 3584,
        "patch_size": 14,
        "spatial_merge_size": 2,
        "spatial_patch_size": 14,
        "window_size": 112,
        "fullatt_block_indexes": [
            7,
            15,
            23,
            31
        ],
        "tokens_per_second": 2,
        "temporal_patch_size": 2
    }
)

# Settings used by AIMv2VisionTower for its delay-load (dummy) model.
aimv2_config = PretrainedConfig.from_dict(
    {
        "hidden_size": 1024,
        "image_size": 448,
        "intermediate_size": 2816,
        "model_type": "aimv2",
        "num_attention_heads": 8,
        "num_channels": 3,
        "num_hidden_layers": 24,
        "patch_size": 14,
        "projection_dropout": 0.0,
        "qkv_bias": False,
        "rms_norm_eps": 1e-05,
        "torch_dtype": "float32",
        "transformers_version": "4.46.3",
        "auto_map": {
            "AutoConfig": "configuration_aimv2.AIMv2Config",
            "AutoModel": "modeling_aimv2.AIMv2Model",
        },
    }
)
|
| 88 |
+
|
| 89 |
+
def wrapped_qwen2vl_vision_tower(vision_tower_cfg, qwen2vl_vision_tower):
    """Adjust the ``merger`` of a Qwen2-VL style vision tower to match the config.

    * ``only_navit`` and ``navit_use_mm_projector`` both truthy: the merger is
      replaced by ``torch.nn.Identity``.
    * otherwise the merger is rebuilt as a ``CustomPatchMerger`` when its
      output width differs from ``vision_tower_cfg.hidden_size`` or when
      ``navit_merger_hidden_dim`` is set and differs from its hidden width.

    Args:
        vision_tower_cfg: configuration object (see attributes read below).
        qwen2vl_vision_tower: tower whose ``merger`` attribute is inspected and
            possibly replaced in place.

    Returns:
        The same ``qwen2vl_vision_tower`` object.
    """
    bypass_merger = getattr(vision_tower_cfg, "only_navit", False) and \
        getattr(vision_tower_cfg, "navit_use_mm_projector", False)
    if bypass_merger:
        qwen2vl_vision_tower.merger = torch.nn.Identity()
        print("navit_use_mm_projector is NOT None, so we need to initialize a new merger...")
        return qwen2vl_vision_tower

    # Last linear layer of the existing merger: in_features is the merger's
    # hidden width, out_features its output width.
    last_linear = qwen2vl_vision_tower.merger.mlp[-1]
    requested_hidden = getattr(vision_tower_cfg, "navit_merger_hidden_dim", None)

    output_mismatch = last_linear.out_features != vision_tower_cfg.hidden_size
    hidden_mismatch = requested_hidden is not None and requested_hidden != last_linear.in_features
    if not (output_mismatch or hidden_mismatch):
        return qwen2vl_vision_tower

    merger_hidden = last_linear.in_features if requested_hidden is None else requested_hidden
    del qwen2vl_vision_tower.merger
    qwen2vl_vision_tower.merger = CustomPatchMerger(
        dim=vision_tower_cfg.hidden_size,  # output width of the merger
        context_dim=1280,  # per-token width fed to the merger; 2x2 merge gives 1280 * 4 inputs
        hidden_dim=merger_hidden,
    )
    print("output_dim of original merger is not match or navit_merger_hidden_dim is not match, we need to initialize a new merger...")

    return qwen2vl_vision_tower
|
| 112 |
+
|
| 113 |
+
def build_vision_tower(vision_tower_cfg, **kwargs):
    """Build the vision tower(s) named by ``vision_tower_cfg``.

    The tower name is read from ``mm_vision_tower`` (falling back to
    ``vision_tower``) and matched by substring.

    Args:
        vision_tower_cfg: configuration object; attributes are read with
            ``getattr`` so most of them are optional.
        **kwargs: forwarded to the SigLIP / AIMv2 tower constructors.

    Returns:
        * a ``SigLipVisionTower`` when a SigLIP-family name is given and
          ``eagle_vision_tower`` is not set;
        * a ``(siglip_tower_or_None, qwen_vision_tower)`` tuple when
          ``eagle_vision_tower`` is set (the first item is None when
          ``only_navit`` is truthy);
        * an ``AIMv2VisionTower`` for the AIMv2 / Ovis2 names.

    Raises:
        ValueError: if no tower name is configured, or the tower name or
            ``eagle_vision_tower`` is not recognised.
    """
    vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None))
    if vision_tower is None:
        # Without this guard the substring checks below fail with an opaque TypeError.
        raise ValueError(f"Unknown vision tower: {vision_tower}")

    is_navit = "navit" in vision_tower.lower()
    if "siglip-so400m-patch14-384" in vision_tower or "Oryx-ViT" in vision_tower or is_navit:
        eagle_vision_tower = getattr(vision_tower_cfg, "eagle_vision_tower", None)
        only_navit = getattr(vision_tower_cfg, "only_navit", False)

        # A navit tower requires only_navit to be set and an eagle tower to be named.
        if is_navit:
            assert only_navit and eagle_vision_tower is not None

        # only return siglip vision tower if eagle vision tower is None
        if eagle_vision_tower is None:
            return SigLipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)

        if "Qwen2.5-VL" in eagle_vision_tower:
            vit_config, vit_cls = qwen2_5vl_vit_config, Qwen2_5_VisionTransformerPretrainedModel
        elif "Qwen2-VL" in eagle_vision_tower:
            vit_config, vit_cls = qwen2vl_vit_config, Qwen2VisionTransformerPretrainedModel
        else:
            raise ValueError(f"Unknown vision tower: {eagle_vision_tower}")

        vit_attn = getattr(vision_tower_cfg, "_vit_attn_implementation", None)
        if vit_attn is not None:
            # NOTE: this mutates the shared module-level config object.
            vit_config._attn_implementation = vit_attn
            vit_config._attn_implementation_internal = vit_attn

        qwen2vl_vision_tower = vit_cls._from_config(vit_config)
        qwen2vl_vision_tower = wrapped_qwen2vl_vision_tower(vision_tower_cfg, qwen2vl_vision_tower)
        qwen2vl_vision_tower.requires_grad_(False)
        if only_navit:
            return None, qwen2vl_vision_tower
        siglip_vision_tower = SigLipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
        return siglip_vision_tower, qwen2vl_vision_tower

    aimv2_markers = (
        "aimv2-huge-patch14-448",
        "Ovis2-8B-visual",
        "aimv2-large-patch14-448",
        "Ovis2-2B-visual",
    )
    if any(marker in vision_tower for marker in aimv2_markers):
        return AIMv2VisionTower(vision_tower, args=vision_tower_cfg, **kwargs)

    raise ValueError(f"Unknown vision tower: {vision_tower}")
|
| 152 |
+
|
| 153 |
+
class SigLipVisionTower(nn.Module):
    """SigLIP vision encoder wrapper built from the module-level ``siglip_config``."""

    def __init__(self, vision_tower, args, delay_load=False, cache_dir="./cache_dir"):
        """Store the selection settings and build the underlying model.

        Args:
            vision_tower: tower name, kept as ``image_tower_name``.
            args: object providing ``mm_vision_select_layer`` and optionally
                ``mm_vision_select_feature`` (default ``"patch"``).
            delay_load: when True, build a dummy model and leave ``is_loaded`` False.
            cache_dir: stored as ``cache_dir``.
        """
        super().__init__()
        self.is_loaded = False
        self.image_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, "mm_vision_select_feature", "patch")
        self.cache_dir = cache_dir

        if delay_load:
            from transformers import SiglipVisionModel
            self.cfg_only = siglip_config
            self.vision_tower = SiglipVisionModel._from_config(siglip_config)  # dummy-load
        else:
            self.load_model()

    def load_model(self):
        """Build the model from ``siglip_config``, freeze it and mark it as loaded."""
        from transformers import SiglipVisionModel
        tower = SiglipVisionModel._from_config(siglip_config)
        self.vision_tower = tower
        self.vision_tower.requires_grad_(False)
        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Prepend a copy of the first token to the sequence (``cls_patch`` layout)."""
        assert self.select_feature == "cls_patch"
        first_token = image_forward_outs[:, :1, :]
        return torch.cat([first_token, image_forward_outs], dim=1)

    def forward(self, images):
        """Encode a batch tensor, or a list of single images one by one.

        Returns:
            A tensor for tensor input, or a list of per-image tensors for list
            input; features are cast back to the dtype of the input.
        """
        def encode(pixels, out_dtype):
            outputs = self.vision_tower(
                pixels,
                output_hidden_states=True,
                return_dict=True,
            )
            return self.feature_select(outputs.last_hidden_state).to(out_dtype)

        if type(images) is not list:
            return encode(images.to(device=self.device, dtype=self.dtype), images.dtype)

        return [
            encode(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), image.dtype)
            for image in images
        ]

    @property
    def dummy_feature(self):
        """A single all-zero feature row on the tower's device and dtype."""
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        """dtype of the underlying vision model."""
        return self.vision_tower.dtype

    @property
    def device(self):
        """Device of the underlying vision model."""
        return self.vision_tower.device

    @property
    def config(self):
        """The model's config once loaded, otherwise the delay-load config."""
        return self.vision_tower.config if self.is_loaded else self.cfg_only

    @property
    def hidden_size(self):
        """Feature width taken from the active config."""
        return self.config.hidden_size

    @property
    def num_patches(self):
        """Number of patches in a square image: (image_size // patch_size) ** 2."""
        cfg = self.config
        return (cfg.image_size // cfg.patch_size) ** 2
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
class CustomPatchMerger(nn.Module):
    """LayerNorm per token, then an MLP over groups of ``spatial_merge_size ** 2`` tokens."""

    def __init__(self, dim: int, context_dim: int, hidden_dim: int, spatial_merge_size: int = 2) -> None:
        """Build the merger.

        Args:
            dim: output width.
            context_dim: width of each incoming token.
            hidden_dim: hidden width of the MLP.
            spatial_merge_size: side of the square group of tokens merged together.
        """
        super().__init__()
        tokens_per_group = spatial_merge_size**2
        self.input_dim = context_dim * tokens_per_group
        self.ln_q = nn.LayerNorm(context_dim, eps=1e-6)
        self.mlp = nn.Sequential(
            nn.Linear(self.input_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, dim),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalise tokens, concatenate each group into one row and project it."""
        normed = self.ln_q(x)
        grouped = normed.view(-1, self.input_dim)
        return self.mlp(grouped)
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
class AIMv2VisionTower(nn.Module):
    """AIMv2 vision encoder wrapper loaded through the transformers Auto classes."""

    def __init__(self, vision_tower, args, delay_load=False, cache_dir='./cache_dir'):
        """Store the selection settings and build or load the underlying model.

        Args:
            vision_tower: model name or path, kept as ``image_tower_name``.
            args: object providing ``mm_vision_select_layer`` and optionally
                ``mm_vision_select_feature`` (default ``'patch'``).
            delay_load: when True, build a dummy model from ``aimv2_config``
                and leave ``is_loaded`` False.
            cache_dir: stored as ``cache_dir``.
        """
        super().__init__()

        self.is_loaded = False

        self.image_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        self.cache_dir = cache_dir
        if delay_load:
            from transformers import AutoConfig, AutoModel
            self.cfg_only = aimv2_config
            self.vision_tower = AutoModel._from_config(aimv2_config)  # dummy-load
        else:
            self.load_model()

    def load_model(self):
        """Load processor and model from ``image_tower_name``, freeze the model, mark as loaded."""
        from transformers import AutoConfig, AutoModel, AutoProcessor
        name = self.image_tower_name
        # NOTE: trust_remote_code=True runs code shipped with the checkpoint.
        self.image_processor = AutoProcessor.from_pretrained(name, trust_remote_code=True)
        self.vision_tower = AutoModel.from_pretrained(name, trust_remote_code=True)
        self.vision_tower.requires_grad_(False)
        # crop_size is taken from the processor's "shortest_edge" size entry.
        self.image_processor.crop_size = self.image_processor.size["shortest_edge"]

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Prepend a copy of the first token to the sequence (``cls_patch`` layout)."""
        assert self.select_feature == 'cls_patch'
        first_token = image_forward_outs[:, :1, :]
        return torch.cat([first_token, image_forward_outs], dim=1)

    def forward(self, images):
        """Encode a batch tensor, or a list of single images one by one.

        Returns:
            A tensor for tensor input, or a list of per-image tensors for list
            input; features are cast back to the dtype of the input.
        """
        def encode(pixels, out_dtype):
            outputs = self.vision_tower(pixels, output_hidden_states=True, return_dict=True,)
            return self.feature_select(outputs.last_hidden_state).to(out_dtype)

        if type(images) is not list:
            return encode(images.to(device=self.device, dtype=self.dtype), images.dtype)

        return [
            encode(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), image.dtype)
            for image in images
        ]

    @property
    def dtype(self):
        """dtype of the underlying vision model."""
        return self.vision_tower.dtype

    @property
    def device(self):
        """Device of the underlying vision model."""
        return self.vision_tower.device

    @property
    def config(self):
        """The model's config once loaded, otherwise the delay-load config."""
        return self.vision_tower.config if self.is_loaded else self.cfg_only

    @property
    def hidden_size(self):
        """Feature width taken from the active config."""
        return self.config.hidden_size

    @property
    def num_patches(self):
        """Number of patches in a square image: (image_size // patch_size) ** 2."""
        cfg = self.config
        return (cfg.image_size // cfg.patch_size) ** 2
|
preprocessor_config.json
CHANGED
|
@@ -1,3 +1,6 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"processor_class": "ValleyProcessor",
|
| 3 |
+
"auto_map": {
|
| 4 |
+
"AutoProcessor": "processing_valley.ValleyProcessor"
|
| 5 |
+
}
|
| 6 |
+
}
|
processing_valley.py
CHANGED
|
@@ -1,3 +1,618 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import types
|
| 3 |
+
import io
|
| 4 |
+
import torch
|
| 5 |
+
import os
|
| 6 |
+
from PIL import Image
|
| 7 |
+
import argparse
|
| 8 |
+
from qwen_vl_utils import fetch_image
|
| 9 |
+
|
| 10 |
+
from transformers import (
|
| 11 |
+
ProcessorMixin,
|
| 12 |
+
SiglipImageProcessor,
|
| 13 |
+
BatchFeature,
|
| 14 |
+
Qwen2VLImageProcessor,
|
| 15 |
+
PreTrainedTokenizer,
|
| 16 |
+
AutoImageProcessor,
|
| 17 |
+
CLIPImageProcessor,
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
from .utils import (
|
| 21 |
+
process_anyres_image,
|
| 22 |
+
preprocess_image_ovis,
|
| 23 |
+
ovis_template_process,
|
| 24 |
+
BLACK_IMG_ENV,
|
| 25 |
+
DEFAULT_IM_END_TOKEN,
|
| 26 |
+
DEFAULT_IM_START_TOKEN,
|
| 27 |
+
DEFAULT_IMAGE_TOKEN,
|
| 28 |
+
DEFAULT_VI_END_TOKEN,
|
| 29 |
+
DEFAULT_VI_START_TOKEN,
|
| 30 |
+
DEFAULT_VIDEO_TOKEN,
|
| 31 |
+
IMAGE_TOKEN_INDEX,
|
| 32 |
+
SEQ_MAX_LEN,
|
| 33 |
+
IGNORE_INDEX,
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
# Settings passed to ``SiglipImageProcessor.from_dict`` when the processor is
# built: resize to 384x384, rescale, then normalise every channel with
# mean/std 0.5.
siglip_processor_config = {
    "do_normalize": True,
    "do_rescale": True,
    "do_resize": True,
    "image_mean": [0.5, 0.5, 0.5],
    "image_processor_type": "SiglipImageProcessor",
    "image_std": [0.5, 0.5, 0.5],
    "processor_class": "SiglipProcessor",
    "resample": 3,
    "rescale_factor": 0.00392156862745098,
    "size": {"height": 384, "width": 384},
}
|
| 59 |
+
|
| 60 |
+
# Settings passed to ``Qwen2VLImageProcessor.from_dict`` when the processor
# is built. ``min_pixels`` / ``max_pixels`` here are only the starting values;
# ``ValleyProcessor.__call__`` may overwrite them on the image processor.
qwen2vl_processor_config = {
    "min_pixels": 3136,
    "max_pixels": 12845056,
    "patch_size": 14,
    "temporal_patch_size": 2,
    "merge_size": 2,
    "image_mean": [0.48145466, 0.4578275, 0.40821073],
    "image_std": [0.26862954, 0.26130258, 0.27577711],
    "image_processor_type": "Qwen2VLImageProcessor",
    "processor_class": "Qwen2VLProcessor",
}
|
| 79 |
+
|
| 80 |
+
# Settings passed to ``CLIPImageProcessor.from_dict`` for the AIMv2 path.
# ``size["shortest_edge"]`` is also read by ``ValleyProcessor.__init__`` and
# used as the crop size given to ``preprocess_image_ovis``.
aimv2_processor_config = {
    "crop_size": {"height": 448, "width": 448},
    "do_center_crop": True,
    "do_convert_rgb": True,
    "do_normalize": True,
    "do_rescale": True,
    "do_resize": True,
    "image_mean": [0.48145466, 0.4578275, 0.40821073],
    "image_processor_type": "CLIPImageProcessor",
    "image_std": [0.26862954, 0.26130258, 0.27577711],
    "resample": 3,
    "rescale_factor": 0.00392156862745098,
    "size": {"shortest_edge": 448},
}
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
class ValleyProcessor(ProcessorMixin):
    """Build Valley model inputs from chat messages and their images.

    ``process_mode`` selects the pipeline run by ``__call__``:

    * ``"qwen2"`` / ``"qwen3"``: the prompt is rendered with the tokenizer's
      chat template; images go through the Qwen2-VL image processor and,
      unless ``only_navit`` is set, through the SigLIP image processor too.
    * ``"ovis2"``: the prompt is assembled by hand and images go through the
      AIMv2 (CLIP-style) image processor.
    """

    attributes = ["tokenizer"]
    optional_attributes = [
        "max_pixels",
        "min_pixels",
        "anyres",
        "only_crop_single_image",
        "grid_pinpoints",
        "use_special_start_end_token",
        "only_navit",
        "chat_template",
        "process_mode",
    ]
    tokenizer_class = "AutoTokenizer"

    def __init__(self, tokenizer=None, chat_template=None, **kwargs):
        """Create the processor and its three fixed image processors.

        Args:
            tokenizer: Tokenizer forwarded to ``ProcessorMixin``.
            chat_template: Chat template forwarded to ``ProcessorMixin``.
            **kwargs: Forwarded to ``ProcessorMixin``. The keys ``anyres``,
                ``grid_pinpoints``, ``only_crop_single_image``,
                ``use_special_start_end_token``, ``only_navit`` and
                ``process_mode`` are additionally read here, with the
                defaults shown below.
        """
        super().__init__(tokenizer=tokenizer, chat_template=chat_template, **kwargs)
        # Placeholder used by ``__call__`` when a request carries no image.
        self.black_img = BLACK_IMG_ENV
        self.siglip_image_processor = SiglipImageProcessor.from_dict(siglip_processor_config)
        self.qwen2vl_image_processor = Qwen2VLImageProcessor.from_dict(qwen2vl_processor_config)
        self.aimv2_image_processor = CLIPImageProcessor.from_dict(aimv2_processor_config)
        self.anyres = kwargs.get("anyres", True)
        self.grid_pinpoints = kwargs.get("grid_pinpoints", "(1x1),...,(3x3)")
        self.only_crop_single_image = kwargs.get("only_crop_single_image", True)
        self.use_special_start_end_token = kwargs.get("use_special_start_end_token", True)
        self.only_navit = kwargs.get("only_navit", False)
        self.process_mode = kwargs.get("process_mode", "qwen3")

        self.aimv2_crop_size = self.aimv2_image_processor.size["shortest_edge"]

    @staticmethod
    def _load_rgb_image(image):
        """Return ``image`` (file path, PIL image or encoded bytes) as RGB.

        Raises:
            ValueError: If ``image`` is none of the supported types.
        """
        if isinstance(image, str):
            # ``convert`` loads the pixel data, so the file can be closed here.
            with Image.open(image) as handle:
                return handle.convert("RGB")
        if isinstance(image, Image.Image):
            return image.convert("RGB")
        if isinstance(image, bytes):
            return Image.open(io.BytesIO(image)).convert("RGB")
        raise ValueError("unsupported type")

    def preprocess_images_siglip(self, images):
        """Run every image through the SigLIP image processor.

        Args:
            images: Non-empty sequence of file paths, PIL images or encoded
                image bytes. The types may be mixed.

        Returns:
            With ``anyres`` disabled, one tensor stacking the per-image pixel
            values. With ``anyres`` enabled, a list holding one stacked
            tensor per image.

        Raises:
            ValueError: If ``images`` is empty or holds an unsupported type.
        """
        if not images:
            raise ValueError("images must be a non-empty sequence")
        images_pil = [self._load_rgb_image(image) for image in images]

        # With ``only_crop_single_image`` the any-resolution split is applied
        # only when the request holds a single image.
        split_into_tiles = not self.only_crop_single_image or len(images_pil) <= 1
        processed_images = []
        for img in images_pil:
            if self.anyres and split_into_tiles:
                image = process_anyres_image(img, self.siglip_image_processor, self.grid_pinpoints)
            else:
                image = self.siglip_image_processor(img, return_tensors="pt")["pixel_values"][0]
                if self.anyres:
                    # Keep the "list of tiles" shape expected by the stack below.
                    image = [image]
            processed_images.append(image)

        if not self.anyres:
            return torch.stack(processed_images, dim=0)
        return [torch.stack(tiles, dim=0) for tiles in processed_images]

    def preprocess_images_qwen2vl(self, images) -> dict:
        """Run every image through the Qwen2-VL image processor.

        Args:
            images: Non-empty sequence of file paths, PIL images or encoded
                image bytes. The types may be mixed.

        Returns:
            The image processor output with an added ``"image_sizes"`` entry:
            a one-element list holding the PIL ``size`` of every input image.

        Raises:
            ValueError: If ``images`` is empty or holds an unsupported type.
        """
        if not images:
            raise ValueError("images must be a non-empty sequence")
        images_pil = [self._load_rgb_image(image) for image in images]

        image_sizes = [[img.size for img in images_pil]]
        data_dict_qwen2vl = self.qwen2vl_image_processor(
            [fetch_image({"image": img}) for img in images_pil],
            return_tensors="pt",
        )
        data_dict_qwen2vl["image_sizes"] = image_sizes
        return data_dict_qwen2vl

    def preprocess_multimodal(self, conversations):
        """Wrap each image token in the image start/end tokens, in place.

        System turns are skipped. Nothing changes when
        ``use_special_start_end_token`` is off. An already wrapped token is
        left as is, so running this twice on the same turns is harmless.

        Args:
            conversations: List of ``{"role", "content"}`` dicts.

        Returns:
            The same ``conversations`` list.
        """
        wrapped_token = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
        for sentence in conversations:
            if sentence["role"] == "system":
                continue
            if not self.use_special_start_end_token:
                continue
            # Undo a previous wrapping first so the markers are never doubled.
            content = sentence["content"].replace(wrapped_token, DEFAULT_IMAGE_TOKEN)
            sentence["content"] = content.replace(DEFAULT_IMAGE_TOKEN, wrapped_token)

        return conversations

    def preprocess_images_aimv2(self, images):
        """Run every image through ``preprocess_image_ovis`` (AIMv2 path).

        Args:
            images: Sequence of file paths, PIL images or encoded image bytes.

        Returns:
            A pair ``(processed, image_sizes)``. ``processed`` holds one
            ``(pixels, placeholders)`` tuple per image; with ``anyres``
            enabled ``pixels`` is the concatenation of the values returned by
            ``preprocess_image_ovis``, otherwise it is returned untouched.
            ``image_sizes`` is a one-element list holding the PIL ``size`` of
            every image.

        Raises:
            ValueError: If an image has an unsupported type.
        """
        have_multi_images = len(images) > 1
        # Only the any-resolution, "crop allowed" case may split an image
        # into up to nine partitions; every other case uses a single one.
        if self.anyres and (not self.only_crop_single_image or not have_multi_images):
            max_partition = 9
        else:
            max_partition = 1

        processed_images = []
        image_sizes_list = []
        for image_file in images:
            img = self._load_rgb_image(image_file)
            image_sizes_list.append(img.size)
            pixels, ovis_image_placeholders = preprocess_image_ovis(
                img,
                image_processor=self.aimv2_image_processor,
                crop_size=self.aimv2_crop_size,
                max_partition=max_partition,
            )
            if self.anyres:
                pixels = torch.cat(pixels, dim=0)
            processed_images.append((pixels, ovis_image_placeholders))

        return processed_images, [image_sizes_list]

    def _preprocess_qwen_chat(
        self,
        conversations,
        tokenizer,
        has_image,
        inference,
        only_mask_system,
        **template_kwargs,
    ) -> dict:
        """Render, tokenize and label a chat; shared by the qwen2/qwen3 paths.

        The chat is rendered with ``tokenizer.apply_chat_template``, cut into
        rounds at ``"<|im_end|>\\n"`` and tokenized round by round. In each
        round the tokens up to and including the role header are labelled
        ``-100`` and the remaining tokens keep their ids. A round without
        that role header is labelled ``-100`` entirely.

        Args:
            conversations: ``{"role", "content"}`` dicts starting with a
                system turn. A ``None`` system content is replaced in place
                by the default system prompt.
            tokenizer: Tokenizer providing ``apply_chat_template``.
            has_image: Tokenize through ``tokenizer_image_token`` so image
                tokens become the image token index.
            inference: Append the generation prompt and leave the last round
                unterminated.
            only_mask_system: With ``has_image``, cut the mask at the user
                header instead of the assistant header.
            **template_kwargs: Extra arguments for ``apply_chat_template``.

        Returns:
            ``{"input_ids": LongTensor, "labels": LongTensor}`` of equal
            length.
        """
        default_system = "You are a helpful assistant."
        roles = ("user", "assistant")
        turn_end = "<|im_end|>\n"

        # Check system prompt
        assert conversations[0]["role"] == "system"
        if conversations[0]["content"] is None:
            conversations[0]["content"] = default_system  # use default system prompt

        # Check conversation sequence: strict user/assistant alternation
        for index, sentence in enumerate(conversations[1:]):
            assert sentence["role"] == roles[index % 2], "The conversation sequence is incorrect."

        conversation_str = tokenizer.apply_chat_template(
            conversations,
            tokenize=False,
            add_generation_prompt=inference,
            **template_kwargs,
        )

        rounds = conversation_str.split(turn_end)
        last_index = len(rounds) - 1
        # The leading empty tensors keep the result int64 even with no rounds.
        input_chunks = [torch.tensor([], dtype=torch.int64)]
        label_chunks = [torch.tensor([], dtype=torch.int64)]
        for index, text in enumerate(rounds):
            if text == "":
                continue
            # ``split`` removed the terminator; restore it except on the open
            # generation prompt at the end of an inference prompt.
            if (not inference) or (index < last_index):
                text += turn_end

            if has_image:
                cut_role = roles[0] if only_mask_system else roles[1]
                masked_prefix = re.sub(rf"{cut_role}\n[\s\S]*", f"{cut_role}:", text)
                round_ids = self.tokenizer_image_token(text, tokenizer, return_tensors="pt")
                mask_len = len(self.tokenizer_image_token(masked_prefix, tokenizer))
            else:
                masked_prefix = re.sub(rf"{roles[1]}\n[\s\S]*", f"{roles[1]}:", text)
                round_ids = tokenizer(text, return_tensors="pt")["input_ids"][0, :]
                mask_len = len(tokenizer(masked_prefix)["input_ids"][:])

            # Never mask more tokens than the round holds, otherwise labels
            # and input_ids would drift out of alignment.
            mask_len = min(mask_len, round_ids.shape[0])
            input_chunks.append(round_ids)
            label_chunks.append(torch.full((mask_len,), -100, dtype=torch.int64))
            label_chunks.append(round_ids[mask_len:])

        return {
            "input_ids": torch.cat(input_chunks, dim=0),
            "labels": torch.cat(label_chunks, dim=0),
        }

    def preprocess_qwen2(
        self,
        conversations,
        tokenizer: PreTrainedTokenizer,
        has_image: bool = False,
        inference: bool = False,
        only_mask_system: bool = False,
    ) -> dict:
        """Tokenize and label a chat rendered with the tokenizer's template.

        Args:
            conversations: ``{"role", "content"}`` dicts starting with a
                system turn, followed by alternating user/assistant turns.
            tokenizer: Tokenizer providing ``apply_chat_template``.
            has_image: Map image tokens to the image token index.
            inference: Append the generation prompt.
            only_mask_system: With ``has_image``, cut the mask at the user
                header instead of the assistant header.

        Returns:
            ``{"input_ids": LongTensor, "labels": LongTensor}``.

        Raises:
            AssertionError: If the first turn is not a system turn or the
                remaining turns do not alternate user/assistant.
        """
        return self._preprocess_qwen_chat(
            conversations,
            tokenizer,
            has_image=has_image,
            inference=inference,
            only_mask_system=only_mask_system,
        )

    def preprocess_qwen3(
        self,
        conversations,
        tokenizer: PreTrainedTokenizer,
        has_image: bool = False,
        inference: bool = False,
        only_mask_system: bool = False,
        enable_thinking: bool = False,
    ) -> dict:
        """Same as ``preprocess_qwen2`` with the ``enable_thinking`` switch.

        Args:
            conversations: ``{"role", "content"}`` dicts starting with a
                system turn, followed by alternating user/assistant turns.
            tokenizer: Tokenizer providing ``apply_chat_template``.
            has_image: Map image tokens to the image token index.
            inference: Append the generation prompt.
            only_mask_system: With ``has_image``, cut the mask at the user
                header instead of the assistant header.
            enable_thinking: Forwarded to ``apply_chat_template``.

        Returns:
            ``{"input_ids": LongTensor, "labels": LongTensor}``.

        Raises:
            AssertionError: If the first turn is not a system turn or the
                remaining turns do not alternate user/assistant.
        """
        return self._preprocess_qwen_chat(
            conversations,
            tokenizer,
            has_image=has_image,
            inference=inference,
            only_mask_system=only_mask_system,
            enable_thinking=enable_thinking,
        )

    def preprocess_ovis2(
        self,
        source,  # do not include system prompt
        tokenizer: PreTrainedTokenizer,
        has_image: bool = False,
        inference: bool = False,
        only_mask_system: bool = False,
        video_len: int = 0,
    ):
        """Build the hand-written chat prompt used by the ovis2 mode.

        A fixed system prompt is always emitted and system turns in
        ``source`` are ignored. A trailing assistant turn is dropped before
        the prompt is built. Every ``<video>`` is expanded to ``video_len``
        ``<image>`` tokens, and each ``<image>`` becomes one
        ``IMAGE_TOKEN_INDEX`` id. Labels are ``IGNORE_INDEX`` up to and
        including the last assistant header.

        Args:
            source: Non-empty list of turns, either ``{"from", "value"}``
                with ``human``/``gpt`` or ``{"role", ...}`` with
                ``user``/``assistant``. The text is read from ``"value"``,
                or from ``"content"`` when ``"value"`` is absent.
            tokenizer: Tokenizer providing ``encode``.
            has_image: Unused; kept for interface compatibility.
            inference: Append an open assistant header to the prompt.
            only_mask_system: Unused; kept for interface compatibility.
            video_len: Number of ``<image>`` tokens replacing ``<video>``.

        Returns:
            ``{"input_ids": Tensor, "labels": Tensor}``.

        Raises:
            AssertionError: If the prompt holds no assistant header.
        """
        if "from" in source[0]:
            role_key, user_tag, assistant_tag = "from", "human", "gpt"
        else:
            role_key, user_tag, assistant_tag = "role", "user", "assistant"

        def text_of(message):
            return message["value"] if "value" in message else message["content"]

        if source[-1][role_key] == assistant_tag:
            source = source[:-1]

        prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        for message in source:
            speaker = message[role_key]
            if speaker == user_tag:
                user = text_of(message)
                if "<video>" in user:
                    user = user.replace("<video>", "\n".join(["<image>"] * video_len) + "\n")
                # Appended exactly once, also when both <image> and <video> occur.
                prompt += "<|im_start|>user\n" + user + "<|im_end|>\n"
            elif speaker == assistant_tag:
                prompt += "<|im_start|>assistant\n" + text_of(message) + "<|im_end|>\n"

        if inference:
            prompt += "<|im_start|>assistant\n"
        else:
            prompt = prompt[:-1]  # remove the final '\n', keep <|im_end|> as the end

        pieces = [tokenizer.encode(piece) for piece in prompt.split("<image>")]
        input_ids = []
        for piece in pieces[:-1]:
            input_ids += piece
            input_ids += [IMAGE_TOKEN_INDEX]
        input_ids += pieces[-1]

        # mask last assistant: find where the last assistant header ends
        head_id = tokenizer.encode("<|im_start|>assistant\n")
        head_len = len(head_id)
        last_id = None
        for start in range(len(input_ids) - head_len + 1):
            if input_ids[start:start + head_len] == head_id:
                last_id = start + head_len

        assert last_id is not None, "no assistant header found in the prompt"
        labels = len(input_ids) * [IGNORE_INDEX]
        labels[last_id:] = input_ids[last_id:]
        return {"input_ids": torch.tensor(input_ids), "labels": torch.tensor(labels)}

    def tokenizer_image_token(
        self,
        prompt,
        tokenizer,
        image_token_index=IMAGE_TOKEN_INDEX,
        return_tensors=None,
    ):
        """Tokenize ``prompt``, mapping each image token to one id.

        The prompt is cut at every ``DEFAULT_IMAGE_TOKEN``. The text pieces
        are tokenized separately and ``image_token_index`` is inserted
        between them. When the tokenizer defines a BOS token, one BOS id
        leads the result and the BOS the tokenizer adds to each piece is
        dropped.

        Args:
            prompt: Text to tokenize.
            tokenizer: Callable tokenizer whose result exposes ``input_ids``.
            image_token_index: Id emitted for each image token.
            return_tensors: ``None`` for a list of ids or ``"pt"`` for a
                ``torch.long`` tensor.

        Returns:
            The token ids as a list or a 1-D tensor.

        Raises:
            ValueError: If ``prompt`` is longer than ``SEQ_MAX_LEN``
                characters or ``return_tensors`` is not supported.
        """
        if len(prompt) > SEQ_MAX_LEN:
            raise ValueError("sequence is too long !!!")

        if getattr(tokenizer, "bos_token", None):
            input_ids, offset = [tokenizer.bos_token_id], 1
        else:
            input_ids, offset = [], 0

        for position, chunk in enumerate(prompt.split(DEFAULT_IMAGE_TOKEN)):
            if position > 0:
                # One image id per image token that separated the pieces.
                input_ids.append(image_token_index)
            chunk_ids = tokenizer(chunk).input_ids
            # An empty piece (adjacent, leading or trailing image tokens) may
            # tokenize to nothing; it must not be indexed.
            if chunk_ids and chunk_ids[0] != getattr(tokenizer, "bos_token_id", None):
                offset = 0
            input_ids.extend(chunk_ids[offset:])

        if return_tensors is not None:
            if return_tensors == "pt":
                return torch.tensor(input_ids, dtype=torch.long)
            raise ValueError(f"Unsupported tensor type: {return_tensors}")
        return input_ids

    def _resolve_images(self, messages):
        """Return the request's images as a list, using the black placeholder
        when ``messages`` has no usable ``"images"`` entry."""
        if "images" not in messages or not messages["images"] or not messages["images"][0]:
            return [self.black_img]
        if isinstance(messages["images"], str):
            return [messages["images"]]
        return messages["images"]

    def _call_ovis2(self, messages, inference, kwargs) -> BatchFeature:
        """Build the batch for ``process_mode == "ovis2"``."""
        video_len = kwargs.get("video_len", 0)
        images = self._resolve_images(messages)

        # Work on copies so the caller's turns are not modified.
        conversations = [dict(turn) for turn in messages["conversations"]]

        # adapt for user-assistant format, transform to human-gpt format
        if "role" in conversations[0]:
            tag_of = {"system": "system", "user": "human", "assistant": "gpt"}
            conversations = [
                {"from": tag_of[turn["role"]], "value": turn["content"]}
                for turn in conversations
                if turn["role"] in tag_of
            ]

        # add <image> token
        first_conv = conversations[1] if conversations[0]["from"] == "system" else conversations[0]
        if images and "<image>" not in first_conv["value"]:
            image_token = "\n".join(["<image>"] * len(images))
            first_conv["value"] = f"{image_token}\n{first_conv['value']}"

        data_dict = self.preprocess_ovis2(
            conversations,
            self.tokenizer,
            has_image=True,
            only_mask_system=False,
            inference=inference,
            video_len=video_len,
        )
        data_dict["images"], data_dict["image_sizes"] = self.preprocess_images_aimv2(images)
        data_dict = ovis_template_process(data_dict)
        # be batch
        data_dict["images"] = [data_dict["images"]]
        data_dict["input_ids"] = data_dict["input_ids"].unsqueeze(0)
        return BatchFeature(data={**data_dict})

    def _call_qwen(self, messages, inference, kwargs) -> BatchFeature:
        """Build the batch for ``process_mode`` ``"qwen2"`` or ``"qwen3"``."""
        max_pixels = kwargs.get("max_pixels", self.max_pixels)
        min_pixels = kwargs.get("min_pixels", self.min_pixels)
        if max_pixels is not None:
            self.qwen2vl_image_processor.max_pixels = max_pixels
        if min_pixels is not None:
            self.qwen2vl_image_processor.min_pixels = min_pixels

        # Deal with images
        images = self._resolve_images(messages)

        # Deal with conversations. Work on copies so the caller's turns are
        # not modified; otherwise a second call on the same messages would
        # see the image tokens inserted by the first one.
        conversations = [dict(turn) for turn in messages["conversations"]]
        if conversations[0]["role"] != "system":
            conversations.insert(0, {"role": "system", "content": None})  # dummy system prompt

        # Insert special token `<image>`
        assert conversations[1]["role"] == "user"
        if images and "<image>" not in conversations[1]["content"]:
            image_token = " ".join(["<image>"] * len(images))
            conversations[1]["content"] = f"{image_token}\n{conversations[1]['content']}"

        # The last message should be user if inference=True
        if inference:
            assert conversations[-1]["role"] == "user", "the last message should be user if inference=True"

        # Image preprocess
        if self.only_navit:
            processed_images_siglip = None
        else:
            processed_images_siglip = self.preprocess_images_siglip(images)
        processed_data_dict_qwen2vl = self.preprocess_images_qwen2vl(images)

        source = self.preprocess_multimodal(conversations)
        if self.process_mode == "qwen2":
            data_dict = self.preprocess_qwen2(
                source, self.tokenizer, has_image=True, only_mask_system=False, inference=inference
            )
        else:
            # Thinking is enabled by default when the caller does not say otherwise.
            enable_thinking = kwargs.get("enable_thinking", True)
            data_dict = self.preprocess_qwen3(
                source,
                self.tokenizer,
                has_image=True,
                only_mask_system=False,
                inference=inference,
                enable_thinking=enable_thinking,
            )

        # Construct batch data
        data_dict["input_ids"] = data_dict["input_ids"].unsqueeze(0)  # batch_size = 1
        data_dict["labels"] = data_dict["labels"].unsqueeze(0)
        data_dict["images"] = [processed_images_siglip]

        return BatchFeature(data={**data_dict, **processed_data_dict_qwen2vl})

    def __call__(self, messages, inference=True, **kwargs) -> BatchFeature:
        """Turn one request into a batch of size one.

        Args:
            messages: Mapping with a ``"conversations"`` list and an optional
                ``"images"`` entry (one path, or a list of paths, PIL images
                or bytes). A black placeholder image is used when no image is
                given. The caller's conversation dicts are not modified.
            inference: Build a generation prompt. In the qwen modes the last
                turn must then be a user turn.
            **kwargs: ``video_len`` (ovis2 mode); ``max_pixels``,
                ``min_pixels`` and ``enable_thinking`` (qwen modes). Non-None
                pixel limits are stored on the Qwen2-VL image processor and
                therefore persist across calls.

        Returns:
            A ``BatchFeature`` holding the token ids and processed images.

        Raises:
            ValueError: If ``process_mode`` is not supported.
        """
        process_mode = self.process_mode
        if process_mode == "ovis2":
            return self._call_ovis2(messages, inference, kwargs)
        if process_mode in ("qwen2", "qwen3"):
            return self._call_qwen(messages, inference, kwargs)
        raise ValueError(f"Unsupported process mode: {process_mode}")

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of that method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of that method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)
|
| 618 |
+
|
special_tokens_map.json
CHANGED
|
@@ -1,3 +1,37 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<|im_start|>",
|
| 4 |
+
"<|im_end|>",
|
| 5 |
+
"<|object_ref_start|>",
|
| 6 |
+
"<|object_ref_end|>",
|
| 7 |
+
"<|box_start|>",
|
| 8 |
+
"<|box_end|>",
|
| 9 |
+
"<|quad_start|>",
|
| 10 |
+
"<|quad_end|>",
|
| 11 |
+
"<|vision_start|>",
|
| 12 |
+
"<|vision_end|>",
|
| 13 |
+
"<|vision_pad|>",
|
| 14 |
+
"<|image_pad|>",
|
| 15 |
+
"<|video_pad|>",
|
| 16 |
+
"<im_start>",
|
| 17 |
+
"<im_end>",
|
| 18 |
+
"<vi_start>",
|
| 19 |
+
"<vi_end>",
|
| 20 |
+
"<cor>",
|
| 21 |
+
"<\\cor>"
|
| 22 |
+
],
|
| 23 |
+
"eos_token": {
|
| 24 |
+
"content": "<|im_end|>",
|
| 25 |
+
"lstrip": false,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
},
|
| 30 |
+
"pad_token": {
|
| 31 |
+
"content": "<|endoftext|>",
|
| 32 |
+
"lstrip": false,
|
| 33 |
+
"normalized": false,
|
| 34 |
+
"rstrip": false,
|
| 35 |
+
"single_word": false
|
| 36 |
+
}
|
| 37 |
+
}
|
tokenizer_config.json
CHANGED
|
@@ -1,3 +1,298 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
},
|
| 181 |
+
"151665": {
|
| 182 |
+
"content": "<tool_response>",
|
| 183 |
+
"lstrip": false,
|
| 184 |
+
"normalized": false,
|
| 185 |
+
"rstrip": false,
|
| 186 |
+
"single_word": false,
|
| 187 |
+
"special": false
|
| 188 |
+
},
|
| 189 |
+
"151666": {
|
| 190 |
+
"content": "</tool_response>",
|
| 191 |
+
"lstrip": false,
|
| 192 |
+
"normalized": false,
|
| 193 |
+
"rstrip": false,
|
| 194 |
+
"single_word": false,
|
| 195 |
+
"special": false
|
| 196 |
+
},
|
| 197 |
+
"151667": {
|
| 198 |
+
"content": "<think>",
|
| 199 |
+
"lstrip": false,
|
| 200 |
+
"normalized": false,
|
| 201 |
+
"rstrip": false,
|
| 202 |
+
"single_word": false,
|
| 203 |
+
"special": false
|
| 204 |
+
},
|
| 205 |
+
"151668": {
|
| 206 |
+
"content": "</think>",
|
| 207 |
+
"lstrip": false,
|
| 208 |
+
"normalized": false,
|
| 209 |
+
"rstrip": false,
|
| 210 |
+
"single_word": false,
|
| 211 |
+
"special": false
|
| 212 |
+
},
|
| 213 |
+
"151669": {
|
| 214 |
+
"content": "<im_start>",
|
| 215 |
+
"lstrip": false,
|
| 216 |
+
"normalized": false,
|
| 217 |
+
"rstrip": false,
|
| 218 |
+
"single_word": false,
|
| 219 |
+
"special": true
|
| 220 |
+
},
|
| 221 |
+
"151670": {
|
| 222 |
+
"content": "<im_end>",
|
| 223 |
+
"lstrip": false,
|
| 224 |
+
"normalized": false,
|
| 225 |
+
"rstrip": false,
|
| 226 |
+
"single_word": false,
|
| 227 |
+
"special": true
|
| 228 |
+
},
|
| 229 |
+
"151671": {
|
| 230 |
+
"content": "<vi_start>",
|
| 231 |
+
"lstrip": false,
|
| 232 |
+
"normalized": false,
|
| 233 |
+
"rstrip": false,
|
| 234 |
+
"single_word": false,
|
| 235 |
+
"special": true
|
| 236 |
+
},
|
| 237 |
+
"151672": {
|
| 238 |
+
"content": "<vi_end>",
|
| 239 |
+
"lstrip": false,
|
| 240 |
+
"normalized": false,
|
| 241 |
+
"rstrip": false,
|
| 242 |
+
"single_word": false,
|
| 243 |
+
"special": true
|
| 244 |
+
},
|
| 245 |
+
"151673": {
|
| 246 |
+
"content": "<cor>",
|
| 247 |
+
"lstrip": false,
|
| 248 |
+
"normalized": false,
|
| 249 |
+
"rstrip": false,
|
| 250 |
+
"single_word": false,
|
| 251 |
+
"special": true
|
| 252 |
+
},
|
| 253 |
+
"151674": {
|
| 254 |
+
"content": "<\\cor>",
|
| 255 |
+
"lstrip": false,
|
| 256 |
+
"normalized": false,
|
| 257 |
+
"rstrip": false,
|
| 258 |
+
"single_word": false,
|
| 259 |
+
"special": true
|
| 260 |
+
}
|
| 261 |
+
},
|
| 262 |
+
"additional_special_tokens": [
|
| 263 |
+
"<|im_start|>",
|
| 264 |
+
"<|im_end|>",
|
| 265 |
+
"<|object_ref_start|>",
|
| 266 |
+
"<|object_ref_end|>",
|
| 267 |
+
"<|box_start|>",
|
| 268 |
+
"<|box_end|>",
|
| 269 |
+
"<|quad_start|>",
|
| 270 |
+
"<|quad_end|>",
|
| 271 |
+
"<|vision_start|>",
|
| 272 |
+
"<|vision_end|>",
|
| 273 |
+
"<|vision_pad|>",
|
| 274 |
+
"<|image_pad|>",
|
| 275 |
+
"<|video_pad|>",
|
| 276 |
+
"<im_start>",
|
| 277 |
+
"<im_end>",
|
| 278 |
+
"<vi_start>",
|
| 279 |
+
"<vi_end>",
|
| 280 |
+
"<cor>",
|
| 281 |
+
"<\\cor>"
|
| 282 |
+
],
|
| 283 |
+
"auto_map": {
|
| 284 |
+
"AutoProcessor": "processing_valley.ValleyProcessor"
|
| 285 |
+
},
|
| 286 |
+
"bos_token": null,
|
| 287 |
+
"clean_up_tokenization_spaces": false,
|
| 288 |
+
"eos_token": "<|im_end|>",
|
| 289 |
+
"errors": "replace",
|
| 290 |
+
"extra_special_tokens": {},
|
| 291 |
+
"model_max_length": 4096,
|
| 292 |
+
"pad_token": "<|endoftext|>",
|
| 293 |
+
"padding_side": "right",
|
| 294 |
+
"processor_class": "ValleyProcessor",
|
| 295 |
+
"split_special_tokens": false,
|
| 296 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 297 |
+
"unk_token": null
|
| 298 |
+
}
|
utils.py
CHANGED
|
@@ -1,3 +1,409 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from PIL import Image
|
| 2 |
+
from io import BytesIO
|
| 3 |
+
import base64
|
| 4 |
+
import math
|
| 5 |
+
import ast
|
| 6 |
+
import re
|
| 7 |
+
import torch
|
| 8 |
+
from transformers import StoppingCriteria
|
| 9 |
+
|
| 10 |
+
# Sentinel integer ids (negative, so they cannot collide with real vocabulary ids).
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
GANDALF_TOKEN_INDEX = -300

# Literal token strings.
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "</s>"
DEFAULT_UNK_TOKEN = "<unk>"
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
DEFAULT_VIDEO_TOKEN = "<video>"
DEFAULT_VIDEO_FRAME_TOKEN = "<vi_frame>"
DEFAULT_VI_START_TOKEN = "<vi_start>"
DEFAULT_VI_END_TOKEN = "<vi_end>"
DEFAULT_EOC_TOKEN = "<eoc>"
COR_START_TOKEN = "<cor>"
# The backslash is escaped explicitly: a bare "\c" is an invalid escape sequence
# that newer Python versions warn about. The resulting value is unchanged
# (six characters: '<', '\', 'c', 'o', 'r', '>').
COR_END_TOKEN = "<\\cor>"
SEQ_MAX_LEN = 50000
# PNG-encoded bytes; the IHDR chunk declares a 3x3 image.
# NOTE(review): the name suggests an all-black placeholder image - confirm against callers.
BLACK_IMG_ENV = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x03\x00\x00\x00\x03\x08\x02\x00\x00\x00\xd9J"\xe8\x00\x00\x00\x12IDAT\x08\x1dcd\x80\x01F\x06\x18`d\x80\x01\x00\x00Z\x00\x04we\x03N\x00\x00\x00\x00IEND\xaeB`\x82'
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
    """
    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.

    Args:
        image_size (tuple): The size of the input image in the format (width, height).
        grid_pinpoints (str | list | tuple): The candidate resolutions, given as one of:
            * a list/tuple of (width, height) pairs, used as-is;
            * a string containing "x" with "(AxB)" entries - the first and last
              entries bound an inclusive range of grid sizes, each multiplied by
              ``patch_size``;
            * any other string, parsed with ``ast.literal_eval``.
        patch_size (int): The size of each image patch.

    Returns:
        tuple: The shape of the image patch grid in the format (width, height).

    Raises:
        AssertionError: If a range string is given and ``patch_size`` is not one of
            224, 336, 384, 448, 512.
    """
    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
        assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
        # Use regex to extract the range from the input string
        matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
        range_start = tuple(map(int, matches[0]))
        range_end = tuple(map(int, matches[-1]))
        # Enumerate every grid between range_start and range_end (inclusive),
        # already scaled from grid units to pixels.
        grid_pinpoints = [
            [i * patch_size, j * patch_size]
            for i in range(range_start[0], range_end[0] + 1)
            for j in range(range_start[1], range_end[1] + 1)
        ]

    # isinstance (rather than an exact type check) so tuples and list subclasses
    # are accepted directly instead of being handed to ast.literal_eval.
    if isinstance(grid_pinpoints, (list, tuple)):
        possible_resolutions = grid_pinpoints
    else:
        possible_resolutions = ast.literal_eval(grid_pinpoints)

    width, height = select_best_resolution(image_size, possible_resolutions)
    return width // patch_size, height // patch_size
|
| 62 |
+
|
| 63 |
+
def select_best_resolution(original_size, possible_resolutions):
    """
    Pick the candidate resolution that fits the original image best.

    A candidate is scored by the number of image pixels that survive an
    aspect-preserving fit into it (capped at the original pixel count); ties
    are broken by the smaller amount of unused candidate area.

    Args:
        original_size (tuple): The original size of the image in the format (width, height).
        possible_resolutions (list): Candidate resolutions in the format
            [(width1, height1), (width2, height2), ...].

    Returns:
        tuple: The best fit resolution in the format (width, height), or None
        when no candidate scores above zero (e.g. an empty candidate list).
    """
    src_w, src_h = original_size

    chosen = None
    best_effective = 0
    least_waste = float("inf")

    for cand_w, cand_h in possible_resolutions:
        # Aspect-preserving fit of the original inside this candidate.
        ratio = min(cand_w / src_w, cand_h / src_h)
        fit_w = int(src_w * ratio)
        fit_h = int(src_h * ratio)

        effective = min(fit_w * fit_h, src_w * src_h)
        waste = (cand_w * cand_h) - effective

        # Skip candidates that do not beat the current best.
        if effective < best_effective:
            continue
        if effective == best_effective and waste >= least_waste:
            continue

        chosen = (cand_w, cand_h)
        best_effective = effective
        least_waste = waste

    return chosen
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def unpad_image(tensor, original_size):
    """
    Unpads a PyTorch tensor of a padded and resized image.

    Args:
        tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format.
        original_size (tuple): The original size of the image as (width, height).
            Note the order: the value is unpacked width first, matching
            ``PIL.Image.size``.

    Returns:
        torch.Tensor: The unpadded image tensor (a slice of ``tensor``).
    """
    original_width, original_height = original_size
    current_height, current_width = tensor.shape[1:]

    # Compute aspect ratios
    original_aspect_ratio = original_width / original_height
    current_aspect_ratio = current_width / current_height

    # Determine padding size and direction
    if original_aspect_ratio > current_aspect_ratio:
        # Original is relatively wider: padding was added to the height
        scale_factor = current_width / original_width
        new_height = int(original_height * scale_factor)
        padding = (current_height - new_height) // 2
        # The same amount is trimmed from top and bottom, so an odd total
        # padding leaves one extra row in the result.
        return tensor[:, padding: current_height - padding, :]

    # Padding was added to the width
    scale_factor = current_height / original_height
    new_width = int(original_width * scale_factor)
    padding = (current_width - new_width) // 2
    return tensor[:, :, padding: current_width - padding]
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def process_anyres_image(image, processor, grid_pinpoints):
    """
    Process an image with variable resolutions.

    The image is resized and padded to the best-fitting candidate resolution,
    cut into square tiles, and each tile - preceded by a square-resized copy of
    the whole image - is run through ``processor.preprocess``.

    Args:
        image (PIL.Image.Image): The input image to be processed.
        processor: The image processor object. Its ``size`` attribute supplies
            the tile edge: ``size["height"]`` if present, else
            ``size["shortest_edge"]`` for dict sizes, or ``min(size)`` otherwise.
        grid_pinpoints (str | list | tuple): Candidate resolutions, either a
            list/tuple of (width, height) pairs, a range string containing
            "(AxB)" entries, or a string literal of a list.

    Returns:
        list: One ``pixel_values`` entry per patch; index 0 is the resized full
        image, followed by the tiles in row-major order.

    Raises:
        AssertionError: If a range string is given and the tile edge is not one
            of 224, 336, 384, 448, 512.
    """
    # Resolve the tile edge once so every later step agrees on it. Previously
    # a fallback to "shortest_edge" existed only in the range-string branch and
    # the tiling step indexed size["height"] unconditionally.
    size = processor.size
    if isinstance(size, dict):
        patch_size = size["height"] if "height" in size else size["shortest_edge"]
    else:
        patch_size = min(size)

    # Convert grid_pinpoints from string to list
    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
        assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
        # Use regex to extract the range from the input string
        matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
        range_start = tuple(map(int, matches[0]))
        range_end = tuple(map(int, matches[-1]))
        # Every grid between range_start and range_end (inclusive), in pixels.
        grid_pinpoints = [
            [i * patch_size, j * patch_size]
            for i in range(range_start[0], range_end[0] + 1)
            for j in range(range_start[1], range_end[1] + 1)
        ]

    if isinstance(grid_pinpoints, (list, tuple)):
        possible_resolutions = grid_pinpoints
    else:
        possible_resolutions = ast.literal_eval(grid_pinpoints)

    best_resolution = select_best_resolution(image.size, possible_resolutions)
    image_padded = resize_and_pad_image(image, best_resolution)
    patches = divide_to_patches(image_padded, patch_size)

    # FIXME: this seems to be a bug that it resizes instead of pad.
    # It is kept as-is for consistency with previous behavior.
    # TODO: ablate against padding the image to a square instead.
    image_original_resize = image.resize((patch_size, patch_size))

    image_patches = [image_original_resize] + patches
    # Returned as a list (not stacked); callers are expected to combine them.
    return [
        processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0]
        for image_patch in image_patches
    ]
|
| 185 |
+
|
| 186 |
+
def resize_and_pad_image(image, target_resolution):
    """
    Fit an image inside a target resolution, preserving aspect ratio, and
    centre it on a black canvas of exactly that resolution.

    Args:
        image (PIL.Image.Image): The input image.
        target_resolution (tuple): The target resolution (width, height) of the image.

    Returns:
        PIL.Image.Image: A new RGB image of size ``target_resolution``.
    """
    src_w, src_h = image.size
    dst_w, dst_h = target_resolution

    ratio_w = dst_w / src_w
    ratio_h = dst_h / src_h

    # The smaller ratio identifies the dimension that gets filled completely;
    # the other one is rounded up but clamped to the target.
    if ratio_w < ratio_h:
        fitted_size = (dst_w, min(math.ceil(src_h * ratio_w), dst_h))
    else:
        fitted_size = (min(math.ceil(src_w * ratio_h), dst_w), dst_h)

    fitted = image.resize(fitted_size)

    # Black background with the resized image pasted at the centre.
    canvas = Image.new("RGB", (dst_w, dst_h), (0, 0, 0))
    offset = ((dst_w - fitted_size[0]) // 2, (dst_h - fitted_size[1]) // 2)
    canvas.paste(fitted, offset)

    return canvas
|
| 221 |
+
|
| 222 |
+
def divide_to_patches(image, patch_size):
    """
    Cut an image into square tiles, row by row.

    Args:
        image (PIL.Image.Image): The input image.
        patch_size (int): The edge length of each tile.

    Returns:
        list: The tiles produced by ``image.crop``, in row-major order
        (top-to-bottom, then left-to-right).
    """
    total_w, total_h = image.size
    return [
        image.crop((left, top, left + patch_size, top + patch_size))
        for top in range(0, total_h, patch_size)
        for left in range(0, total_w, patch_size)
    ]
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
from typing import List
|
| 243 |
+
import PIL.Image
|
| 244 |
+
import torch
|
| 245 |
+
import transformers
|
| 246 |
+
# Sentinel ids used by the Ovis-style helpers below.
IGNORE_ID = -100
IMAGE_TOKEN_ID = -200
IMAGE_TOKEN = "<image>"
IMAGE_ATOM_ID = -300
# Five consecutive indicator ids: -301, -302, -303, -304, -305.
IMAGE_INDICATOR_IDS = list(range(-301, -306, -1))
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
def construct_image_placeholders(grid):
    """
    Build the placeholder id sequence for one image split into ``grid`` cells.

    Layout: indicator[0], one atom, indicator[1]; then, only when the grid has
    more than one cell, one atom per cell with indicator[2] between cells of a
    row and indicator[3] between rows; finally indicator[4].

    Args:
        grid (tuple): (rows, columns) of the partition.

    Returns:
        list: Placeholder ids drawn from IMAGE_ATOM_ID and IMAGE_INDICATOR_IDS.
    """
    n_rows, n_cols = grid[0], grid[1]
    tokens = [IMAGE_INDICATOR_IDS[0], IMAGE_ATOM_ID, IMAGE_INDICATOR_IDS[1]]

    if n_rows * n_cols > 1:
        # Every row has the same shape: atoms joined by the column separator.
        row_tokens = [IMAGE_ATOM_ID]
        for _ in range(n_cols - 1):
            row_tokens += [IMAGE_INDICATOR_IDS[2], IMAGE_ATOM_ID]

        for row in range(n_rows):
            tokens.extend(row_tokens)
            if row < n_rows - 1:
                tokens.append(IMAGE_INDICATOR_IDS[3])

    tokens.append(IMAGE_INDICATOR_IDS[4])
    return tokens
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def preprocess_image_ovis(image: PIL.Image.Image, image_processor, crop_size, max_partition=9, covering_threshold=0.9, convert_to_rgb=True):
    """
    Partition an image into a grid of crops and preprocess each crop to a square.

    A (rows, cols) grid with at most ``max_partition`` cells is chosen from how
    much of the image area its cells retain once each is shrunk to fit a
    ``crop_size`` square. Each crop is resized so its longer edge is
    ``crop_size`` and centred on a zero-filled square tensor.

    Args:
        image (PIL.Image.Image): The input image.
        image_processor: Object whose ``preprocess(img, size=..., return_tensors='pt')``
            returns a mapping with a ``'pixel_values'`` tensor
            (assumed to be 1x3xHxW - TODO confirm against the processor in use).
        crop_size (int): Edge length of the square output for every crop.
        max_partition (int): Maximum number of grid cells.
        covering_threshold (float): A grid counts as "good" when its covering
            ratio is strictly greater than this value.
        convert_to_rgb (bool): Convert non-RGB images to RGB first.

    Returns:
        tuple: ``(pixel_values, image_placeholders)`` where ``pixel_values`` is a
        list of ``[1, 3, crop_size, crop_size]`` tensors (the whole image first
        when there is more than one crop, then the crops in row-major order) and
        ``image_placeholders`` is the id list from ``construct_image_placeholders``.
    """

    def _preprocess(img: PIL.Image.Image, side):
        # first resize (longer edge -> side, aspect ratio kept) and preprocess
        w, h = img.size
        if w == h:
            new_width = new_height = side
        elif w > h:
            new_width = side
            new_height = int(h / w * new_width)
        else:
            new_height = side
            new_width = int(w / h * new_height)
        new_size = dict(height=new_height, width=new_width)
        pixel_values = image_processor.preprocess(img, size=new_size, return_tensors='pt')['pixel_values']

        # then pad to square, centring the content along the shorter axis
        square_values = torch.zeros([1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device)
        new_height, new_width = pixel_values.shape[2:]
        if new_height == new_width:
            square_values[:, :, :, :] = pixel_values
        elif new_height > new_width:
            from_index = (side - new_width) // 2
            square_values[:, :, :, from_index:from_index + new_width] = pixel_values
        else:
            from_index = (side - new_height) // 2
            square_values[:, :, from_index:from_index + new_height, :] = pixel_values

        return square_values

    def _partition(img, grid):
        # Boxes (left, upper, right, lower) in row-major order; the last row and
        # column absorb the remainder of the integer division.
        w, h = img.size
        row_height = h // grid[0]
        col_width = w // grid[1]

        partition = []
        for row in range(grid[0]):
            for col in range(grid[1]):
                left = col * col_width
                upper = row * row_height
                right = w if col == grid[1] - 1 else (col + 1) * col_width
                lower = h if row == grid[0] - 1 else (row + 1) * row_height
                partition.append((left, upper, right, lower))

        return partition

    def _covering_area(left, upper, right, lower, side):
        # Area the box keeps after its longer edge is shrunk to at most `side`.
        w = right - left
        h = lower - upper
        w, h = max(w, h), min(w, h)
        if w > side:
            h = h / w * side
            w = side
        return w * h

    def _get_best_grid(img, side):
        img_area = img.size[0] * img.size[1]

        candidate_grids = [
            (i, j)
            for i in range(1, max_partition + 1)
            for j in range(1, max_partition + 1)
            if i * j <= max_partition
        ]

        all_grids = []
        good_grids = []
        for grid in candidate_grids:
            partition = _partition(img, grid)
            covering_ratio = sum(_covering_area(*p, side) for p in partition) / img_area
            assert covering_ratio <= 1.0
            all_grids.append((grid, covering_ratio))
            if covering_ratio > covering_threshold:
                good_grids.append((grid, covering_ratio))

        # min() returns the first minimal element, the same one a stable sort
        # would place at index 0.
        if good_grids:
            # pick the good partition with minimum #sub_images and break the tie using covering_ratio
            return min(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0]
        # pick the partition with maximum covering_ratio and break the tie using #sub_images
        return min(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0]

    if convert_to_rgb and image.mode != 'RGB':
        image = image.convert('RGB')

    side = crop_size
    grid = _get_best_grid(image, side)
    partition = _partition(image, grid)
    crops = [image.crop(p) for p in partition]
    if len(crops) > 1:
        # Put the full image in front of the crops.
        crops.insert(0, image)
    pixel_values = [_preprocess(crop, side) for crop in crops]  # cat in the outer function
    image_placeholders = construct_image_placeholders(grid)
    return pixel_values, image_placeholders
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
def ovis_template_process(data_dict):
    """
    Expand every image token in ``input_ids`` into that image's placeholder ids.

    Reads ``data_dict['images']`` (a sequence of pairs whose second element is
    the placeholder id list), ``data_dict['input_ids']`` and
    ``data_dict['labels']``. Each position equal to IMAGE_TOKEN_ID is replaced
    by the placeholders of the next image, with -100 as the label for every
    inserted position; all other positions are copied through.

    The dict is updated in place: ``'images'`` keeps only the first element of
    each pair, and ``'input_ids'`` / ``'labels'`` become new tensors.

    Args:
        data_dict (dict): Sample with ``'images'``, ``'input_ids'``, ``'labels'``.

    Returns:
        dict: The same ``data_dict`` object.

    Raises:
        AssertionError: If the number of images differs from the number of
            image tokens.
    """
    image_entries = data_dict['images']
    token_ids = data_dict['input_ids']
    label_ids = data_dict['labels']

    per_image_placeholders = [entry[1] for entry in image_entries]

    image_positions = torch.nonzero(token_ids == IMAGE_TOKEN_ID).squeeze(1)
    assert len(per_image_placeholders) == len(image_positions)

    expanded_ids = []
    expanded_labels = []
    consumed = 0
    for position, token in enumerate(token_ids):
        if token == IMAGE_TOKEN_ID:
            span = per_image_placeholders[consumed]
            for placeholder_id in span:
                expanded_ids.append(placeholder_id)
                expanded_labels.append(-100)
            consumed += 1
        else:
            expanded_ids.append(token_ids[position])
            expanded_labels.append(label_ids[position])

    assert len(expanded_ids) == len(expanded_labels)
    assert len(per_image_placeholders) == consumed

    data_dict['images'] = [entry[0] for entry in data_dict['images']]  # (3,3,448,448)
    data_dict['input_ids'] = torch.tensor(expanded_ids)
    data_dict['labels'] = torch.tensor(expanded_labels)
    return data_dict
|
| 401 |
+
|
| 402 |
+
|
| 403 |
+
def pad_truncate_sequence(multimodal_max_length, sequences: List[torch.Tensor], batch_first: bool = True, padding_value: float = 0.0, left_padding: bool = False) -> torch.Tensor:
    """
    Pad a list of sequences to a common length and truncate along dim 1.

    Args:
        multimodal_max_length (int): Maximum number of columns to keep.
        sequences (List[torch.Tensor]): Sequences to batch.
        batch_first (bool): Forwarded to ``pad_sequence`` for right padding.
            The left-padding path always builds a batch-first tensor.
        padding_value (float): Fill value for padded positions.
        left_padding (bool): Pad on the left instead of the right.

    Returns:
        torch.Tensor: The padded batch, limited to ``multimodal_max_length``
        columns. Right padding keeps the first columns; left padding keeps the
        last columns, so the padding is dropped before real tokens are.
    """
    if not left_padding:
        padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=batch_first, padding_value=padding_value)
        return padded[:, :multimodal_max_length]

    # pad_sequence only pads on the right: reverse each sequence, pad, then
    # reverse the batch back so the padding ends up on the left.
    reversed_sequences = [seq.flip(dims=[0]) for seq in sequences]
    padded = torch.nn.utils.rnn.pad_sequence(
        reversed_sequences, batch_first=True, padding_value=padding_value
    ).flip(dims=[1])

    # Keep the LAST multimodal_max_length columns. Slicing from
    # `multimodal_max_length:` would instead discard the first max_length
    # columns and return an empty tensor for any batch shorter than the limit.
    start = max(padded.shape[1] - multimodal_max_length, 0)
    return padded[:, start:]
|