Hyggge commited on
Commit
64c250f
·
1 Parent(s): 93b1113

feat: modify file type of *.py, *.txt, etc. to change storage method

Browse files
.gitattributes CHANGED
@@ -38,3 +38,4 @@ model-00004-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
38
  model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
39
  model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
40
  valley_structure.png filter=lfs diff=lfs merge=lfs -text
 
 
38
  model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
39
  model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
40
  valley_structure.png filter=lfs diff=lfs merge=lfs -text
41
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json CHANGED
@@ -1,3 +1,34 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:87e5cd31a0e03650b23635178999a8a3942978e9270f041f3cf33ee3270c252f
3
- size 839
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<\\cor>": 151674,
6
+ "<cor>": 151673,
7
+ "<im_end>": 151670,
8
+ "<im_start>": 151669,
9
+ "<think>": 151667,
10
+ "<tool_call>": 151657,
11
+ "<tool_response>": 151665,
12
+ "<vi_end>": 151672,
13
+ "<vi_start>": 151671,
14
+ "<|box_end|>": 151649,
15
+ "<|box_start|>": 151648,
16
+ "<|endoftext|>": 151643,
17
+ "<|file_sep|>": 151664,
18
+ "<|fim_middle|>": 151660,
19
+ "<|fim_pad|>": 151662,
20
+ "<|fim_prefix|>": 151659,
21
+ "<|fim_suffix|>": 151661,
22
+ "<|im_end|>": 151645,
23
+ "<|im_start|>": 151644,
24
+ "<|image_pad|>": 151655,
25
+ "<|object_ref_end|>": 151647,
26
+ "<|object_ref_start|>": 151646,
27
+ "<|quad_end|>": 151651,
28
+ "<|quad_start|>": 151650,
29
+ "<|repo_name|>": 151663,
30
+ "<|video_pad|>": 151656,
31
+ "<|vision_end|>": 151653,
32
+ "<|vision_pad|>": 151654,
33
+ "<|vision_start|>": 151652
34
+ }
chat_template.jinja CHANGED
@@ -1,3 +1,85 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:87a2728cb8dc9fe424d624542f6060ec05a1d285ebbec578bb078900e33396b5
3
- size 4116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
27
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
28
+ {%- elif message.role == "assistant" %}
29
+ {%- set content = message.content %}
30
+ {%- set reasoning_content = '' %}
31
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
32
+ {%- set reasoning_content = message.reasoning_content %}
33
+ {%- else %}
34
+ {%- if '</think>' in message.content %}
35
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
36
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
37
+ {%- endif %}
38
+ {%- endif %}
39
+ {%- if loop.index0 > ns.last_query_index %}
40
+ {%- if loop.last or (not loop.last and reasoning_content) %}
41
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
42
+ {%- else %}
43
+ {{- '<|im_start|>' + message.role + '\n' + content }}
44
+ {%- endif %}
45
+ {%- else %}
46
+ {{- '<|im_start|>' + message.role + '\n' + content }}
47
+ {%- endif %}
48
+ {%- if message.tool_calls %}
49
+ {%- for tool_call in message.tool_calls %}
50
+ {%- if (loop.first and content) or (not loop.first) %}
51
+ {{- '\n' }}
52
+ {%- endif %}
53
+ {%- if tool_call.function %}
54
+ {%- set tool_call = tool_call.function %}
55
+ {%- endif %}
56
+ {{- '<tool_call>\n{"name": "' }}
57
+ {{- tool_call.name }}
58
+ {{- '", "arguments": ' }}
59
+ {%- if tool_call.arguments is string %}
60
+ {{- tool_call.arguments }}
61
+ {%- else %}
62
+ {{- tool_call.arguments | tojson }}
63
+ {%- endif %}
64
+ {{- '}\n</tool_call>' }}
65
+ {%- endfor %}
66
+ {%- endif %}
67
+ {{- '<|im_end|>\n' }}
68
+ {%- elif message.role == "tool" %}
69
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
70
+ {{- '<|im_start|>user' }}
71
+ {%- endif %}
72
+ {{- '\n<tool_response>\n' }}
73
+ {{- message.content }}
74
+ {{- '\n</tool_response>' }}
75
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
76
+ {{- '<|im_end|>\n' }}
77
+ {%- endif %}
78
+ {%- endif %}
79
+ {%- endfor %}
80
+ {%- if add_generation_prompt %}
81
+ {{- '<|im_start|>assistant\n' }}
82
+ {%- if enable_thinking is defined and enable_thinking is false %}
83
+ {{- '<think>\n\n</think>\n\n' }}
84
+ {%- endif %}
85
+ {%- endif %}
chat_template.json CHANGED
@@ -1,3 +1,3 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8eedba4e39df3e45cccc86e7681c3c58fd90199fb601a8ab2b430be8b89bf8b3
3
- size 4306
 
1
+ {
2
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = 
message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}"
3
+ }
generation_config.json CHANGED
@@ -1,3 +1,7 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f307d8ac4048390cd67f8cd0111b62d14b82b613213c4ec76aa6e9873d8505e1
3
- size 142
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": 151645,
4
+ "pad_token_id": 151643,
5
+ "transformers_version": "4.54.0",
6
+ "use_cache": true
7
+ }
merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
model.safetensors.index.json CHANGED
@@ -1,3 +1,798 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:47309e187318e605ae100a165c50e5202a12171431bd91c4d5ee691942d7d5f9
3
- size 69452
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 9423832576,
4
+ "total_size": 18847665152
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.input_layernorm.weight": "model-00003-of-00004.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
12
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
18
+ "model.layers.0.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
28
+ "model.layers.1.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
30
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
31
+ "model.layers.10.input_layernorm.weight": "model-00003-of-00004.safetensors",
32
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
33
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
34
+ "model.layers.10.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
35
+ "model.layers.10.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
36
+ "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
37
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
38
+ "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
39
+ "model.layers.10.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
40
+ "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
41
+ "model.layers.10.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
42
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.11.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
44
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
45
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
46
+ "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
47
+ "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
48
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
49
+ "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
50
+ "model.layers.11.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
51
+ "model.layers.11.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
52
+ "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
53
+ "model.layers.12.input_layernorm.weight": "model-00004-of-00004.safetensors",
54
+ "model.layers.12.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
55
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
56
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
58
+ "model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
60
+ "model.layers.12.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
61
+ "model.layers.12.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
62
+ "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
63
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors",
65
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
67
+ "model.layers.13.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
68
+ "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
69
+ "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
70
+ "model.layers.13.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
71
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
73
+ "model.layers.13.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
74
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.14.input_layernorm.weight": "model-00003-of-00004.safetensors",
76
+ "model.layers.14.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
77
+ "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
78
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.14.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
80
+ "model.layers.14.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
81
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
82
+ "model.layers.14.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
83
+ "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
84
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
86
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
89
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
93
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
96
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
97
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00004.safetensors",
98
+ "model.layers.16.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
99
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
101
+ "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
102
+ "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
103
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
105
+ "model.layers.16.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
106
+ "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
107
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
108
+ "model.layers.17.input_layernorm.weight": "model-00004-of-00004.safetensors",
109
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
110
+ "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
111
+ "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
112
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
113
+ "model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
114
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
115
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
116
+ "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
117
+ "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
118
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
119
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00004.safetensors",
120
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
121
+ "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
122
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
123
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
124
+ "model.layers.18.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
125
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
126
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
127
+ "model.layers.18.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
128
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
129
+ "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.19.input_layernorm.weight": "model-00004-of-00004.safetensors",
131
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
133
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
134
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
135
+ "model.layers.19.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
136
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
137
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
138
+ "model.layers.19.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
139
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
140
+ "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
141
+ "model.layers.2.input_layernorm.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.2.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
143
+ "model.layers.2.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
144
+ "model.layers.2.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
145
+ "model.layers.2.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.2.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
147
+ "model.layers.2.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
149
+ "model.layers.2.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
150
+ "model.layers.2.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
151
+ "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
152
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
153
+ "model.layers.20.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
154
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
155
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
156
+ "model.layers.20.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
157
+ "model.layers.20.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
158
+ "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
159
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.layers.20.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
161
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
162
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
163
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
164
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
165
+ "model.layers.21.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
166
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
167
+ "model.layers.21.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
168
+ "model.layers.21.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
169
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
170
+ "model.layers.21.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
171
+ "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
172
+ "model.layers.21.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
173
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
174
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
176
+ "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
177
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
178
+ "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
179
+ "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
180
+ "model.layers.22.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
181
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
182
+ "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
183
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
184
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
185
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00004.safetensors",
186
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
187
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
188
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
189
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
190
+ "model.layers.23.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
191
+ "model.layers.23.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
192
+ "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
193
+ "model.layers.23.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
194
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
195
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00004.safetensors",
197
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
198
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
199
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
200
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
201
+ "model.layers.24.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
202
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.24.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
204
+ "model.layers.24.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
205
+ "model.layers.24.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
206
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.25.input_layernorm.weight": "model-00004-of-00004.safetensors",
208
+ "model.layers.25.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
209
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
210
+ "model.layers.25.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
211
+ "model.layers.25.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
212
+ "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
213
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
216
+ "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
217
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
218
+ "model.layers.26.input_layernorm.weight": "model-00004-of-00004.safetensors",
219
+ "model.layers.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
220
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
221
+ "model.layers.26.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
222
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
223
+ "model.layers.26.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
224
+ "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
225
+ "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
226
+ "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
227
+ "model.layers.26.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
228
+ "model.layers.26.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
229
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00004.safetensors",
230
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
232
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
233
+ "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
234
+ "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
235
+ "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
236
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
237
+ "model.layers.27.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
238
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
239
+ "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
240
+ "model.layers.28.input_layernorm.weight": "model-00001-of-00004.safetensors",
241
+ "model.layers.28.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
242
+ "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
243
+ "model.layers.28.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
244
+ "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
245
+ "model.layers.28.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
246
+ "model.layers.28.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
247
+ "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
248
+ "model.layers.28.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
249
+ "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
250
+ "model.layers.28.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
251
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
252
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
253
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
254
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
255
+ "model.layers.29.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
256
+ "model.layers.29.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
257
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
258
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
259
+ "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
260
+ "model.layers.29.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
261
+ "model.layers.29.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.3.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
264
+ "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
265
+ "model.layers.3.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
266
+ "model.layers.3.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
267
+ "model.layers.3.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
268
+ "model.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
269
+ "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
270
+ "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
272
+ "model.layers.3.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
273
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
274
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
275
+ "model.layers.30.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.30.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
277
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
278
+ "model.layers.30.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
279
+ "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
280
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
281
+ "model.layers.30.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
282
+ "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
283
+ "model.layers.30.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
284
+ "model.layers.31.input_layernorm.weight": "model-00002-of-00004.safetensors",
285
+ "model.layers.31.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
286
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
287
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
288
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
289
+ "model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
290
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
291
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
292
+ "model.layers.31.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
293
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
294
+ "model.layers.31.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
295
+ "model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors",
296
+ "model.layers.32.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
297
+ "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
298
+ "model.layers.32.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
299
+ "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
300
+ "model.layers.32.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
301
+ "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
302
+ "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
303
+ "model.layers.32.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
304
+ "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
305
+ "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
306
+ "model.layers.33.input_layernorm.weight": "model-00004-of-00004.safetensors",
307
+ "model.layers.33.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
308
+ "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
309
+ "model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
310
+ "model.layers.33.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
311
+ "model.layers.33.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
312
+ "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
313
+ "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
314
+ "model.layers.33.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
315
+ "model.layers.33.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
316
+ "model.layers.33.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
317
+ "model.layers.34.input_layernorm.weight": "model-00001-of-00004.safetensors",
318
+ "model.layers.34.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
319
+ "model.layers.34.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
320
+ "model.layers.34.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
321
+ "model.layers.34.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
322
+ "model.layers.34.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
323
+ "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
324
+ "model.layers.34.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
325
+ "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
326
+ "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
327
+ "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
328
+ "model.layers.35.input_layernorm.weight": "model-00003-of-00004.safetensors",
329
+ "model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
330
+ "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
331
+ "model.layers.35.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
332
+ "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
333
+ "model.layers.35.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
334
+ "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
335
+ "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
336
+ "model.layers.35.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
337
+ "model.layers.35.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
338
+ "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
339
+ "model.layers.4.input_layernorm.weight": "model-00002-of-00004.safetensors",
340
+ "model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
341
+ "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
342
+ "model.layers.4.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
343
+ "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
344
+ "model.layers.4.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
345
+ "model.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
346
+ "model.layers.4.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
347
+ "model.layers.4.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
348
+ "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
349
+ "model.layers.4.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
350
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
351
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
352
+ "model.layers.5.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
353
+ "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
354
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
355
+ "model.layers.5.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
356
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
357
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
358
+ "model.layers.5.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
359
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
360
+ "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
361
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
362
+ "model.layers.6.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
363
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
364
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
365
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
366
+ "model.layers.6.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
367
+ "model.layers.6.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
368
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
369
+ "model.layers.6.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
370
+ "model.layers.6.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
371
+ "model.layers.6.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
372
+ "model.layers.7.input_layernorm.weight": "model-00003-of-00004.safetensors",
373
+ "model.layers.7.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
374
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
375
+ "model.layers.7.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
376
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
377
+ "model.layers.7.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
378
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
379
+ "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
380
+ "model.layers.7.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
381
+ "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
382
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
383
+ "model.layers.8.input_layernorm.weight": "model-00004-of-00004.safetensors",
384
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
385
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
386
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
387
+ "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
388
+ "model.layers.8.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
389
+ "model.layers.8.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
390
+ "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
391
+ "model.layers.8.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
392
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
393
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
394
+ "model.layers.9.input_layernorm.weight": "model-00003-of-00004.safetensors",
395
+ "model.layers.9.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
396
+ "model.layers.9.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
397
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
398
+ "model.layers.9.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
399
+ "model.layers.9.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
400
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
401
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
402
+ "model.layers.9.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
403
+ "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
404
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
405
+ "model.norm.weight": "model-00002-of-00004.safetensors",
406
+ "model.qwen2vl_vision_tower.blocks.0.attn.proj.bias": "model-00004-of-00004.safetensors",
407
+ "model.qwen2vl_vision_tower.blocks.0.attn.proj.weight": "model-00003-of-00004.safetensors",
408
+ "model.qwen2vl_vision_tower.blocks.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
409
+ "model.qwen2vl_vision_tower.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
410
+ "model.qwen2vl_vision_tower.blocks.0.mlp.fc1.bias": "model-00001-of-00004.safetensors",
411
+ "model.qwen2vl_vision_tower.blocks.0.mlp.fc1.weight": "model-00002-of-00004.safetensors",
412
+ "model.qwen2vl_vision_tower.blocks.0.mlp.fc2.bias": "model-00004-of-00004.safetensors",
413
+ "model.qwen2vl_vision_tower.blocks.0.mlp.fc2.weight": "model-00002-of-00004.safetensors",
414
+ "model.qwen2vl_vision_tower.blocks.0.norm1.bias": "model-00003-of-00004.safetensors",
415
+ "model.qwen2vl_vision_tower.blocks.0.norm1.weight": "model-00003-of-00004.safetensors",
416
+ "model.qwen2vl_vision_tower.blocks.0.norm2.bias": "model-00001-of-00004.safetensors",
417
+ "model.qwen2vl_vision_tower.blocks.0.norm2.weight": "model-00004-of-00004.safetensors",
418
+ "model.qwen2vl_vision_tower.blocks.1.attn.proj.bias": "model-00002-of-00004.safetensors",
419
+ "model.qwen2vl_vision_tower.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors",
420
+ "model.qwen2vl_vision_tower.blocks.1.attn.qkv.bias": "model-00002-of-00004.safetensors",
421
+ "model.qwen2vl_vision_tower.blocks.1.attn.qkv.weight": "model-00004-of-00004.safetensors",
422
+ "model.qwen2vl_vision_tower.blocks.1.mlp.fc1.bias": "model-00001-of-00004.safetensors",
423
+ "model.qwen2vl_vision_tower.blocks.1.mlp.fc1.weight": "model-00001-of-00004.safetensors",
424
+ "model.qwen2vl_vision_tower.blocks.1.mlp.fc2.bias": "model-00003-of-00004.safetensors",
425
+ "model.qwen2vl_vision_tower.blocks.1.mlp.fc2.weight": "model-00002-of-00004.safetensors",
426
+ "model.qwen2vl_vision_tower.blocks.1.norm1.bias": "model-00001-of-00004.safetensors",
427
+ "model.qwen2vl_vision_tower.blocks.1.norm1.weight": "model-00001-of-00004.safetensors",
428
+ "model.qwen2vl_vision_tower.blocks.1.norm2.bias": "model-00003-of-00004.safetensors",
429
+ "model.qwen2vl_vision_tower.blocks.1.norm2.weight": "model-00002-of-00004.safetensors",
430
+ "model.qwen2vl_vision_tower.blocks.10.attn.proj.bias": "model-00004-of-00004.safetensors",
431
+ "model.qwen2vl_vision_tower.blocks.10.attn.proj.weight": "model-00002-of-00004.safetensors",
432
+ "model.qwen2vl_vision_tower.blocks.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
433
+ "model.qwen2vl_vision_tower.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
434
+ "model.qwen2vl_vision_tower.blocks.10.mlp.fc1.bias": "model-00004-of-00004.safetensors",
435
+ "model.qwen2vl_vision_tower.blocks.10.mlp.fc1.weight": "model-00001-of-00004.safetensors",
436
+ "model.qwen2vl_vision_tower.blocks.10.mlp.fc2.bias": "model-00004-of-00004.safetensors",
437
+ "model.qwen2vl_vision_tower.blocks.10.mlp.fc2.weight": "model-00004-of-00004.safetensors",
438
+ "model.qwen2vl_vision_tower.blocks.10.norm1.bias": "model-00002-of-00004.safetensors",
439
+ "model.qwen2vl_vision_tower.blocks.10.norm1.weight": "model-00004-of-00004.safetensors",
440
+ "model.qwen2vl_vision_tower.blocks.10.norm2.bias": "model-00002-of-00004.safetensors",
441
+ "model.qwen2vl_vision_tower.blocks.10.norm2.weight": "model-00001-of-00004.safetensors",
442
+ "model.qwen2vl_vision_tower.blocks.11.attn.proj.bias": "model-00003-of-00004.safetensors",
443
+ "model.qwen2vl_vision_tower.blocks.11.attn.proj.weight": "model-00003-of-00004.safetensors",
444
+ "model.qwen2vl_vision_tower.blocks.11.attn.qkv.bias": "model-00004-of-00004.safetensors",
445
+ "model.qwen2vl_vision_tower.blocks.11.attn.qkv.weight": "model-00002-of-00004.safetensors",
446
+ "model.qwen2vl_vision_tower.blocks.11.mlp.fc1.bias": "model-00004-of-00004.safetensors",
447
+ "model.qwen2vl_vision_tower.blocks.11.mlp.fc1.weight": "model-00001-of-00004.safetensors",
448
+ "model.qwen2vl_vision_tower.blocks.11.mlp.fc2.bias": "model-00002-of-00004.safetensors",
449
+ "model.qwen2vl_vision_tower.blocks.11.mlp.fc2.weight": "model-00002-of-00004.safetensors",
450
+ "model.qwen2vl_vision_tower.blocks.11.norm1.bias": "model-00002-of-00004.safetensors",
451
+ "model.qwen2vl_vision_tower.blocks.11.norm1.weight": "model-00003-of-00004.safetensors",
452
+ "model.qwen2vl_vision_tower.blocks.11.norm2.bias": "model-00001-of-00004.safetensors",
453
+ "model.qwen2vl_vision_tower.blocks.11.norm2.weight": "model-00001-of-00004.safetensors",
454
+ "model.qwen2vl_vision_tower.blocks.12.attn.proj.bias": "model-00002-of-00004.safetensors",
455
+ "model.qwen2vl_vision_tower.blocks.12.attn.proj.weight": "model-00002-of-00004.safetensors",
456
+ "model.qwen2vl_vision_tower.blocks.12.attn.qkv.bias": "model-00002-of-00004.safetensors",
457
+ "model.qwen2vl_vision_tower.blocks.12.attn.qkv.weight": "model-00004-of-00004.safetensors",
458
+ "model.qwen2vl_vision_tower.blocks.12.mlp.fc1.bias": "model-00002-of-00004.safetensors",
459
+ "model.qwen2vl_vision_tower.blocks.12.mlp.fc1.weight": "model-00003-of-00004.safetensors",
460
+ "model.qwen2vl_vision_tower.blocks.12.mlp.fc2.bias": "model-00002-of-00004.safetensors",
461
+ "model.qwen2vl_vision_tower.blocks.12.mlp.fc2.weight": "model-00004-of-00004.safetensors",
462
+ "model.qwen2vl_vision_tower.blocks.12.norm1.bias": "model-00003-of-00004.safetensors",
463
+ "model.qwen2vl_vision_tower.blocks.12.norm1.weight": "model-00001-of-00004.safetensors",
464
+ "model.qwen2vl_vision_tower.blocks.12.norm2.bias": "model-00001-of-00004.safetensors",
465
+ "model.qwen2vl_vision_tower.blocks.12.norm2.weight": "model-00002-of-00004.safetensors",
466
+ "model.qwen2vl_vision_tower.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors",
467
+ "model.qwen2vl_vision_tower.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors",
468
+ "model.qwen2vl_vision_tower.blocks.13.attn.qkv.bias": "model-00002-of-00004.safetensors",
469
+ "model.qwen2vl_vision_tower.blocks.13.attn.qkv.weight": "model-00002-of-00004.safetensors",
470
+ "model.qwen2vl_vision_tower.blocks.13.mlp.fc1.bias": "model-00002-of-00004.safetensors",
471
+ "model.qwen2vl_vision_tower.blocks.13.mlp.fc1.weight": "model-00002-of-00004.safetensors",
472
+ "model.qwen2vl_vision_tower.blocks.13.mlp.fc2.bias": "model-00003-of-00004.safetensors",
473
+ "model.qwen2vl_vision_tower.blocks.13.mlp.fc2.weight": "model-00004-of-00004.safetensors",
474
+ "model.qwen2vl_vision_tower.blocks.13.norm1.bias": "model-00004-of-00004.safetensors",
475
+ "model.qwen2vl_vision_tower.blocks.13.norm1.weight": "model-00004-of-00004.safetensors",
476
+ "model.qwen2vl_vision_tower.blocks.13.norm2.bias": "model-00004-of-00004.safetensors",
477
+ "model.qwen2vl_vision_tower.blocks.13.norm2.weight": "model-00001-of-00004.safetensors",
478
+ "model.qwen2vl_vision_tower.blocks.14.attn.proj.bias": "model-00002-of-00004.safetensors",
479
+ "model.qwen2vl_vision_tower.blocks.14.attn.proj.weight": "model-00003-of-00004.safetensors",
480
+ "model.qwen2vl_vision_tower.blocks.14.attn.qkv.bias": "model-00002-of-00004.safetensors",
481
+ "model.qwen2vl_vision_tower.blocks.14.attn.qkv.weight": "model-00004-of-00004.safetensors",
482
+ "model.qwen2vl_vision_tower.blocks.14.mlp.fc1.bias": "model-00004-of-00004.safetensors",
483
+ "model.qwen2vl_vision_tower.blocks.14.mlp.fc1.weight": "model-00001-of-00004.safetensors",
484
+ "model.qwen2vl_vision_tower.blocks.14.mlp.fc2.bias": "model-00001-of-00004.safetensors",
485
+ "model.qwen2vl_vision_tower.blocks.14.mlp.fc2.weight": "model-00004-of-00004.safetensors",
486
+ "model.qwen2vl_vision_tower.blocks.14.norm1.bias": "model-00004-of-00004.safetensors",
487
+ "model.qwen2vl_vision_tower.blocks.14.norm1.weight": "model-00002-of-00004.safetensors",
488
+ "model.qwen2vl_vision_tower.blocks.14.norm2.bias": "model-00003-of-00004.safetensors",
489
+ "model.qwen2vl_vision_tower.blocks.14.norm2.weight": "model-00001-of-00004.safetensors",
490
+ "model.qwen2vl_vision_tower.blocks.15.attn.proj.bias": "model-00004-of-00004.safetensors",
491
+ "model.qwen2vl_vision_tower.blocks.15.attn.proj.weight": "model-00002-of-00004.safetensors",
492
+ "model.qwen2vl_vision_tower.blocks.15.attn.qkv.bias": "model-00003-of-00004.safetensors",
493
+ "model.qwen2vl_vision_tower.blocks.15.attn.qkv.weight": "model-00002-of-00004.safetensors",
494
+ "model.qwen2vl_vision_tower.blocks.15.mlp.fc1.bias": "model-00001-of-00004.safetensors",
495
+ "model.qwen2vl_vision_tower.blocks.15.mlp.fc1.weight": "model-00004-of-00004.safetensors",
496
+ "model.qwen2vl_vision_tower.blocks.15.mlp.fc2.bias": "model-00004-of-00004.safetensors",
497
+ "model.qwen2vl_vision_tower.blocks.15.mlp.fc2.weight": "model-00003-of-00004.safetensors",
498
+ "model.qwen2vl_vision_tower.blocks.15.norm1.bias": "model-00002-of-00004.safetensors",
499
+ "model.qwen2vl_vision_tower.blocks.15.norm1.weight": "model-00003-of-00004.safetensors",
500
+ "model.qwen2vl_vision_tower.blocks.15.norm2.bias": "model-00004-of-00004.safetensors",
501
+ "model.qwen2vl_vision_tower.blocks.15.norm2.weight": "model-00002-of-00004.safetensors",
502
+ "model.qwen2vl_vision_tower.blocks.16.attn.proj.bias": "model-00002-of-00004.safetensors",
503
+ "model.qwen2vl_vision_tower.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors",
504
+ "model.qwen2vl_vision_tower.blocks.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
505
+ "model.qwen2vl_vision_tower.blocks.16.attn.qkv.weight": "model-00003-of-00004.safetensors",
506
+ "model.qwen2vl_vision_tower.blocks.16.mlp.fc1.bias": "model-00004-of-00004.safetensors",
507
+ "model.qwen2vl_vision_tower.blocks.16.mlp.fc1.weight": "model-00004-of-00004.safetensors",
508
+ "model.qwen2vl_vision_tower.blocks.16.mlp.fc2.bias": "model-00001-of-00004.safetensors",
509
+ "model.qwen2vl_vision_tower.blocks.16.mlp.fc2.weight": "model-00003-of-00004.safetensors",
510
+ "model.qwen2vl_vision_tower.blocks.16.norm1.bias": "model-00003-of-00004.safetensors",
511
+ "model.qwen2vl_vision_tower.blocks.16.norm1.weight": "model-00002-of-00004.safetensors",
512
+ "model.qwen2vl_vision_tower.blocks.16.norm2.bias": "model-00001-of-00004.safetensors",
513
+ "model.qwen2vl_vision_tower.blocks.16.norm2.weight": "model-00003-of-00004.safetensors",
514
+ "model.qwen2vl_vision_tower.blocks.17.attn.proj.bias": "model-00002-of-00004.safetensors",
515
+ "model.qwen2vl_vision_tower.blocks.17.attn.proj.weight": "model-00004-of-00004.safetensors",
516
+ "model.qwen2vl_vision_tower.blocks.17.attn.qkv.bias": "model-00003-of-00004.safetensors",
517
+ "model.qwen2vl_vision_tower.blocks.17.attn.qkv.weight": "model-00003-of-00004.safetensors",
518
+ "model.qwen2vl_vision_tower.blocks.17.mlp.fc1.bias": "model-00004-of-00004.safetensors",
519
+ "model.qwen2vl_vision_tower.blocks.17.mlp.fc1.weight": "model-00002-of-00004.safetensors",
520
+ "model.qwen2vl_vision_tower.blocks.17.mlp.fc2.bias": "model-00003-of-00004.safetensors",
521
+ "model.qwen2vl_vision_tower.blocks.17.mlp.fc2.weight": "model-00003-of-00004.safetensors",
522
+ "model.qwen2vl_vision_tower.blocks.17.norm1.bias": "model-00004-of-00004.safetensors",
523
+ "model.qwen2vl_vision_tower.blocks.17.norm1.weight": "model-00003-of-00004.safetensors",
524
+ "model.qwen2vl_vision_tower.blocks.17.norm2.bias": "model-00001-of-00004.safetensors",
525
+ "model.qwen2vl_vision_tower.blocks.17.norm2.weight": "model-00003-of-00004.safetensors",
526
+ "model.qwen2vl_vision_tower.blocks.18.attn.proj.bias": "model-00002-of-00004.safetensors",
527
+ "model.qwen2vl_vision_tower.blocks.18.attn.proj.weight": "model-00002-of-00004.safetensors",
528
+ "model.qwen2vl_vision_tower.blocks.18.attn.qkv.bias": "model-00003-of-00004.safetensors",
529
+ "model.qwen2vl_vision_tower.blocks.18.attn.qkv.weight": "model-00002-of-00004.safetensors",
530
+ "model.qwen2vl_vision_tower.blocks.18.mlp.fc1.bias": "model-00004-of-00004.safetensors",
531
+ "model.qwen2vl_vision_tower.blocks.18.mlp.fc1.weight": "model-00002-of-00004.safetensors",
532
+ "model.qwen2vl_vision_tower.blocks.18.mlp.fc2.bias": "model-00004-of-00004.safetensors",
533
+ "model.qwen2vl_vision_tower.blocks.18.mlp.fc2.weight": "model-00002-of-00004.safetensors",
534
+ "model.qwen2vl_vision_tower.blocks.18.norm1.bias": "model-00001-of-00004.safetensors",
535
+ "model.qwen2vl_vision_tower.blocks.18.norm1.weight": "model-00002-of-00004.safetensors",
536
+ "model.qwen2vl_vision_tower.blocks.18.norm2.bias": "model-00004-of-00004.safetensors",
537
+ "model.qwen2vl_vision_tower.blocks.18.norm2.weight": "model-00004-of-00004.safetensors",
538
+ "model.qwen2vl_vision_tower.blocks.19.attn.proj.bias": "model-00004-of-00004.safetensors",
539
+ "model.qwen2vl_vision_tower.blocks.19.attn.proj.weight": "model-00002-of-00004.safetensors",
540
+ "model.qwen2vl_vision_tower.blocks.19.attn.qkv.bias": "model-00004-of-00004.safetensors",
541
+ "model.qwen2vl_vision_tower.blocks.19.attn.qkv.weight": "model-00004-of-00004.safetensors",
542
+ "model.qwen2vl_vision_tower.blocks.19.mlp.fc1.bias": "model-00001-of-00004.safetensors",
543
+ "model.qwen2vl_vision_tower.blocks.19.mlp.fc1.weight": "model-00003-of-00004.safetensors",
544
+ "model.qwen2vl_vision_tower.blocks.19.mlp.fc2.bias": "model-00003-of-00004.safetensors",
545
+ "model.qwen2vl_vision_tower.blocks.19.mlp.fc2.weight": "model-00003-of-00004.safetensors",
546
+ "model.qwen2vl_vision_tower.blocks.19.norm1.bias": "model-00003-of-00004.safetensors",
547
+ "model.qwen2vl_vision_tower.blocks.19.norm1.weight": "model-00004-of-00004.safetensors",
548
+ "model.qwen2vl_vision_tower.blocks.19.norm2.bias": "model-00004-of-00004.safetensors",
549
+ "model.qwen2vl_vision_tower.blocks.19.norm2.weight": "model-00002-of-00004.safetensors",
550
+ "model.qwen2vl_vision_tower.blocks.2.attn.proj.bias": "model-00002-of-00004.safetensors",
551
+ "model.qwen2vl_vision_tower.blocks.2.attn.proj.weight": "model-00003-of-00004.safetensors",
552
+ "model.qwen2vl_vision_tower.blocks.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
553
+ "model.qwen2vl_vision_tower.blocks.2.attn.qkv.weight": "model-00002-of-00004.safetensors",
554
+ "model.qwen2vl_vision_tower.blocks.2.mlp.fc1.bias": "model-00002-of-00004.safetensors",
555
+ "model.qwen2vl_vision_tower.blocks.2.mlp.fc1.weight": "model-00004-of-00004.safetensors",
556
+ "model.qwen2vl_vision_tower.blocks.2.mlp.fc2.bias": "model-00002-of-00004.safetensors",
557
+ "model.qwen2vl_vision_tower.blocks.2.mlp.fc2.weight": "model-00004-of-00004.safetensors",
558
+ "model.qwen2vl_vision_tower.blocks.2.norm1.bias": "model-00001-of-00004.safetensors",
559
+ "model.qwen2vl_vision_tower.blocks.2.norm1.weight": "model-00002-of-00004.safetensors",
560
+ "model.qwen2vl_vision_tower.blocks.2.norm2.bias": "model-00002-of-00004.safetensors",
561
+ "model.qwen2vl_vision_tower.blocks.2.norm2.weight": "model-00001-of-00004.safetensors",
562
+ "model.qwen2vl_vision_tower.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors",
563
+ "model.qwen2vl_vision_tower.blocks.20.attn.proj.weight": "model-00003-of-00004.safetensors",
564
+ "model.qwen2vl_vision_tower.blocks.20.attn.qkv.bias": "model-00003-of-00004.safetensors",
565
+ "model.qwen2vl_vision_tower.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
566
+ "model.qwen2vl_vision_tower.blocks.20.mlp.fc1.bias": "model-00004-of-00004.safetensors",
567
+ "model.qwen2vl_vision_tower.blocks.20.mlp.fc1.weight": "model-00001-of-00004.safetensors",
568
+ "model.qwen2vl_vision_tower.blocks.20.mlp.fc2.bias": "model-00004-of-00004.safetensors",
569
+ "model.qwen2vl_vision_tower.blocks.20.mlp.fc2.weight": "model-00003-of-00004.safetensors",
570
+ "model.qwen2vl_vision_tower.blocks.20.norm1.bias": "model-00001-of-00004.safetensors",
571
+ "model.qwen2vl_vision_tower.blocks.20.norm1.weight": "model-00003-of-00004.safetensors",
572
+ "model.qwen2vl_vision_tower.blocks.20.norm2.bias": "model-00003-of-00004.safetensors",
573
+ "model.qwen2vl_vision_tower.blocks.20.norm2.weight": "model-00002-of-00004.safetensors",
574
+ "model.qwen2vl_vision_tower.blocks.21.attn.proj.bias": "model-00003-of-00004.safetensors",
575
+ "model.qwen2vl_vision_tower.blocks.21.attn.proj.weight": "model-00003-of-00004.safetensors",
576
+ "model.qwen2vl_vision_tower.blocks.21.attn.qkv.bias": "model-00003-of-00004.safetensors",
577
+ "model.qwen2vl_vision_tower.blocks.21.attn.qkv.weight": "model-00002-of-00004.safetensors",
578
+ "model.qwen2vl_vision_tower.blocks.21.mlp.fc1.bias": "model-00001-of-00004.safetensors",
579
+ "model.qwen2vl_vision_tower.blocks.21.mlp.fc1.weight": "model-00003-of-00004.safetensors",
580
+ "model.qwen2vl_vision_tower.blocks.21.mlp.fc2.bias": "model-00002-of-00004.safetensors",
581
+ "model.qwen2vl_vision_tower.blocks.21.mlp.fc2.weight": "model-00004-of-00004.safetensors",
582
+ "model.qwen2vl_vision_tower.blocks.21.norm1.bias": "model-00002-of-00004.safetensors",
583
+ "model.qwen2vl_vision_tower.blocks.21.norm1.weight": "model-00004-of-00004.safetensors",
584
+ "model.qwen2vl_vision_tower.blocks.21.norm2.bias": "model-00002-of-00004.safetensors",
585
+ "model.qwen2vl_vision_tower.blocks.21.norm2.weight": "model-00004-of-00004.safetensors",
586
+ "model.qwen2vl_vision_tower.blocks.22.attn.proj.bias": "model-00004-of-00004.safetensors",
587
+ "model.qwen2vl_vision_tower.blocks.22.attn.proj.weight": "model-00003-of-00004.safetensors",
588
+ "model.qwen2vl_vision_tower.blocks.22.attn.qkv.bias": "model-00002-of-00004.safetensors",
589
+ "model.qwen2vl_vision_tower.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
590
+ "model.qwen2vl_vision_tower.blocks.22.mlp.fc1.bias": "model-00002-of-00004.safetensors",
591
+ "model.qwen2vl_vision_tower.blocks.22.mlp.fc1.weight": "model-00002-of-00004.safetensors",
592
+ "model.qwen2vl_vision_tower.blocks.22.mlp.fc2.bias": "model-00002-of-00004.safetensors",
593
+ "model.qwen2vl_vision_tower.blocks.22.mlp.fc2.weight": "model-00002-of-00004.safetensors",
594
+ "model.qwen2vl_vision_tower.blocks.22.norm1.bias": "model-00004-of-00004.safetensors",
595
+ "model.qwen2vl_vision_tower.blocks.22.norm1.weight": "model-00002-of-00004.safetensors",
596
+ "model.qwen2vl_vision_tower.blocks.22.norm2.bias": "model-00003-of-00004.safetensors",
597
+ "model.qwen2vl_vision_tower.blocks.22.norm2.weight": "model-00002-of-00004.safetensors",
598
+ "model.qwen2vl_vision_tower.blocks.23.attn.proj.bias": "model-00003-of-00004.safetensors",
599
+ "model.qwen2vl_vision_tower.blocks.23.attn.proj.weight": "model-00004-of-00004.safetensors",
600
+ "model.qwen2vl_vision_tower.blocks.23.attn.qkv.bias": "model-00003-of-00004.safetensors",
601
+ "model.qwen2vl_vision_tower.blocks.23.attn.qkv.weight": "model-00003-of-00004.safetensors",
602
+ "model.qwen2vl_vision_tower.blocks.23.mlp.fc1.bias": "model-00003-of-00004.safetensors",
603
+ "model.qwen2vl_vision_tower.blocks.23.mlp.fc1.weight": "model-00002-of-00004.safetensors",
604
+ "model.qwen2vl_vision_tower.blocks.23.mlp.fc2.bias": "model-00003-of-00004.safetensors",
605
+ "model.qwen2vl_vision_tower.blocks.23.mlp.fc2.weight": "model-00001-of-00004.safetensors",
606
+ "model.qwen2vl_vision_tower.blocks.23.norm1.bias": "model-00002-of-00004.safetensors",
607
+ "model.qwen2vl_vision_tower.blocks.23.norm1.weight": "model-00002-of-00004.safetensors",
608
+ "model.qwen2vl_vision_tower.blocks.23.norm2.bias": "model-00002-of-00004.safetensors",
609
+ "model.qwen2vl_vision_tower.blocks.23.norm2.weight": "model-00002-of-00004.safetensors",
610
+ "model.qwen2vl_vision_tower.blocks.24.attn.proj.bias": "model-00001-of-00004.safetensors",
611
+ "model.qwen2vl_vision_tower.blocks.24.attn.proj.weight": "model-00004-of-00004.safetensors",
612
+ "model.qwen2vl_vision_tower.blocks.24.attn.qkv.bias": "model-00002-of-00004.safetensors",
613
+ "model.qwen2vl_vision_tower.blocks.24.attn.qkv.weight": "model-00002-of-00004.safetensors",
614
+ "model.qwen2vl_vision_tower.blocks.24.mlp.fc1.bias": "model-00002-of-00004.safetensors",
615
+ "model.qwen2vl_vision_tower.blocks.24.mlp.fc1.weight": "model-00004-of-00004.safetensors",
616
+ "model.qwen2vl_vision_tower.blocks.24.mlp.fc2.bias": "model-00002-of-00004.safetensors",
617
+ "model.qwen2vl_vision_tower.blocks.24.mlp.fc2.weight": "model-00002-of-00004.safetensors",
618
+ "model.qwen2vl_vision_tower.blocks.24.norm1.bias": "model-00001-of-00004.safetensors",
619
+ "model.qwen2vl_vision_tower.blocks.24.norm1.weight": "model-00003-of-00004.safetensors",
620
+ "model.qwen2vl_vision_tower.blocks.24.norm2.bias": "model-00003-of-00004.safetensors",
621
+ "model.qwen2vl_vision_tower.blocks.24.norm2.weight": "model-00002-of-00004.safetensors",
622
+ "model.qwen2vl_vision_tower.blocks.25.attn.proj.bias": "model-00003-of-00004.safetensors",
623
+ "model.qwen2vl_vision_tower.blocks.25.attn.proj.weight": "model-00004-of-00004.safetensors",
624
+ "model.qwen2vl_vision_tower.blocks.25.attn.qkv.bias": "model-00004-of-00004.safetensors",
625
+ "model.qwen2vl_vision_tower.blocks.25.attn.qkv.weight": "model-00002-of-00004.safetensors",
626
+ "model.qwen2vl_vision_tower.blocks.25.mlp.fc1.bias": "model-00003-of-00004.safetensors",
627
+ "model.qwen2vl_vision_tower.blocks.25.mlp.fc1.weight": "model-00003-of-00004.safetensors",
628
+ "model.qwen2vl_vision_tower.blocks.25.mlp.fc2.bias": "model-00004-of-00004.safetensors",
629
+ "model.qwen2vl_vision_tower.blocks.25.mlp.fc2.weight": "model-00002-of-00004.safetensors",
630
+ "model.qwen2vl_vision_tower.blocks.25.norm1.bias": "model-00004-of-00004.safetensors",
631
+ "model.qwen2vl_vision_tower.blocks.25.norm1.weight": "model-00003-of-00004.safetensors",
632
+ "model.qwen2vl_vision_tower.blocks.25.norm2.bias": "model-00004-of-00004.safetensors",
633
+ "model.qwen2vl_vision_tower.blocks.25.norm2.weight": "model-00001-of-00004.safetensors",
634
+ "model.qwen2vl_vision_tower.blocks.26.attn.proj.bias": "model-00001-of-00004.safetensors",
635
+ "model.qwen2vl_vision_tower.blocks.26.attn.proj.weight": "model-00001-of-00004.safetensors",
636
+ "model.qwen2vl_vision_tower.blocks.26.attn.qkv.bias": "model-00001-of-00004.safetensors",
637
+ "model.qwen2vl_vision_tower.blocks.26.attn.qkv.weight": "model-00003-of-00004.safetensors",
638
+ "model.qwen2vl_vision_tower.blocks.26.mlp.fc1.bias": "model-00003-of-00004.safetensors",
639
+ "model.qwen2vl_vision_tower.blocks.26.mlp.fc1.weight": "model-00001-of-00004.safetensors",
640
+ "model.qwen2vl_vision_tower.blocks.26.mlp.fc2.bias": "model-00003-of-00004.safetensors",
641
+ "model.qwen2vl_vision_tower.blocks.26.mlp.fc2.weight": "model-00001-of-00004.safetensors",
642
+ "model.qwen2vl_vision_tower.blocks.26.norm1.bias": "model-00003-of-00004.safetensors",
643
+ "model.qwen2vl_vision_tower.blocks.26.norm1.weight": "model-00002-of-00004.safetensors",
644
+ "model.qwen2vl_vision_tower.blocks.26.norm2.bias": "model-00002-of-00004.safetensors",
645
+ "model.qwen2vl_vision_tower.blocks.26.norm2.weight": "model-00002-of-00004.safetensors",
646
+ "model.qwen2vl_vision_tower.blocks.27.attn.proj.bias": "model-00001-of-00004.safetensors",
647
+ "model.qwen2vl_vision_tower.blocks.27.attn.proj.weight": "model-00001-of-00004.safetensors",
648
+ "model.qwen2vl_vision_tower.blocks.27.attn.qkv.bias": "model-00002-of-00004.safetensors",
649
+ "model.qwen2vl_vision_tower.blocks.27.attn.qkv.weight": "model-00003-of-00004.safetensors",
650
+ "model.qwen2vl_vision_tower.blocks.27.mlp.fc1.bias": "model-00002-of-00004.safetensors",
651
+ "model.qwen2vl_vision_tower.blocks.27.mlp.fc1.weight": "model-00002-of-00004.safetensors",
652
+ "model.qwen2vl_vision_tower.blocks.27.mlp.fc2.bias": "model-00004-of-00004.safetensors",
653
+ "model.qwen2vl_vision_tower.blocks.27.mlp.fc2.weight": "model-00003-of-00004.safetensors",
654
+ "model.qwen2vl_vision_tower.blocks.27.norm1.bias": "model-00002-of-00004.safetensors",
655
+ "model.qwen2vl_vision_tower.blocks.27.norm1.weight": "model-00002-of-00004.safetensors",
656
+ "model.qwen2vl_vision_tower.blocks.27.norm2.bias": "model-00003-of-00004.safetensors",
657
+ "model.qwen2vl_vision_tower.blocks.27.norm2.weight": "model-00003-of-00004.safetensors",
658
+ "model.qwen2vl_vision_tower.blocks.28.attn.proj.bias": "model-00003-of-00004.safetensors",
659
+ "model.qwen2vl_vision_tower.blocks.28.attn.proj.weight": "model-00004-of-00004.safetensors",
660
+ "model.qwen2vl_vision_tower.blocks.28.attn.qkv.bias": "model-00003-of-00004.safetensors",
661
+ "model.qwen2vl_vision_tower.blocks.28.attn.qkv.weight": "model-00004-of-00004.safetensors",
662
+ "model.qwen2vl_vision_tower.blocks.28.mlp.fc1.bias": "model-00004-of-00004.safetensors",
663
+ "model.qwen2vl_vision_tower.blocks.28.mlp.fc1.weight": "model-00002-of-00004.safetensors",
664
+ "model.qwen2vl_vision_tower.blocks.28.mlp.fc2.bias": "model-00003-of-00004.safetensors",
665
+ "model.qwen2vl_vision_tower.blocks.28.mlp.fc2.weight": "model-00002-of-00004.safetensors",
666
+ "model.qwen2vl_vision_tower.blocks.28.norm1.bias": "model-00003-of-00004.safetensors",
667
+ "model.qwen2vl_vision_tower.blocks.28.norm1.weight": "model-00004-of-00004.safetensors",
668
+ "model.qwen2vl_vision_tower.blocks.28.norm2.bias": "model-00001-of-00004.safetensors",
669
+ "model.qwen2vl_vision_tower.blocks.28.norm2.weight": "model-00003-of-00004.safetensors",
670
+ "model.qwen2vl_vision_tower.blocks.29.attn.proj.bias": "model-00002-of-00004.safetensors",
671
+ "model.qwen2vl_vision_tower.blocks.29.attn.proj.weight": "model-00004-of-00004.safetensors",
672
+ "model.qwen2vl_vision_tower.blocks.29.attn.qkv.bias": "model-00003-of-00004.safetensors",
673
+ "model.qwen2vl_vision_tower.blocks.29.attn.qkv.weight": "model-00003-of-00004.safetensors",
674
+ "model.qwen2vl_vision_tower.blocks.29.mlp.fc1.bias": "model-00003-of-00004.safetensors",
675
+ "model.qwen2vl_vision_tower.blocks.29.mlp.fc1.weight": "model-00001-of-00004.safetensors",
676
+ "model.qwen2vl_vision_tower.blocks.29.mlp.fc2.bias": "model-00001-of-00004.safetensors",
677
+ "model.qwen2vl_vision_tower.blocks.29.mlp.fc2.weight": "model-00002-of-00004.safetensors",
678
+ "model.qwen2vl_vision_tower.blocks.29.norm1.bias": "model-00004-of-00004.safetensors",
679
+ "model.qwen2vl_vision_tower.blocks.29.norm1.weight": "model-00003-of-00004.safetensors",
680
+ "model.qwen2vl_vision_tower.blocks.29.norm2.bias": "model-00002-of-00004.safetensors",
681
+ "model.qwen2vl_vision_tower.blocks.29.norm2.weight": "model-00001-of-00004.safetensors",
682
+ "model.qwen2vl_vision_tower.blocks.3.attn.proj.bias": "model-00002-of-00004.safetensors",
683
+ "model.qwen2vl_vision_tower.blocks.3.attn.proj.weight": "model-00003-of-00004.safetensors",
684
+ "model.qwen2vl_vision_tower.blocks.3.attn.qkv.bias": "model-00002-of-00004.safetensors",
685
+ "model.qwen2vl_vision_tower.blocks.3.attn.qkv.weight": "model-00003-of-00004.safetensors",
686
+ "model.qwen2vl_vision_tower.blocks.3.mlp.fc1.bias": "model-00001-of-00004.safetensors",
687
+ "model.qwen2vl_vision_tower.blocks.3.mlp.fc1.weight": "model-00004-of-00004.safetensors",
688
+ "model.qwen2vl_vision_tower.blocks.3.mlp.fc2.bias": "model-00003-of-00004.safetensors",
689
+ "model.qwen2vl_vision_tower.blocks.3.mlp.fc2.weight": "model-00002-of-00004.safetensors",
690
+ "model.qwen2vl_vision_tower.blocks.3.norm1.bias": "model-00003-of-00004.safetensors",
691
+ "model.qwen2vl_vision_tower.blocks.3.norm1.weight": "model-00003-of-00004.safetensors",
692
+ "model.qwen2vl_vision_tower.blocks.3.norm2.bias": "model-00002-of-00004.safetensors",
693
+ "model.qwen2vl_vision_tower.blocks.3.norm2.weight": "model-00001-of-00004.safetensors",
694
+ "model.qwen2vl_vision_tower.blocks.30.attn.proj.bias": "model-00002-of-00004.safetensors",
695
+ "model.qwen2vl_vision_tower.blocks.30.attn.proj.weight": "model-00004-of-00004.safetensors",
696
+ "model.qwen2vl_vision_tower.blocks.30.attn.qkv.bias": "model-00002-of-00004.safetensors",
697
+ "model.qwen2vl_vision_tower.blocks.30.attn.qkv.weight": "model-00002-of-00004.safetensors",
698
+ "model.qwen2vl_vision_tower.blocks.30.mlp.fc1.bias": "model-00002-of-00004.safetensors",
699
+ "model.qwen2vl_vision_tower.blocks.30.mlp.fc1.weight": "model-00003-of-00004.safetensors",
700
+ "model.qwen2vl_vision_tower.blocks.30.mlp.fc2.bias": "model-00001-of-00004.safetensors",
701
+ "model.qwen2vl_vision_tower.blocks.30.mlp.fc2.weight": "model-00004-of-00004.safetensors",
702
+ "model.qwen2vl_vision_tower.blocks.30.norm1.bias": "model-00003-of-00004.safetensors",
703
+ "model.qwen2vl_vision_tower.blocks.30.norm1.weight": "model-00003-of-00004.safetensors",
704
+ "model.qwen2vl_vision_tower.blocks.30.norm2.bias": "model-00004-of-00004.safetensors",
705
+ "model.qwen2vl_vision_tower.blocks.30.norm2.weight": "model-00003-of-00004.safetensors",
706
+ "model.qwen2vl_vision_tower.blocks.31.attn.proj.bias": "model-00003-of-00004.safetensors",
707
+ "model.qwen2vl_vision_tower.blocks.31.attn.proj.weight": "model-00002-of-00004.safetensors",
708
+ "model.qwen2vl_vision_tower.blocks.31.attn.qkv.bias": "model-00004-of-00004.safetensors",
709
+ "model.qwen2vl_vision_tower.blocks.31.attn.qkv.weight": "model-00004-of-00004.safetensors",
710
+ "model.qwen2vl_vision_tower.blocks.31.mlp.fc1.bias": "model-00004-of-00004.safetensors",
711
+ "model.qwen2vl_vision_tower.blocks.31.mlp.fc1.weight": "model-00001-of-00004.safetensors",
712
+ "model.qwen2vl_vision_tower.blocks.31.mlp.fc2.bias": "model-00003-of-00004.safetensors",
713
+ "model.qwen2vl_vision_tower.blocks.31.mlp.fc2.weight": "model-00003-of-00004.safetensors",
714
+ "model.qwen2vl_vision_tower.blocks.31.norm1.bias": "model-00004-of-00004.safetensors",
715
+ "model.qwen2vl_vision_tower.blocks.31.norm1.weight": "model-00004-of-00004.safetensors",
716
+ "model.qwen2vl_vision_tower.blocks.31.norm2.bias": "model-00003-of-00004.safetensors",
717
+ "model.qwen2vl_vision_tower.blocks.31.norm2.weight": "model-00004-of-00004.safetensors",
718
+ "model.qwen2vl_vision_tower.blocks.4.attn.proj.bias": "model-00002-of-00004.safetensors",
719
+ "model.qwen2vl_vision_tower.blocks.4.attn.proj.weight": "model-00004-of-00004.safetensors",
720
+ "model.qwen2vl_vision_tower.blocks.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
721
+ "model.qwen2vl_vision_tower.blocks.4.attn.qkv.weight": "model-00003-of-00004.safetensors",
722
+ "model.qwen2vl_vision_tower.blocks.4.mlp.fc1.bias": "model-00001-of-00004.safetensors",
723
+ "model.qwen2vl_vision_tower.blocks.4.mlp.fc1.weight": "model-00002-of-00004.safetensors",
724
+ "model.qwen2vl_vision_tower.blocks.4.mlp.fc2.bias": "model-00002-of-00004.safetensors",
725
+ "model.qwen2vl_vision_tower.blocks.4.mlp.fc2.weight": "model-00002-of-00004.safetensors",
726
+ "model.qwen2vl_vision_tower.blocks.4.norm1.bias": "model-00003-of-00004.safetensors",
727
+ "model.qwen2vl_vision_tower.blocks.4.norm1.weight": "model-00003-of-00004.safetensors",
728
+ "model.qwen2vl_vision_tower.blocks.4.norm2.bias": "model-00004-of-00004.safetensors",
729
+ "model.qwen2vl_vision_tower.blocks.4.norm2.weight": "model-00002-of-00004.safetensors",
730
+ "model.qwen2vl_vision_tower.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors",
731
+ "model.qwen2vl_vision_tower.blocks.5.attn.proj.weight": "model-00004-of-00004.safetensors",
732
+ "model.qwen2vl_vision_tower.blocks.5.attn.qkv.bias": "model-00002-of-00004.safetensors",
733
+ "model.qwen2vl_vision_tower.blocks.5.attn.qkv.weight": "model-00003-of-00004.safetensors",
734
+ "model.qwen2vl_vision_tower.blocks.5.mlp.fc1.bias": "model-00002-of-00004.safetensors",
735
+ "model.qwen2vl_vision_tower.blocks.5.mlp.fc1.weight": "model-00001-of-00004.safetensors",
736
+ "model.qwen2vl_vision_tower.blocks.5.mlp.fc2.bias": "model-00001-of-00004.safetensors",
737
+ "model.qwen2vl_vision_tower.blocks.5.mlp.fc2.weight": "model-00004-of-00004.safetensors",
738
+ "model.qwen2vl_vision_tower.blocks.5.norm1.bias": "model-00001-of-00004.safetensors",
739
+ "model.qwen2vl_vision_tower.blocks.5.norm1.weight": "model-00001-of-00004.safetensors",
740
+ "model.qwen2vl_vision_tower.blocks.5.norm2.bias": "model-00004-of-00004.safetensors",
741
+ "model.qwen2vl_vision_tower.blocks.5.norm2.weight": "model-00001-of-00004.safetensors",
742
+ "model.qwen2vl_vision_tower.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors",
743
+ "model.qwen2vl_vision_tower.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors",
744
+ "model.qwen2vl_vision_tower.blocks.6.attn.qkv.bias": "model-00002-of-00004.safetensors",
745
+ "model.qwen2vl_vision_tower.blocks.6.attn.qkv.weight": "model-00004-of-00004.safetensors",
746
+ "model.qwen2vl_vision_tower.blocks.6.mlp.fc1.bias": "model-00002-of-00004.safetensors",
747
+ "model.qwen2vl_vision_tower.blocks.6.mlp.fc1.weight": "model-00003-of-00004.safetensors",
748
+ "model.qwen2vl_vision_tower.blocks.6.mlp.fc2.bias": "model-00004-of-00004.safetensors",
749
+ "model.qwen2vl_vision_tower.blocks.6.mlp.fc2.weight": "model-00002-of-00004.safetensors",
750
+ "model.qwen2vl_vision_tower.blocks.6.norm1.bias": "model-00004-of-00004.safetensors",
751
+ "model.qwen2vl_vision_tower.blocks.6.norm1.weight": "model-00002-of-00004.safetensors",
752
+ "model.qwen2vl_vision_tower.blocks.6.norm2.bias": "model-00003-of-00004.safetensors",
753
+ "model.qwen2vl_vision_tower.blocks.6.norm2.weight": "model-00002-of-00004.safetensors",
754
+ "model.qwen2vl_vision_tower.blocks.7.attn.proj.bias": "model-00004-of-00004.safetensors",
755
+ "model.qwen2vl_vision_tower.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors",
756
+ "model.qwen2vl_vision_tower.blocks.7.attn.qkv.bias": "model-00004-of-00004.safetensors",
757
+ "model.qwen2vl_vision_tower.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
758
+ "model.qwen2vl_vision_tower.blocks.7.mlp.fc1.bias": "model-00003-of-00004.safetensors",
759
+ "model.qwen2vl_vision_tower.blocks.7.mlp.fc1.weight": "model-00003-of-00004.safetensors",
760
+ "model.qwen2vl_vision_tower.blocks.7.mlp.fc2.bias": "model-00002-of-00004.safetensors",
761
+ "model.qwen2vl_vision_tower.blocks.7.mlp.fc2.weight": "model-00002-of-00004.safetensors",
762
+ "model.qwen2vl_vision_tower.blocks.7.norm1.bias": "model-00003-of-00004.safetensors",
763
+ "model.qwen2vl_vision_tower.blocks.7.norm1.weight": "model-00002-of-00004.safetensors",
764
+ "model.qwen2vl_vision_tower.blocks.7.norm2.bias": "model-00003-of-00004.safetensors",
765
+ "model.qwen2vl_vision_tower.blocks.7.norm2.weight": "model-00002-of-00004.safetensors",
766
+ "model.qwen2vl_vision_tower.blocks.8.attn.proj.bias": "model-00003-of-00004.safetensors",
767
+ "model.qwen2vl_vision_tower.blocks.8.attn.proj.weight": "model-00003-of-00004.safetensors",
768
+ "model.qwen2vl_vision_tower.blocks.8.attn.qkv.bias": "model-00004-of-00004.safetensors",
769
+ "model.qwen2vl_vision_tower.blocks.8.attn.qkv.weight": "model-00004-of-00004.safetensors",
770
+ "model.qwen2vl_vision_tower.blocks.8.mlp.fc1.bias": "model-00003-of-00004.safetensors",
771
+ "model.qwen2vl_vision_tower.blocks.8.mlp.fc1.weight": "model-00003-of-00004.safetensors",
772
+ "model.qwen2vl_vision_tower.blocks.8.mlp.fc2.bias": "model-00002-of-00004.safetensors",
773
+ "model.qwen2vl_vision_tower.blocks.8.mlp.fc2.weight": "model-00001-of-00004.safetensors",
774
+ "model.qwen2vl_vision_tower.blocks.8.norm1.bias": "model-00004-of-00004.safetensors",
775
+ "model.qwen2vl_vision_tower.blocks.8.norm1.weight": "model-00003-of-00004.safetensors",
776
+ "model.qwen2vl_vision_tower.blocks.8.norm2.bias": "model-00004-of-00004.safetensors",
777
+ "model.qwen2vl_vision_tower.blocks.8.norm2.weight": "model-00003-of-00004.safetensors",
778
+ "model.qwen2vl_vision_tower.blocks.9.attn.proj.bias": "model-00002-of-00004.safetensors",
779
+ "model.qwen2vl_vision_tower.blocks.9.attn.proj.weight": "model-00002-of-00004.safetensors",
780
+ "model.qwen2vl_vision_tower.blocks.9.attn.qkv.bias": "model-00002-of-00004.safetensors",
781
+ "model.qwen2vl_vision_tower.blocks.9.attn.qkv.weight": "model-00002-of-00004.safetensors",
782
+ "model.qwen2vl_vision_tower.blocks.9.mlp.fc1.bias": "model-00001-of-00004.safetensors",
783
+ "model.qwen2vl_vision_tower.blocks.9.mlp.fc1.weight": "model-00001-of-00004.safetensors",
784
+ "model.qwen2vl_vision_tower.blocks.9.mlp.fc2.bias": "model-00002-of-00004.safetensors",
785
+ "model.qwen2vl_vision_tower.blocks.9.mlp.fc2.weight": "model-00001-of-00004.safetensors",
786
+ "model.qwen2vl_vision_tower.blocks.9.norm1.bias": "model-00002-of-00004.safetensors",
787
+ "model.qwen2vl_vision_tower.blocks.9.norm1.weight": "model-00001-of-00004.safetensors",
788
+ "model.qwen2vl_vision_tower.blocks.9.norm2.bias": "model-00002-of-00004.safetensors",
789
+ "model.qwen2vl_vision_tower.blocks.9.norm2.weight": "model-00003-of-00004.safetensors",
790
+ "model.qwen2vl_vision_tower.merger.ln_q.bias": "model-00004-of-00004.safetensors",
791
+ "model.qwen2vl_vision_tower.merger.ln_q.weight": "model-00003-of-00004.safetensors",
792
+ "model.qwen2vl_vision_tower.merger.mlp.0.bias": "model-00001-of-00004.safetensors",
793
+ "model.qwen2vl_vision_tower.merger.mlp.0.weight": "model-00003-of-00004.safetensors",
794
+ "model.qwen2vl_vision_tower.merger.mlp.2.bias": "model-00004-of-00004.safetensors",
795
+ "model.qwen2vl_vision_tower.merger.mlp.2.weight": "model-00001-of-00004.safetensors",
796
+ "model.qwen2vl_vision_tower.patch_embed.proj.weight": "model-00003-of-00004.safetensors"
797
+ }
798
+ }
modeling_projector.py CHANGED
@@ -1,3 +1,308 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad432832007fb6483072944a20a929927548d7b39eb49a9c9a9492fcffae233c
3
- size 13141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ from .utils import IMAGE_INDICATOR_IDS
5
+
6
+
7
def build_vision_projector(config, delay_load=False, **kwargs):
    """Instantiate the multimodal projector named by ``config.mm_projector_type``.

    Args:
        config: model config; must expose ``mm_hidden_size`` and ``hidden_size``
            plus any projector-specific optional attributes.
        delay_load: unused; kept for interface compatibility with callers.
        **kwargs: unused; kept for interface compatibility with callers.

    Returns:
        An ``nn.Module`` mapping vision-tower features to LLM embedding space.

    Raises:
        ValueError: for any unrecognized projector type (including the default
            ``'linear'``, which has no implementation in this module).
    """
    kind = getattr(config, 'mm_projector_type', 'linear')

    if kind == 'conv_adapter':
        return ConvAdapter(
            config.mm_hidden_size,
            config.hidden_size,
            getattr(config, "mlp_hidden_dim", None),
        )
    if kind == 'mlp_pixel_shuffle':
        return MlpPixelShuffle(
            config.mm_hidden_size,
            config.hidden_size,
            config.pixelshuffle_downsample_ratio,
            getattr(config, "mlp_hidden_dim", None),
        )
    if kind == 'ovis_conv_adapter':
        return OvisConvAdapter(
            config.mm_hidden_size,
            config.hidden_size,
            getattr(config, "mlp_hidden_dim", 32000),
            getattr(config, "tokenize_function", "softmax"),
        )
    if kind == 'ovis2_adapter':
        return Ovis2Adapter(
            config.mm_hidden_size,
            config.hidden_size,
            getattr(config, "mlp_hidden_dim", 66536),
            getattr(config, "hidden_stride", 2),
            getattr(config, "pooling_stride", 1),
            getattr(config, "tokenize_function", "softmax"),
        )
    if kind == 'ovis_conv_adapter_navit':
        # NOTE(review): the navit variant hard-codes its input width to 1280 —
        # confirm this matches the navit vision tower's hidden size.
        return OvisConvAdapterNavit(
            1280,
            config.hidden_size,
            getattr(config, "mlp_hidden_dim", 32000),
            getattr(config, "tokenize_function", "softmax"),
        )
    raise ValueError(f'Unknown projector type: {kind}')
24
+
25
+
26
class ConvAdapter(nn.Module):
    """Projector that maps ViT features to the LLM width with a two-layer MLP,
    then halves the spatial token grid with a stride-2 3x3 convolution."""

    def __init__(self, dim_in, dim_out, mlp_hidden_dim=None):
        super().__init__()
        self.mm_projector_type = 'conv_adapter'
        # The MLP's hidden width defaults to the output width when not given.
        hidden = dim_out if mlp_hidden_dim is None else mlp_hidden_dim
        self.mlp = nn.Sequential(
            nn.Linear(dim_in, hidden),
            nn.GELU(),
            nn.Linear(hidden, dim_out),
        )
        self.conv = nn.Conv2d(dim_out, dim_out, kernel_size=(3, 3), stride=(2, 2), padding=1)

    def forward(self, x):
        """Project image features.

        Args:
            x (torch.Tensor): image features of shape (F, v, D).
                NOTE(review): assumes v = s*s + 1 with a leading CLS token —
                confirm against the vision tower.

        Returns:
            torch.Tensor of shape (F, n, dim_out), where n is the reduced
            token count after the stride-2 convolution.
        """
        projected = self.mlp(x)

        frames, tokens, channels = projected.shape
        side = int(math.sqrt(tokens - 1))
        grid = projected[:, 1:, :]  # drop the CLS token
        grid = grid.reshape(frames, side, side, channels).permute([0, 3, 1, 2])
        grid = self.conv(grid)
        return grid.permute([0, 2, 3, 1]).reshape(frames, -1, channels)
61
+
62
+
63
class MlpPixelShuffle(nn.Module):
    """Projector that reduces tokens via pixel shuffle (space-to-depth) and
    then maps the stacked channels to the LLM width with a two-layer MLP."""

    def __init__(self, dim_in, dim_out, pixelshuffle_downsample_ratio, mlp_hidden_dim=None):
        super().__init__()
        self.mm_projector_type = 'mlp_pixel_shuffle'
        # After shuffling, each token carries ratio**2 stacked feature vectors.
        in_features = int(dim_in * (pixelshuffle_downsample_ratio ** 2))
        hidden = dim_out if mlp_hidden_dim is None else mlp_hidden_dim
        self.mlp = nn.Sequential(
            nn.Linear(in_features, hidden),
            nn.GELU(),
            nn.Linear(hidden, dim_out),
        )
        self.scale_factor = pixelshuffle_downsample_ratio

    def pixel_shuffle(self, x, scale_factor=2):
        """Space-to-depth: (N, W, H, C) -> (N, W/s, H/s, C*s*s).

        `scale_factor` is expected to be an integer ratio.
        """
        n, w, h, c = x.size()
        # Fold `scale_factor` columns into channels: (N, W, H/s, C*s).
        x = x.view(n, w, int(h / scale_factor), int(c * scale_factor))
        x = x.permute(0, 2, 1, 3).contiguous()
        # Fold rows the same way: (N, H/s, W/s, C*s*s).
        x = x.view(n, int(h / scale_factor), int(w / scale_factor),
                   int(c * (scale_factor * scale_factor)))
        return x.permute(0, 2, 1, 3).contiguous()

    def forward(self, x):
        """Project image features.

        Args:
            x (torch.Tensor): image features of shape (F, v, D).
                NOTE(review): assumes v = s*s + 1 with a leading CLS token —
                confirm against the vision tower.

        Returns:
            torch.Tensor of shape (F, n, dim_out), where
            n = (v - 1) / scale_factor**2.
        """
        tokens = x[:, 1:, :]  # drop the CLS token
        side = int(tokens.shape[1] ** 0.5)
        grid = tokens.view(tokens.shape[0], side, side, -1)
        shuffled = self.pixel_shuffle(grid, self.scale_factor)
        projected = self.mlp(shuffled)
        return projected.view(projected.shape[0], -1, projected.shape[-1])
112
+
113
+
114
class OvisConvAdapter(nn.Module):
    """Ovis-style projector: a stride-2 conv reduces the token grid, a linear
    head maps features to a "visual vocabulary" distribution, and the output
    embeddings are read out as a soft mixture over that vocabulary."""

    def __init__(self, dim_in, dim_out, vocab_size, tokenize_function="softmax", tau=1.0):
        super().__init__()
        self.mm_projector_type = 'ovis_conv_adapter'
        self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), padding=1)
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(dim_in, vocab_size, bias=False),
            torch.nn.LayerNorm(vocab_size)
        )
        self.embedding = torch.nn.Embedding(vocab_size, dim_out)
        self.tokenize_function = tokenize_function
        # Temperature for the gumbel_argmax tokenizer. BUGFIX: `tokenize`
        # previously read `self.config.tau` / `self.config.tokenize_function`,
        # but this module has no `config` attribute, so those branches raised
        # AttributeError. The temperature is now a real (optional) parameter.
        self.tau = tau

    def tokenize(self, logits):
        """Turn vocabulary logits into a (soft or hard) one-hot distribution."""
        def st_argmax(y_soft, dim):  # straight-through softmax
            index = y_soft.max(dim, keepdim=True)[1]
            y_hard = torch.zeros_like(y_soft, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
            # Forward pass is the hard one-hot; gradients flow through y_soft.
            ret = y_hard - y_soft.detach() + y_soft
            return ret

        if self.tokenize_function == 'softmax':
            tokens = torch.nn.functional.softmax(logits, dim=-1)
        elif self.tokenize_function == 'gumbel_argmax':
            tokens = torch.nn.functional.gumbel_softmax(logits, tau=self.tau, hard=True)
        elif self.tokenize_function == 'st_argmax':
            tokens = st_argmax(logits, dim=-1)
        else:
            raise ValueError(
                'Invalid `tokenize_function`, expected softmax or gumbel_argmax or st_argmax,'
                f' but got {self.tokenize_function}'
            )
        return tokens

    def forward(self, x):
        """Project image features.

        Args:
            x (torch.Tensor): image features of shape (F, v, D).
                NOTE(review): assumes v = s*s + 1 with a leading CLS token —
                confirm against the vision tower.

        Returns:
            torch.Tensor of shape (F, n, dim_out), where n is the reduced
            token count after the stride-2 convolution.
        """
        # conv: reshape to a 2D grid and halve the spatial resolution
        f, v, d = x.shape
        s = int(math.sqrt(v - 1))
        x = x[:, 1:, :]  # remove cls_token
        x = x.reshape(f, s, s, d).permute([0, 3, 1, 2])
        x = self.conv(x)
        x = x.permute([0, 2, 3, 1]).reshape(f, -1, d)

        # tokenize: produce a distribution over the visual vocabulary
        logits = self.mlp(x)
        visual_tokens = self.tokenize(logits)

        # get embeddings: expectation of the embedding table under the tokens
        out = torch.matmul(visual_tokens, self.embedding.weight)

        return out
170
+
171
+
172
class Ovis2Adapter(nn.Module):
    """Ovis2 projector: space-to-depth token merging, a linear head producing
    a visual-vocabulary distribution, a soft embedding lookup, and optional
    average pooling of the resulting token grid.

    The last ``len(IMAGE_INDICATOR_IDS)`` vocabulary slots are reserved for
    indicator tokens; the head does not predict them and they receive zero
    probability in `forward`.
    """

    def __init__(self, dim_in, dim_out, vocab_size, hidden_stride=2, pooling_stride=1,
                 tokenize_function="softmax", tau=1.0):
        super().__init__()
        # The head only predicts the "real" visual tokens, not the indicators.
        head_dim = vocab_size - len(IMAGE_INDICATOR_IDS)
        self.mm_projector_type = 'ovis2_adapter'
        self.hidden_stride = hidden_stride
        self.tokenize_function = tokenize_function
        self.head = torch.nn.Sequential(
            torch.nn.Linear(
                dim_in * self.hidden_stride * self.hidden_stride, head_dim,
                bias=False
            ),
            torch.nn.LayerNorm(head_dim)
        )
        self.embedding = torch.nn.Embedding(vocab_size, dim_out)
        self.pool_s = pooling_stride
        # Temperature for the gumbel_argmax tokenizer. BUGFIX: `tokenize`
        # previously read `self.config.tau` / `self.config.tokenize_function`,
        # but this module has no `config` attribute, so those branches raised
        # AttributeError. The temperature is now a real (optional) parameter.
        self.tau = tau
        print("pooling_stride: ", pooling_stride)

    def encode(self, features):
        """Merge `hidden_stride**2` neighboring hidden states into one token.

        For hidden_stride=2 this reduces the token length 4x (e.g. 1024 -> 256
        for aimv2). Input is (n, l, d) with a leading CLS token; the grid side
        is padded up to a multiple of `hidden_stride` before merging.
        """
        features = features[:, 1:, :]  # remove cls_token
        if self.hidden_stride > 1:
            n, l, d = features.shape  # this `d` may differ from the caller's `d`
            sqrt_l = int(l ** 0.5)
            assert sqrt_l ** 2 == l, "The token sequence length should be a perfect square."
            features = features.reshape(n, sqrt_l, sqrt_l, d)
            # Pad the grid so its side is divisible by hidden_stride.
            pl = (self.hidden_stride - (sqrt_l % self.hidden_stride)) % self.hidden_stride
            features = torch.nn.functional.pad(features, (0, 0, 0, pl, 0, pl), "constant", 0)
            sqrt_l += pl
            features = features.reshape(n, sqrt_l // self.hidden_stride, self.hidden_stride,
                                        sqrt_l // self.hidden_stride, self.hidden_stride, d)
            features = features.permute(0, 1, 3, 2, 4, 5)  # [n, sqrt_l/hs, sqrt_l/hs, hs, hs, d]
            features = features.flatten(3)  # [n, sqrt_l/hs, sqrt_l/hs, hs*hs*d]
            features = features.reshape(
                n, -1, self.hidden_stride * self.hidden_stride * d)

        return features

    def tokenize(self, logits):
        """Turn vocabulary logits into a (soft or hard) one-hot distribution."""
        def st_argmax(y_soft, dim):  # straight-through softmax
            index = y_soft.max(dim, keepdim=True)[1]
            y_hard = torch.zeros_like(y_soft, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
            # Forward pass is the hard one-hot; gradients flow through y_soft.
            ret = y_hard - y_soft.detach() + y_soft
            return ret

        if self.tokenize_function == 'softmax':
            tokens = torch.nn.functional.softmax(logits, dim=-1)
        elif self.tokenize_function == 'gumbel_argmax':
            tokens = torch.nn.functional.gumbel_softmax(logits, tau=self.tau, hard=True)
        elif self.tokenize_function == 'st_argmax':
            tokens = st_argmax(logits, dim=-1)
        else:
            raise ValueError(
                f'Invalid `tokenize_function`, expected softmax or gumbel_argmax or st_argmax, but got {self.tokenize_function}')
        return tokens

    def forward(self, x):
        """Project image features.

        Args:
            x (torch.Tensor): image features of shape (F, v, D).
                NOTE(review): assumes v = s*s + 1 with a leading CLS token —
                confirm against the vision tower.

        Returns:
            torch.Tensor of shape (F, n, dim_out), where n is the reduced
            token count after merging (and optional pooling).
        """
        # space-to-depth token merge
        x = self.encode(x)
        # tokenize: distribution over the "real" visual vocabulary
        logits = self.head(x)
        visual_tokens = self.tokenize(logits)
        # Append zero probability for the reserved indicator-token slots so
        # the distribution matches the full embedding table width.
        batch_size, token_len, _ = visual_tokens.shape
        padding_tensor = torch.zeros(size=(batch_size, token_len, len(IMAGE_INDICATOR_IDS)),
                                     dtype=visual_tokens.dtype,
                                     device=visual_tokens.device,
                                     layout=visual_tokens.layout,
                                     requires_grad=False)
        visual_tokens = torch.cat([visual_tokens, padding_tensor], dim=2)
        # get embeddings: expectation of the embedding table under the tokens
        out = torch.matmul(visual_tokens, self.embedding.weight)

        # Optional average pooling over pool_s x pool_s token neighborhoods.
        if self.pool_s > 1:
            f, v, d = out.shape
            s = int(math.sqrt(v))
            out = out.reshape(f, s, s, d)
            out = out.reshape(f, s // self.pool_s, self.pool_s, s // self.pool_s, self.pool_s, d)
            out = out.permute([0, 1, 3, 5, 2, 4]).reshape(f, s // self.pool_s * s // self.pool_s, d, -1).mean(-1)
        return out
261
+
262
class OvisConvAdapterNavit(nn.Module):
    """Navit variant of the Ovis projector: merges each 2x2 patch group with a
    stride-2 conv, maps features to a visual-vocabulary distribution, and reads
    out embeddings as a soft mixture over that vocabulary."""

    def __init__(self, dim_in, dim_out, vocab_size, tokenize_function="softmax", tau=1.0):
        super().__init__()
        self.mm_projector_type = 'ovis_conv_adapter_navit'
        self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=(2, 2), stride=(2, 2))
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(dim_in, vocab_size, bias=False),
            torch.nn.LayerNorm(vocab_size)
        )
        self.embedding = torch.nn.Embedding(vocab_size, dim_out)
        self.tokenize_function = tokenize_function
        # Temperature for the gumbel_argmax tokenizer. BUGFIX: `tokenize`
        # previously read `self.config.tau` / `self.config.tokenize_function`,
        # but this module has no `config` attribute, so those branches raised
        # AttributeError. The temperature is now a real (optional) parameter.
        self.tau = tau

    def tokenize(self, logits):
        """Turn vocabulary logits into a (soft or hard) one-hot distribution."""
        def st_argmax(y_soft, dim):  # straight-through softmax
            index = y_soft.max(dim, keepdim=True)[1]
            y_hard = torch.zeros_like(y_soft, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
            # Forward pass is the hard one-hot; gradients flow through y_soft.
            ret = y_hard - y_soft.detach() + y_soft
            return ret

        if self.tokenize_function == 'softmax':
            tokens = torch.nn.functional.softmax(logits, dim=-1)
        elif self.tokenize_function == 'gumbel_argmax':
            tokens = torch.nn.functional.gumbel_softmax(logits, tau=self.tau, hard=True)
        elif self.tokenize_function == 'st_argmax':
            tokens = st_argmax(logits, dim=-1)
        else:
            raise ValueError(
                f'Invalid `tokenize_function`, expected softmax or gumbel_argmax or st_argmax, but got {self.tokenize_function}')
        return tokens

    def forward(self, x):
        """Project navit image features.

        Args:
            x (torch.Tensor): image features of navit, shape (v, D).
                NOTE(review): assumes v is a multiple of 4 and that
                consecutive groups of 4 tokens form 2x2 patches — confirm
                against the navit vision tower's token layout.

        Returns:
            torch.Tensor of shape (v / 4, dim_out).
        """
        # conv: merge each 2x2 group of tokens into one
        _, d = x.shape
        x = x.reshape(-1, 2, 2, d).permute([0, 3, 1, 2])
        x = self.conv(x)
        x = x.permute([0, 2, 3, 1]).reshape(-1, d)
        # tokenize: distribution over the visual vocabulary
        logits = self.mlp(x)
        visual_tokens = self.tokenize(logits)
        # get embeddings: expectation of the embedding table under the tokens
        out = torch.matmul(visual_tokens, self.embedding.weight)

        return out
modeling_valley.py CHANGED
@@ -589,6 +589,7 @@ class ValleyQwen3ForCausalLM(Qwen3ForCausalLM, ValleyMetaForCausalLM):
589
  shift_labels = shift_labels.to(shift_logits.device)
590
  loss = torch.stack([loss_fct(shift_logits[i], shift_labels[i]) for i in range(bs)])
591
 
 
592
  if not return_dict:
593
  output = (logits,) + outputs[1:]
594
  return (loss,) + output if loss is not None else output
 
589
  shift_labels = shift_labels.to(shift_logits.device)
590
  loss = torch.stack([loss_fct(shift_logits[i], shift_labels[i]) for i in range(bs)])
591
 
592
+
593
  if not return_dict:
594
  output = (logits,) + outputs[1:]
595
  return (loss,) + output if loss is not None else output
modeling_vision_tower.py CHANGED
@@ -1,3 +1,323 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:73071f97b2ad6a714bd77d272d1df0491ba96eeb33faaef3131748b0dc8e8dd3
3
- size 13063
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VisionTransformerPretrainedModel
4
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VisionTransformerPretrainedModel
5
+ from transformers import PretrainedConfig
6
+
7
# Frozen vision-backbone configs, inlined so loading the checkpoint does not
# require fetching the upstream config files. The values must stay in sync
# with the pretrained weights shipped in this repo — edit with care.

# SigLip SO400M, 384px input / patch size 14.
siglip_config = PretrainedConfig.from_dict(
    {
        "attention_dropout": 0.0,
        "hidden_act": "gelu_pytorch_tanh",
        "hidden_size": 1152,
        "image_size": 384,
        "intermediate_size": 4304,
        "layer_norm_eps": 1e-06,
        "model_type": "siglip_vision_model",
        "num_attention_heads": 16,
        "num_channels": 3,
        "num_hidden_layers": 27,
        "patch_size": 14,
    }
)

# Qwen2-VL vision transformer (NaViT-style, variable resolution).
# hidden_size here is the LLM dim (3584) the merger projects into.
qwen2vl_vit_config = PretrainedConfig.from_dict(
    {
        "depth": 32,
        "embed_dim": 1280,
        "hidden_act": "quick_gelu",
        "hidden_size": 3584,
        "in_channels": 3,
        "in_chans": 3,
        "mlp_ratio": 4,
        "model_type": "qwen2_vl",
        "num_heads": 16,
        "patch_size": 14,
        "spatial_merge_size": 2,
        "spatial_patch_size": 14,
        "temporal_patch_size": 2,
        # Default attention backend; may be overridden at build time via
        # vision_tower_cfg._vit_attn_implementation.
        "_attn_implementation": "flash_attention_2",
        "_attn_implementation_internal": "flash_attention_2"
    }
)

# Qwen2.5-VL vision transformer (windowed attention; full attention only at
# the listed block indexes).
qwen2_5vl_vit_config = PretrainedConfig.from_dict(
    {
        "depth": 32,
        "hidden_act": "silu",
        "hidden_size": 1280,
        "intermediate_size": 3420,
        "num_heads": 16,
        "in_chans": 3,
        "out_hidden_size": 3584,
        "patch_size": 14,
        "spatial_merge_size": 2,
        "spatial_patch_size": 14,
        "window_size": 112,
        "fullatt_block_indexes": [
            7,
            15,
            23,
            31
        ],
        "tokens_per_second": 2,
        "temporal_patch_size": 2
    }
)

# AIMv2 encoder config (remote-code model; auto_map points at the custom
# implementation shipped with the checkpoint).
aimv2_config = PretrainedConfig.from_dict(
    {
        "hidden_size": 1024,
        "image_size": 448,
        "intermediate_size": 2816,
        "model_type": "aimv2",
        "num_attention_heads": 8,
        "num_channels": 3,
        "num_hidden_layers": 24,
        "patch_size": 14,
        "projection_dropout": 0.0,
        "qkv_bias": False,
        "rms_norm_eps": 1e-05,
        "torch_dtype": "float32",
        "transformers_version": "4.46.3",
        "auto_map": {
            "AutoConfig": "configuration_aimv2.AIMv2Config",
            "AutoModel": "modeling_aimv2.AIMv2Model",
        },
    }
)
88
+
89
def wrapped_qwen2vl_vision_tower(vision_tower_cfg, qwen2vl_vision_tower):
    """Adjust the Qwen2-VL tower's patch merger to match this model's config.

    Two cases:
      * only_navit + navit_use_mm_projector: the merger is bypassed entirely
        (replaced by Identity) because projection happens in the external
        mm_projector instead.
      * otherwise: if the stock merger's output dim does not match the LLM
        hidden size, or a custom hidden dim is requested, swap in a freshly
        initialized CustomPatchMerger (random weights; expected to be filled
        from the checkpoint's state_dict afterwards — TODO confirm).

    Returns the (possibly modified) tower.
    """
    if getattr(vision_tower_cfg, "only_navit", False) and \
        getattr(vision_tower_cfg, "navit_use_mm_projector", False):
        # NOTE(review): the message below is misleading — this branch removes
        # the merger rather than initializing a new one.
        qwen2vl_vision_tower.merger = torch.nn.Identity()
        print("navit_use_mm_projector is NOT None, so we need to initialize a new merger...")

    else:
        # Last linear of the stock merger: 5120 -> 3584 (3584 is the LLM dim,
        # 5120 the merger hidden dim).
        old_linear = qwen2vl_vision_tower.merger.mlp[-1]
        navit_merger_hidden_dim = getattr(vision_tower_cfg, "navit_merger_hidden_dim", None)

        # rule1: merger output dim does not match the target LLM hidden size.
        rule1 = old_linear.out_features != vision_tower_cfg.hidden_size
        # rule2: an explicit hidden dim was configured and differs from stock.
        rule2 = navit_merger_hidden_dim is not None and navit_merger_hidden_dim != old_linear.in_features

        if rule1 or rule2:
            del qwen2vl_vision_tower.merger
            qwen2vl_vision_tower.merger = CustomPatchMerger(
                dim=vision_tower_cfg.hidden_size,  # output_dim of merger, also the dim of LLM
                context_dim=1280,  # ViT hidden dim; merger input is 1280*4=5120 (2x2 pixel shuffle)
                hidden_dim=navit_merger_hidden_dim if navit_merger_hidden_dim is not None else old_linear.in_features
            )
            print("output_dim of original merger is not match or navit_merger_hidden_dim is not match, we need to initialize a new merger...")

    return qwen2vl_vision_tower
112
+
113
def build_vision_tower(vision_tower_cfg, **kwargs):
    """Factory for the vision backbone(s) selected by the config.

    Return conventions (callers must handle both):
      * siglip/Oryx name WITH `eagle_vision_tower` set:
          - only_navit: (None, qwen_vit) — only the NaViT tower is used;
          - otherwise: (SigLipVisionTower, qwen_vit) pair.
      * siglip/Oryx name WITHOUT eagle tower: a single SigLipVisionTower.
      * aimv2 / Ovis2 visual names: a single AIMv2VisionTower.

    Raises ValueError for any unrecognized tower name.
    """
    # The config may carry the name under either attribute.
    vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None))
    if "siglip-so400m-patch14-384" in vision_tower or "Oryx-ViT" in vision_tower or "navit" in vision_tower.lower():
        # if 'navit' in vision_tower, vision_tower_cfg.eagle_vision_tower is not None and vision_tower_cfg.only_navit is True
        if "navit" in vision_tower.lower():
            assert getattr(vision_tower_cfg, "only_navit", False) and \
                getattr(vision_tower_cfg, "eagle_vision_tower", None) is not None

        if getattr(vision_tower_cfg, "eagle_vision_tower", None) is not None:
            # Pick the Qwen ViT variant; optionally override the attention
            # backend before instantiation (flash-attn vs sdpa/eager).
            if "Qwen2.5-VL" in vision_tower_cfg.eagle_vision_tower:
                if getattr(vision_tower_cfg, "_vit_attn_implementation", None) is not None:
                    qwen2_5vl_vit_config._attn_implementation = vision_tower_cfg._vit_attn_implementation
                    qwen2_5vl_vit_config._attn_implementation_internal = vision_tower_cfg._vit_attn_implementation
                qwen2vl_vision_tower = Qwen2_5_VisionTransformerPretrainedModel._from_config(qwen2_5vl_vit_config)
            elif "Qwen2-VL" in vision_tower_cfg.eagle_vision_tower:
                if getattr(vision_tower_cfg, "_vit_attn_implementation", None) is not None:
                    qwen2vl_vit_config._attn_implementation = vision_tower_cfg._vit_attn_implementation
                    qwen2vl_vit_config._attn_implementation_internal = vision_tower_cfg._vit_attn_implementation
                qwen2vl_vision_tower = Qwen2VisionTransformerPretrainedModel._from_config(qwen2vl_vit_config)
            else:
                raise ValueError(f"Unknown vision tower: {vision_tower_cfg.eagle_vision_tower}")

            # Adapt the merger to the configured dims, then freeze the tower.
            qwen2vl_vision_tower = wrapped_qwen2vl_vision_tower(vision_tower_cfg, qwen2vl_vision_tower)
            qwen2vl_vision_tower.requires_grad_(False)
            if getattr(vision_tower_cfg, "only_navit", False):
                return None, qwen2vl_vision_tower
            else:
                siglip_vision_tower = SigLipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
                return siglip_vision_tower, qwen2vl_vision_tower

        # only return siglip vision tower if eagle vision tower is None
        else:
            return SigLipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
    elif "aimv2-huge-patch14-448" in vision_tower or "Ovis2-8B-visual" in vision_tower:
        return AIMv2VisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
    elif "aimv2-large-patch14-448" in vision_tower or "Ovis2-2B-visual" in vision_tower:
        return AIMv2VisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
    else:
        raise ValueError(f"Unknown vision tower: {vision_tower}")
152
+
153
class SigLipVisionTower(nn.Module):
    """Wrapper around a SigLip vision encoder built from the inlined
    `siglip_config`. The architecture is instantiated from config only;
    weights are expected to arrive via the enclosing model's state_dict.
    """

    def __init__(self, vision_tower, args, delay_load=False, cache_dir="./cache_dir"):
        super().__init__()
        self.is_loaded = False
        self.image_tower_name = vision_tower
        # NOTE(review): select_layer is stored but never read in this class.
        self.select_layer = args.mm_vision_select_layer
        # NOTE(review): default is "patch", but feature_select() asserts
        # "cls_patch" — configs are presumably expected to set it; confirm.
        self.select_feature = getattr(args, "mm_vision_select_feature", "patch")
        self.cache_dir = cache_dir

        if not delay_load:
            self.load_model()
        else:
            from transformers import SiglipVisionModel
            self.cfg_only = siglip_config
            # delay_load still builds the (randomly initialized) architecture
            # so the module tree exists; is_loaded remains False.
            self.vision_tower = SiglipVisionModel._from_config(siglip_config)  # dummy-load

    def load_model(self):
        """Instantiate the SigLip encoder from config and freeze it."""
        from transformers import SiglipVisionModel
        self.vision_tower = SiglipVisionModel._from_config(siglip_config)
        self.vision_tower.requires_grad_(False)
        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Prepend a copy of the first token to the sequence, producing a
        cls+patch layout from the encoder's last hidden state."""
        assert self.select_feature == "cls_patch"
        image_features = torch.cat([image_forward_outs[:, :1, :], image_forward_outs], dim=1)
        return image_features

    def forward(self, images):
        """Encode a batched tensor, or a list of per-image tensors one by one.

        Returns features on the input dtype; shape (B, 1 + num_patches, C)
        for the batched path, or a list of (1, 1 + num_patches, C).
        """
        if type(images) is list:
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(
                    image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
                    output_hidden_states=True,
                    return_dict=True,
                )
                image_feature = self.feature_select(image_forward_out.last_hidden_state).to(image.dtype)
                image_features.append(image_feature)
        else:
            image_forward_outs = self.vision_tower(
                images.to(device=self.device, dtype=self.dtype),
                output_hidden_states=True,
                return_dict=True,
            )
            image_features = self.feature_select(image_forward_outs.last_hidden_state).to(images.dtype)

        return image_features

    @property
    def dummy_feature(self):
        # Zero feature used as a placeholder for text-only batches.
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Real config once loaded, otherwise the static fallback.
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches(self):
        # (384 / 14)^2 patches per image for the inlined config.
        return (self.config.image_size // self.config.patch_size) ** 2
227
+
228
+
229
class CustomPatchMerger(nn.Module):
    """Replacement patch merger for the Qwen2-VL tower.

    Normalizes per-patch features, concatenates each spatial_merge_size^2
    group into one vector, and projects it to the LLM hidden size through a
    two-layer GELU MLP. Submodule names (ln_q, mlp) match the stock merger so
    checkpoint state_dicts load unchanged.
    """

    def __init__(self, dim: int, context_dim: int, hidden_dim: int, spatial_merge_size: int = 2) -> None:
        super().__init__()
        group = spatial_merge_size ** 2
        self.input_dim = context_dim * group
        self.ln_q = nn.LayerNorm(context_dim, eps=1e-6)
        self.mlp = nn.Sequential(
            nn.Linear(self.input_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, dim),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        normed = self.ln_q(x)
        merged = normed.view(-1, self.input_dim)
        return self.mlp(merged)
243
+
244
+
245
+
246
+
247
class AIMv2VisionTower(nn.Module):
    """Wrapper around an AIMv2 encoder loaded via trust_remote_code.

    Mirrors SigLipVisionTower's interface (forward / feature_select /
    properties) so the two are interchangeable in build_vision_tower.
    """

    def __init__(self, vision_tower, args, delay_load=False, cache_dir='./cache_dir'):
        super().__init__()

        self.is_loaded = False

        self.image_tower_name = vision_tower
        # NOTE(review): select_layer is stored but never read in this class.
        self.select_layer = args.mm_vision_select_layer
        # NOTE(review): default "patch" conflicts with the "cls_patch" assert
        # in feature_select — presumably configs always set it; confirm.
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        self.cache_dir = cache_dir
        if not delay_load:
            self.load_model()
        else:
            from transformers import AutoConfig, AutoModel
            # NOTE(review): AutoModel._from_config on a remote-code config may
            # require the custom class to be importable — TODO confirm this
            # delay-load path is exercised.
            self.cfg_only = aimv2_config
            self.vision_tower = AutoModel._from_config(aimv2_config)  # dummy-load


    def load_model(self):
        """Download processor + weights from the hub (remote code) and freeze."""
        from transformers import AutoConfig, AutoModel, AutoProcessor
        self.image_processor = AutoProcessor.from_pretrained(self.image_tower_name, trust_remote_code=True)
        self.vision_tower = AutoModel.from_pretrained(self.image_tower_name, trust_remote_code=True)
        self.vision_tower.requires_grad_(False)
        # Normalize crop_size to a plain int (the 448 shortest edge).
        self.image_processor.crop_size = self.image_processor.size["shortest_edge"]

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Prepend a copy of the first token, yielding a cls+patch layout."""
        assert self.select_feature == 'cls_patch'
        image_features = torch.cat([image_forward_outs[:, :1, :], image_forward_outs], dim=1)
        return image_features

    def forward(self, images):
        """Encode a batched tensor, or a list of per-image tensors one by one.

        Returns features cast back to the input dtype.
        """
        if type(images) is list:
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True
                                                      ,return_dict=True,)
                image_feature = self.feature_select(image_forward_out.last_hidden_state).to(image.dtype)
                image_features.append(image_feature)
        else:
            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True
                                                   ,return_dict=True,)
            image_features = self.feature_select(image_forward_outs.last_hidden_state).to(images.dtype)

        return image_features

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Real config once loaded, otherwise the static fallback.
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches(self):
        # (448 / 14)^2 patches per image for the inlined config.
        return (self.config.image_size // self.config.patch_size) ** 2
preprocessor_config.json CHANGED
@@ -1,3 +1,6 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:580004e4d551f368ffe2173ef7da9a20c99d5d5fb4145d8f07a02d5ae6ef5ffa
3
- size 131
 
 
 
 
1
+ {
2
+ "processor_class": "ValleyProcessor",
3
+ "auto_map": {
4
+ "AutoProcessor": "processing_valley.ValleyProcessor"
5
+ }
6
+ }
processing_valley.py CHANGED
@@ -1,3 +1,618 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:211d2edfcd4c52e98f5192298409adfa77d6ae03a969b97112f6214d0b538600
3
- size 25837
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import types
3
+ import io
4
+ import torch
5
+ import os
6
+ from PIL import Image
7
+ import argparse
8
+ from qwen_vl_utils import fetch_image
9
+
10
+ from transformers import (
11
+ ProcessorMixin,
12
+ SiglipImageProcessor,
13
+ BatchFeature,
14
+ Qwen2VLImageProcessor,
15
+ PreTrainedTokenizer,
16
+ AutoImageProcessor,
17
+ CLIPImageProcessor,
18
+ )
19
+
20
+ from .utils import (
21
+ process_anyres_image,
22
+ preprocess_image_ovis,
23
+ ovis_template_process,
24
+ BLACK_IMG_ENV,
25
+ DEFAULT_IM_END_TOKEN,
26
+ DEFAULT_IM_START_TOKEN,
27
+ DEFAULT_IMAGE_TOKEN,
28
+ DEFAULT_VI_END_TOKEN,
29
+ DEFAULT_VI_START_TOKEN,
30
+ DEFAULT_VIDEO_TOKEN,
31
+ IMAGE_TOKEN_INDEX,
32
+ SEQ_MAX_LEN,
33
+ IGNORE_INDEX,
34
+ )
35
+
36
# Inlined image-processor configs, so the processor works without fetching
# separate preprocessor_config.json files for each backbone.

# SigLip: fixed 384x384 resize, mean/std 0.5 normalization.
siglip_processor_config = {
    "do_normalize": True,
    "do_rescale": True,
    "do_resize": True,
    "image_mean": [
        0.5,
        0.5,
        0.5
    ],
    "image_processor_type": "SiglipImageProcessor",
    "image_std": [
        0.5,
        0.5,
        0.5
    ],
    "processor_class": "SiglipProcessor",
    "resample": 3,
    "rescale_factor": 0.00392156862745098,  # 1/255
    "size": {
        "height": 384,
        "width": 384
    }
}

# Qwen2-VL: variable resolution between min/max pixel budget, CLIP mean/std.
qwen2vl_processor_config = {
    "min_pixels": 3136,
    "max_pixels": 12845056,
    "patch_size": 14,
    "temporal_patch_size": 2,
    "merge_size": 2,
    "image_mean": [
        0.48145466,
        0.4578275,
        0.40821073
    ],
    "image_std": [
        0.26862954,
        0.26130258,
        0.27577711
    ],
    "image_processor_type": "Qwen2VLImageProcessor",
    "processor_class": "Qwen2VLProcessor"
}

# AIMv2: CLIP-style 448 shortest-edge resize + center crop.
aimv2_processor_config = {
    "crop_size": {
        "height": 448,
        "width": 448
    },
    "do_center_crop": True,
    "do_convert_rgb": True,
    "do_normalize": True,
    "do_rescale": True,
    "do_resize": True,
    "image_mean": [
        0.48145466,
        0.4578275,
        0.40821073
    ],
    "image_processor_type": "CLIPImageProcessor",
    "image_std": [
        0.26862954,
        0.26130258,
        0.27577711
    ],
    "resample": 3,
    "rescale_factor": 0.00392156862745098,  # 1/255
    "size": {
        "shortest_edge": 448
    }
}
107
+
108
+
109
class ValleyProcessor(ProcessorMixin):
    """Processor for the Valley multimodal model: tokenizes conversations and
    preprocesses images for the SigLip, Qwen2-VL (NaViT) and AIMv2 towers.
    """
    attributes = ["tokenizer"]
    # Config knobs persisted with the processor (see __init__ defaults).
    optional_attributes = [
        "max_pixels",
        "min_pixels",
        "anyres",
        "only_crop_single_image",
        "grid_pinpoints",
        "use_special_start_end_token",
        "only_navit",
        "chat_template",
        "process_mode",
    ]
    tokenizer_class = "AutoTokenizer"

    def __init__(self, tokenizer=None, chat_template=None, **kwargs):
        """Build the three backbone-specific image processors from the inlined
        configs and read processing options from kwargs.

        Args:
            tokenizer: text tokenizer (AutoTokenizer-compatible).
            chat_template: optional chat template forwarded to ProcessorMixin.
        """
        super().__init__(tokenizer=tokenizer, chat_template=chat_template, **kwargs)
        self.black_img = BLACK_IMG_ENV
        self.siglip_image_processor = SiglipImageProcessor.from_dict(siglip_processor_config)
        self.qwen2vl_image_processor = Qwen2VLImageProcessor.from_dict(qwen2vl_processor_config)
        self.aimv2_image_processor = CLIPImageProcessor.from_dict(aimv2_processor_config)
        # anyres: tile large images into multiple crops (LLaVA-anyres style).
        self.anyres = kwargs.get("anyres", True)
        self.grid_pinpoints = kwargs.get("grid_pinpoints", "(1x1),...,(3x3)")
        # Only apply anyres tiling when the sample has a single image.
        self.only_crop_single_image = kwargs.get("only_crop_single_image", True)
        self.use_special_start_end_token = kwargs.get("use_special_start_end_token", True)
        self.only_navit = kwargs.get("only_navit", False)
        # Selects the conversation-formatting routine in __call__.
        self.process_mode = kwargs.get("process_mode", "qwen3")

        self.aimv2_crop_size = self.aimv2_image_processor.size["shortest_edge"]
138
+
139
+
140
    def preprocess_images_siglip(self, images) -> torch.FloatTensor:
        """Preprocess images for the SigLip tower.

        Args:
            images: list of file paths, PIL images, or raw bytes (homogeneous;
                the first element's type decides the branch).
        Returns:
            anyres=False: a single stacked tensor (N, C, H, W);
            anyres=True: a list of tensors, one per image, each stacking that
            image's crops along dim 0.
        """
        if isinstance(images[0], str):
            images_pil = [Image.open(img).convert("RGB") for img in images]
        elif isinstance(images[0], Image.Image):
            images_pil = [img.convert("RGB") for img in images]
        elif isinstance(images[0], bytes):
            images_pil = [Image.open(io.BytesIO(img)).convert("RGB") for img in images]
        else:
            raise ValueError("unsupported type")

        processed_images = []
        have_multi_images = len(images_pil) > 1
        for img in images_pil:
            if self.anyres:
                # Tile into multiple crops unless this is a multi-image sample
                # and only_crop_single_image restricts tiling to single images.
                if not self.only_crop_single_image or not have_multi_images:
                    image = process_anyres_image(img, self.siglip_image_processor, self.grid_pinpoints)
                else:
                    image = [self.siglip_image_processor(img, return_tensors="pt")["pixel_values"][0]]
            else:
                image = self.siglip_image_processor(img, return_tensors="pt")["pixel_values"][0]

            processed_images.append(image)

        if not self.anyres:
            return torch.stack(processed_images, dim=0)
        else:
            return [torch.stack(img, dim=0) for img in processed_images]
167
+
168
    def preprocess_images_qwen2vl(self, images) -> dict:
        """Preprocess images for the Qwen2-VL (NaViT) tower.

        Args:
            images: list of file paths, PIL images, or raw bytes.
        Returns:
            The Qwen2VLImageProcessor output dict (pixel_values, grid_thw, ...)
            with an extra "image_sizes" entry: [[(w, h), ...]] of the original
            PIL sizes.
        """
        if isinstance(images[0], str):
            images_pil = [Image.open(img).convert("RGB") for img in images]
        elif isinstance(images[0], Image.Image):
            images_pil = [img.convert("RGB") for img in images]
        elif isinstance(images[0], bytes):
            images_pil = [Image.open(io.BytesIO(img)).convert("RGB") for img in images]
        else:
            raise ValueError("unsupported type")

        image_sizes = [[x.size for x in images_pil]]
        # fetch_image applies qwen_vl_utils' smart resize to the pixel budget.
        data_dict_qwen2vl = self.qwen2vl_image_processor(
            [fetch_image({"image": img}) for img in images_pil],
            return_tensors="pt"
        )

        data_dict_qwen2vl["image_sizes"] = image_sizes

        return data_dict_qwen2vl
187
+
188
+ def preprocess_multimodal(self, conversations):
189
+ for sentence in conversations:
190
+ if sentence["role"] == "system":
191
+ continue
192
+ segs = re.split(DEFAULT_IMAGE_TOKEN, sentence["content"])
193
+ if self.use_special_start_end_token:
194
+ sentence["content"] = (DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN).join(segs)
195
+ else:
196
+ sentence["content"] = DEFAULT_IMAGE_TOKEN.join(segs)
197
+
198
+ return conversations
199
+
200
    def preprocess_images_aimv2(self, images) -> torch.FloatTensor:
        """Preprocess images for the AIMv2 tower (Ovis-style tiling).

        Args:
            images: list of file paths, PIL images, or raw bytes.
        Returns:
            A pair: (list of (tensor, placeholders) per image, [sizes]) where
            placeholders are the Ovis image-placeholder token ids and sizes
            are the original (w, h) PIL sizes. With anyres, each image's crop
            tensors are concatenated along dim 0.
        """
        processed_images = []
        image_sizes_list = []
        have_multi_images = len(images) > 1
        for image_file in images:
            if isinstance(image_file, str):
                img = Image.open(image_file).convert("RGB")
            elif isinstance(image_file, Image.Image):
                img = image_file.convert("RGB")
            elif isinstance(image_file, bytes):
                img = Image.open(io.BytesIO(image_file)).convert("RGB")
            else:
                raise ValueError("unsupported type")
            image_sizes_list.append(img.size)
            # max_partition=9 allows up to 9 tiles; 1 disables tiling. Tiling
            # is restricted to single-image samples when configured so.
            if self.anyres:
                if not self.only_crop_single_image or not have_multi_images:
                    img, ovis_image_placeholders = preprocess_image_ovis(img, image_processor=self.aimv2_image_processor, crop_size=self.aimv2_crop_size, max_partition=9)
                else:
                    img, ovis_image_placeholders = preprocess_image_ovis(img, image_processor=self.aimv2_image_processor, crop_size=self.aimv2_crop_size, max_partition=1)
            else:
                img, ovis_image_placeholders = preprocess_image_ovis(img, image_processor=self.aimv2_image_processor, crop_size=self.aimv2_crop_size, max_partition=1)
            img = (img, ovis_image_placeholders)
            processed_images.append(img)

        if not self.anyres:
            return [(img[0], img[1]) for img in processed_images], [image_sizes_list]
        else:
            return [(torch.cat(img[0], dim=0), img[1]) for img in processed_images], [image_sizes_list]
228
+
229
+
230
    def preprocess_qwen2(
        self,
        conversations,
        tokenizer: PreTrainedTokenizer,
        has_image: bool = False,
        inference: bool = False,
        only_mask_system: bool = False,
    ) -> dict:
        """Tokenize a qwen2-format conversation and build training labels.

        Labels mask (with -100) everything up to each assistant reply; with
        only_mask_system, only the prompt up to the user turn is masked.
        At inference the final (generation) round is left unterminated.

        Args:
            conversations: [{"role": ..., "content": ...}, ...]; element 0
                must be the system turn, then strictly alternating user /
                assistant turns.
            has_image: route tokenization through tokenizer_image_token so
                image placeholders become IMAGE_TOKEN_INDEX.
        Returns:
            {"input_ids": LongTensor, "labels": LongTensor} (flat, unbatched).
        """
        conv = types.SimpleNamespace(
            system="You are a helpful assistant.",
            roles=("user", "assistant"),
            version="qwen2",
            offset=0,
            sep="<|im_start|>",
            sep2="<|im_end|>\n",
        )

        # Check system prompt
        assert conversations[0]["role"] == "system"
        if conversations[0]["content"] == None:
            conversations[0]["content"] = conv.system  # use default system prompt

        # Check conversation sequence (user/assistant must alternate).
        for j, sentence in enumerate(conversations[1:]):
            role = sentence["role"]
            assert role == conv.roles[j % 2], "The conversation sequence is incorrect."

        conversation_str = tokenizer.apply_chat_template(conversations, tokenize=False, add_generation_prompt=inference)

        # Mask targets: split back into rounds on the <|im_end|>\n separator,
        # tokenize each round, and mask its prompt prefix with -100.
        rounds = conversation_str.split(conv.sep2)
        input_ids_ = torch.tensor([], dtype=torch.int64)
        targets_ = torch.tensor([], dtype=torch.int64)
        for i, rou in enumerate(rounds):
            if rou == "":
                continue
            # Re-append the separator except on the trailing generation prompt.
            if (not inference) or (i < (len(rounds) - 1)):
                rou += conv.sep2
            if has_image:
                cur_input_ids_ = self.tokenizer_image_token(rou, tokenizer, return_tensors='pt')
                input_ids_ = torch.cat([input_ids_, cur_input_ids_], dim=0)
                # mask_len = tokens of the round with the reply body stripped
                # (regex truncates at the role header), i.e. the prompt part.
                if only_mask_system:
                    mask_len = len(self.tokenizer_image_token(re.sub(rf'{conv.roles[0]}\n[\s\S]*', f'{conv.roles[0]}:', rou),
                                                              tokenizer))
                else:
                    mask_len = len(self.tokenizer_image_token(re.sub(rf'{conv.roles[1]}\n[\s\S]*', f'{conv.roles[1]}:', rou),
                                                              tokenizer))
                targets_ = torch.cat([targets_, torch.tensor([-100] * mask_len), cur_input_ids_[mask_len:]], dim=0)
            else:
                cur_input_ids_ = tokenizer(rou, return_tensors='pt')["input_ids"][0, :]
                input_ids_ = torch.cat([input_ids_, cur_input_ids_], dim=0)
                mask_len = len(tokenizer(re.sub(rf'{conv.roles[1]}\n[\s\S]*', rf'{conv.roles[1]}:', rou))["input_ids"][:])
                targets_ = torch.cat([targets_, torch.tensor([-100] * mask_len), cur_input_ids_[mask_len:]], dim=0)

        return {"input_ids": input_ids_, "labels": targets_}
285
+
286
+
287
    def preprocess_qwen3(
        self,
        conversations,
        tokenizer: PreTrainedTokenizer,
        has_image: bool = False,
        inference: bool = False,
        only_mask_system: bool = False,
        enable_thinking: bool = False,  # forwarded to apply_chat_template (Qwen3 thinking mode)
    ) -> dict:
        """Qwen3 variant of preprocess_qwen2.

        NOTE(review): identical to preprocess_qwen2 except for the
        `enable_thinking` flag passed to apply_chat_template — the two should
        eventually share one implementation.

        Returns {"input_ids": LongTensor, "labels": LongTensor}; labels mask
        prompt tokens with -100 (see preprocess_qwen2 for details).
        """
        conv = types.SimpleNamespace(
            system="You are a helpful assistant.",
            roles=("user", "assistant"),
            version="qwen3",
            offset=0,
            sep="<|im_start|>",
            sep2="<|im_end|>\n",
        )

        # Check system prompt
        assert conversations[0]["role"] == "system"
        if conversations[0]["content"] == None:
            conversations[0]["content"] = conv.system  # use default system prompt

        # Check conversation sequence (user/assistant must alternate).
        for j, sentence in enumerate(conversations[1:]):
            role = sentence["role"]
            assert role == conv.roles[j % 2], "The conversation sequence is incorrect."

        conversation_str = tokenizer.apply_chat_template(conversations, tokenize=False, add_generation_prompt=inference, enable_thinking=enable_thinking)

        # Mask targets (same scheme as preprocess_qwen2).
        rounds = conversation_str.split(conv.sep2)
        input_ids_ = torch.tensor([], dtype=torch.int64)
        targets_ = torch.tensor([], dtype=torch.int64)
        for i, rou in enumerate(rounds):
            if rou == "":
                continue
            if (not inference) or (i < (len(rounds) - 1)):
                rou += conv.sep2
            if has_image:
                cur_input_ids_ = self.tokenizer_image_token(rou, tokenizer, return_tensors='pt')
                input_ids_ = torch.cat([input_ids_, cur_input_ids_], dim=0)
                if only_mask_system:
                    mask_len = len(self.tokenizer_image_token(re.sub(rf'{conv.roles[0]}\n[\s\S]*', f'{conv.roles[0]}:', rou),
                                                              tokenizer))
                else:
                    mask_len = len(self.tokenizer_image_token(re.sub(rf'{conv.roles[1]}\n[\s\S]*', f'{conv.roles[1]}:', rou),
                                                              tokenizer))
                targets_ = torch.cat([targets_, torch.tensor([-100] * mask_len), cur_input_ids_[mask_len:]], dim=0)
            else:
                cur_input_ids_ = tokenizer(rou, return_tensors='pt')["input_ids"][0, :]
                input_ids_ = torch.cat([input_ids_, cur_input_ids_], dim=0)
                mask_len = len(tokenizer(re.sub(rf'{conv.roles[1]}\n[\s\S]*', rf'{conv.roles[1]}:', rou))["input_ids"][:])
                targets_ = torch.cat([targets_, torch.tensor([-100] * mask_len), cur_input_ids_[mask_len:]], dim=0)

        return {"input_ids": input_ids_, "labels": targets_}
348
+
349
+
350
    def preprocess_ovis2(
        self,
        source,  # conversation turns; must NOT include a system turn
        tokenizer: PreTrainedTokenizer,
        has_image: bool = False,
        inference: bool = False,
        only_mask_system: bool = False,
        video_len: int = 0,
    ):
        """Tokenize a conversation in the Ovis2 chat format.

        Handles two turn schemas: ShareGPT-style ({"from": "human"/"gpt",
        "value": ...}) and role-style ({"role": ..., "value": ...}). Image
        placeholders become IMAGE_TOKEN_INDEX; a <video> placeholder expands
        to `video_len` <image> lines. Only the LAST assistant header onward is
        labeled; everything before it is masked with IGNORE_INDEX.

        NOTE(review): the two branches are near-duplicates and the role-style
        branch reads message["value"] (not "content") — confirm callers use
        that key.

        Returns {"input_ids": LongTensor, "labels": LongTensor}.
        """
        judge_format = "from" in source[0].keys()  # ShareGPT vs role schema

        if judge_format:
            # Drop a trailing assistant turn (it would be regenerated).
            if source[-1]["from"] == "gpt":
                source = source[:-1]

            roles = {"human": 'user', "gpt": 'assistant'}
            input_ids = []
            labels = []
            messages = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            for message in source:
                if message["from"] == "human":
                    user = message["value"]
                    if '<image>' not in user and '<video>' not in user:
                        messages += f"<|im_start|>{roles['human']}\n" + user + "<|im_end|>\n"

                    if '<image>' in user:
                        messages += f"<|im_start|>{roles['human']}\n" + user + "<|im_end|>\n"

                    if '<video>' in user:
                        # Expand the video placeholder into one <image> per frame.
                        user = user.replace('<video>', '\n'.join(['<image>'] * video_len) + '\n')
                        messages += f"<|im_start|>{roles['human']}\n" + user + "<|im_end|>\n"


                elif message["from"] == "gpt":
                    assistant = message["value"]
                    messages += f"<|im_start|>{roles['gpt']}\n" + assistant + "<|im_end|>\n"
            if inference:
                # Open an assistant header for generation.
                messages += f"<|im_start|>{roles['gpt']}\n"
            else:
                messages = messages[:-1]  # remove the final '\n', keep <|im_end|> as the end

            # Tokenize around <image> placeholders, splicing in the image id.
            messages = messages.split('<image>')
            messages = [tokenizer.encode(m) for m in messages]
            for m in messages[:-1]:
                input_ids += m
                input_ids += [IMAGE_TOKEN_INDEX]
            input_ids += messages[-1]

            # mask last assistant: find the LAST assistant header and label
            # only the tokens after it.
            head_id = tokenizer.encode(f'<|im_start|>{roles["gpt"]}\n')
            last_id = None
            for i, id in enumerate(input_ids):
                if input_ids[i:i+len(head_id)] == head_id:
                    last_id = i+len(head_id)
                if i+len(head_id) > len(input_ids):
                    break

            assert last_id != None
            labels = len(input_ids) * [IGNORE_INDEX]
            labels[last_id:] = input_ids[last_id:]
            return {"input_ids": torch.tensor(input_ids), "labels": torch.tensor(labels)}

        else:
            # Role-style schema: same flow with "role" keys.
            if source[-1]["role"] == "assistant":
                source = source[:-1]

            input_ids = []
            labels = []
            messages = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            for message in source:
                if message["role"] == "user":
                    user = message["value"]
                    if '<image>' not in user and '<video>' not in user:
                        messages += f"<|im_start|>user\n" + user + "<|im_end|>\n"

                    if '<image>' in user:
                        messages += f"<|im_start|>user\n" + user + "<|im_end|>\n"

                    if '<video>' in user:
                        # Expand the video placeholder into one <image> per frame.
                        user = user.replace('<video>', '\n'.join(['<image>'] * video_len) + '\n')
                        messages += f"<|im_start|>user\n" + user + "<|im_end|>\n"

                elif message["role"] == "assistant":
                    assistant = message["value"]
                    messages += f"<|im_start|>assistant\n" + assistant + "<|im_end|>\n"
            if inference:
                messages += f"<|im_start|>assistant\n"
            else:
                messages = messages[:-1]  # remove the final '\n', keep <|im_end|> as the end

            messages = messages.split('<image>')
            messages = [tokenizer.encode(m) for m in messages]
            for m in messages[:-1]:
                input_ids += m
                input_ids += [IMAGE_TOKEN_INDEX]
            input_ids += messages[-1]

            # mask last assistant
            head_id = tokenizer.encode(f'<|im_start|>assistant\n')
            last_id = None
            for i, id in enumerate(input_ids):
                if input_ids[i:i+len(head_id)] == head_id:
                    last_id = i+len(head_id)
                if i+len(head_id) > len(input_ids):
                    break

            assert last_id != None
            labels = len(input_ids) * [IGNORE_INDEX]
            labels[last_id:] = input_ids[last_id:]
            return {"input_ids": torch.tensor(input_ids), "labels": torch.tensor(labels)}
469
+
470
+
471
+ def tokenizer_image_token(
472
+ self,
473
+ prompt,
474
+ tokenizer,
475
+ image_token_index=IMAGE_TOKEN_INDEX,
476
+ return_tensors=None,
477
+ ):
478
+ def split_with_token(string, token):
479
+ result = string.split(token)
480
+ for i in range(len(result) - 1):
481
+ result.insert(i * 2 + 1, token)
482
+ return result
483
+
484
+ if len(prompt) > SEQ_MAX_LEN:
485
+ raise ValueError("sequence is too long !!!")
486
+
487
+ prompt_chunks = split_with_token(prompt, DEFAULT_IMAGE_TOKEN)
488
+ input_ids, offset = ([tokenizer.bos_token_id], 1) if getattr(tokenizer,'bos_token',None) else ([], 0)
489
+ token2index = {DEFAULT_IMAGE_TOKEN: image_token_index}
490
+ for chunk in prompt_chunks:
491
+ if chunk in token2index:
492
+ input_ids.append(token2index[chunk])
493
+ else:
494
+ chunk_ids = tokenizer(chunk).input_ids
495
+ if chunk_ids[0] != getattr(tokenizer,'bos_token_id', None):
496
+ offset = 0
497
+ input_ids.extend(chunk_ids[offset:])
498
+
499
+ if return_tensors is not None:
500
+ if return_tensors == "pt":
501
+ return torch.tensor(input_ids, dtype=torch.long)
502
+ raise ValueError(f"Unsupported tensor type: {return_tensors}")
503
+ return input_ids
504
+
505
+
506
+
507
    def __call__(self, messages, inference=True, **kwargs) -> BatchFeature:
        """Preprocess one multimodal sample into a batch-of-1 ``BatchFeature``.

        Args:
            messages: dict with an optional "images" entry (a path/str, a list,
                or empty — an all-black placeholder image is used when absent)
                and a "conversations" list, in either role/content or
                from/value form.
            inference: when True, the sample is prepared for generation (in the
                qwen branches the last message must come from the user).
            **kwargs: per-call overrides — ``video_len`` (ovis2), ``max_pixels``
                / ``min_pixels`` (qwen2/qwen3), ``enable_thinking`` (qwen3).

        Returns:
            BatchFeature containing batched ``input_ids``/``labels``/``images``
            (batch size 1), plus qwen2-vl pixel data in the qwen branches.

        Raises:
            ValueError: if ``self.process_mode`` is not "ovis2"/"qwen2"/"qwen3".
        """
        process_mode = self.process_mode
        if process_mode == "ovis2":
            video_len = kwargs.get('video_len', 0)

            # Fall back to the built-in black placeholder when no image is given;
            # normalize a single image to a one-element list.
            if "images" not in messages or not messages["images"] or not messages["images"][0]:
                images = [self.black_img]
            elif type(messages["images"]) == str:
                images = [messages["images"]]
            else:
                images = messages["images"]

            conversations = messages["conversations"]

            # Adapt role/content (user-assistant) format to from/value (human-gpt).
            if "role" in conversations[0]:
                new_conversations = []
                for conversation in conversations:
                    if conversation["role"] == "system":
                        new_conversations.append({"from": "system", "value": conversation["content"]})
                    elif conversation["role"] == "user":
                        new_conversations.append({"from": "human", "value": conversation["content"]})
                    elif conversation["role"] == "assistant":
                        new_conversations.append({"from": "gpt", "value": conversation["content"]})
                conversations = new_conversations

            # Prepend one <image> marker per image to the first non-system turn,
            # unless the prompt already carries markers.
            first_conv = conversations[1] if conversations[0]["from"] == "system" else conversations[0]
            if images and "<image>" not in first_conv["value"]:
                image_token = "\n".join(["<image>"] * len(images))
                first_conv["value"] = f"{image_token}\n{first_conv['value']}"

            data_dict = self.preprocess_ovis2(conversations, self.tokenizer, has_image=True, only_mask_system=False, inference=inference, video_len=video_len)
            data_dict['images'], data_dict['image_sizes'] = self.preprocess_images_aimv2(images)
            # Splice per-image placeholder ids into input_ids/labels.
            data_dict = ovis_template_process(data_dict)
            # Wrap as a batch of size 1.
            data_dict['images'] = [data_dict['images']]
            data_dict['input_ids'] = data_dict['input_ids'].unsqueeze(0)
            return BatchFeature(data={**data_dict})

        elif process_mode == "qwen2" or process_mode == "qwen3":
            # NOTE(review): these overrides mutate the shared image processor,
            # so they persist across calls — confirm this is intended.
            max_pixels=kwargs.get("max_pixels", self.max_pixels)
            min_pixels=kwargs.get("min_pixels", self.min_pixels)
            if max_pixels is not None:
                self.qwen2vl_image_processor.max_pixels = max_pixels
            if min_pixels is not None:
                self.qwen2vl_image_processor.min_pixels = min_pixels

            # Deal with images (same placeholder/normalization as above).
            if "images" not in messages or not messages["images"] or not messages["images"][0]:
                images = [self.black_img]
            elif type(messages["images"]) == str:
                images = [messages["images"]]
            else:
                images = messages["images"]

            # Deal with conversations: guarantee a leading system turn.
            conversations = messages["conversations"]
            if conversations[0]["role"] != "system":
                conversations = [{"role":"system", "content": None}] + conversations # dummy system prompt

            # Insert special token `<image>` into the first user turn.
            assert conversations[1]["role"] == "user"
            if images and "<image>" not in conversations[1]["content"]:
                image_token = " ".join(["<image>"] * len(images))
                conversations[1]["content"] = f"{image_token}\n{conversations[1]['content']}"

            # NOTE(review): the assertion checks role == "user" but its message
            # says "assistant" — the message text looks wrong; verify intent.
            if inference:
                assert conversations[-1]["role"] == "user", "the last message should be assistant if inference=True"

            # Image preprocess: siglip features are skipped in navit-only mode.
            if self.only_navit:
                precessed_images_siglip = None
            else:
                precessed_images_siglip = self.preprocess_images_siglip(images)
            processed_data_dict_qwen2vl = self.preprocess_images_qwen2vl(images)
            source = self.preprocess_multimodal(conversations)
            if process_mode == "qwen2":
                data_dict = self.preprocess_qwen2(source, self.tokenizer, has_image=True, only_mask_system=False, inference=inference)
            if process_mode == "qwen3":
                # ZYF Modify to support thinking
                enable_thinking = kwargs.get("enable_thinking", True) # enabled by default
                data_dict = self.preprocess_qwen3(source, self.tokenizer, has_image=True, only_mask_system=False, inference=inference, enable_thinking=enable_thinking)
            # Construct batch data (batch_size = 1).
            data_dict["input_ids"] = data_dict["input_ids"].unsqueeze(0)
            data_dict["labels"] = data_dict["labels"].unsqueeze(0)
            data_dict["images"] = [precessed_images_siglip]

            return BatchFeature(data={**data_dict, **processed_data_dict_qwen2vl})
        else:
            raise ValueError(f"Unsupported process mode: {process_mode}")
603
+
604
+ def batch_decode(self, *args, **kwargs):
605
+ """
606
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
607
+ refer to the docstring of this method for more information.
608
+ """
609
+ return self.tokenizer.batch_decode(*args, **kwargs)
610
+
611
+
612
+ def decode(self, *args, **kwargs):
613
+ """
614
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
615
+ the docstring of this method for more information.
616
+ """
617
+ return self.tokenizer.decode(*args, **kwargs)
618
+
special_tokens_map.json CHANGED
@@ -1,3 +1,37 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8311bb5d0985329c5f9af218ea4b325fa39ab2b5a4655181ec65fc1a9f2702c7
3
- size 709
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>",
16
+ "<im_start>",
17
+ "<im_end>",
18
+ "<vi_start>",
19
+ "<vi_end>",
20
+ "<cor>",
21
+ "<\\cor>"
22
+ ],
23
+ "eos_token": {
24
+ "content": "<|im_end|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<|endoftext|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer_config.json CHANGED
@@ -1,3 +1,298 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:eeb1cc1135873700e48cddc62037d02509513ddd52affce419806f26edbdf5f6
3
- size 6828
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "151669": {
214
+ "content": "<im_start>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<im_end>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "151671": {
230
+ "content": "<vi_start>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "151672": {
238
+ "content": "<vi_end>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "151673": {
246
+ "content": "<cor>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "151674": {
254
+ "content": "<\\cor>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ }
261
+ },
262
+ "additional_special_tokens": [
263
+ "<|im_start|>",
264
+ "<|im_end|>",
265
+ "<|object_ref_start|>",
266
+ "<|object_ref_end|>",
267
+ "<|box_start|>",
268
+ "<|box_end|>",
269
+ "<|quad_start|>",
270
+ "<|quad_end|>",
271
+ "<|vision_start|>",
272
+ "<|vision_end|>",
273
+ "<|vision_pad|>",
274
+ "<|image_pad|>",
275
+ "<|video_pad|>",
276
+ "<im_start>",
277
+ "<im_end>",
278
+ "<vi_start>",
279
+ "<vi_end>",
280
+ "<cor>",
281
+ "<\\cor>"
282
+ ],
283
+ "auto_map": {
284
+ "AutoProcessor": "/mnt/bn/ecomcommonnas/zhangshuo/easyguard/checkpoints/VALLEY_B8_V1_GTHINKER_ENABLE_THINKING_COLD_START_V0908_MERGED_V1/checkpoint-400--processing_valley.ValleyProcessor"
285
+ },
286
+ "bos_token": null,
287
+ "clean_up_tokenization_spaces": false,
288
+ "eos_token": "<|im_end|>",
289
+ "errors": "replace",
290
+ "extra_special_tokens": {},
291
+ "model_max_length": 4096,
292
+ "pad_token": "<|endoftext|>",
293
+ "padding_side": "right",
294
+ "processor_class": "ValleyProcessor",
295
+ "split_special_tokens": false,
296
+ "tokenizer_class": "Qwen2Tokenizer",
297
+ "unk_token": null
298
+ }
utils.py CHANGED
@@ -1,3 +1,409 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b00b4131a3462f2ac50dd86d99aaafd0d21e98f24bf843f349d622cbe0d2bcb2
3
- size 16377
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ from io import BytesIO
3
+ import base64
4
+ import math
5
+ import ast
6
+ import re
7
+ import torch
8
+ from transformers import StoppingCriteria
9
+
10
+ IGNORE_INDEX = -100
11
+ IMAGE_TOKEN_INDEX = -200
12
+ GANDALF_TOKEN_INDEX = -300
13
+ DEFAULT_PAD_TOKEN = "[PAD]"
14
+ DEFAULT_EOS_TOKEN = "</s>"
15
+ DEFAULT_BOS_TOKEN = "</s>"
16
+ DEFAULT_UNK_TOKEN = "<unk>"
17
+ DEFAULT_IMAGE_TOKEN = "<image>"
18
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
19
+ DEFAULT_IM_START_TOKEN = "<im_start>"
20
+ DEFAULT_IM_END_TOKEN = "<im_end>"
21
+ DEFAULT_VIDEO_TOKEN = "<video>"
22
+ DEFAULT_VIDEO_FRAME_TOKEN = "<vi_frame>"
23
+ DEFAULT_VI_START_TOKEN = "<vi_start>"
24
+ DEFAULT_VI_END_TOKEN = "<vi_end>"
25
+ DEFAULT_EOC_TOKEN = "<eoc>"
26
+ COR_START_TOKEN = "<cor>"
27
+ COR_END_TOKEN = "<\cor>"
28
+ SEQ_MAX_LEN = 50000
29
+ BLACK_IMG_ENV = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x03\x00\x00\x00\x03\x08\x02\x00\x00\x00\xd9J"\xe8\x00\x00\x00\x12IDAT\x08\x1dcd\x80\x01F\x06\x18`d\x80\x01\x00\x00Z\x00\x04we\x03N\x00\x00\x00\x00IEND\xaeB`\x82'
30
+
31
+
32
def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
    """
    Calculate the shape of the image patch grid after any-resolution preprocessing.
    Args:
        image_size (tuple): The size of the input image in the format (width, height).
        grid_pinpoints (str | list): Possible resolutions — either a literal list
            (or its string repr) or a range string such as "(1x1),...,(3x3)".
        patch_size (int): The size of each image patch.
    Returns:
        tuple: The shape of the image patch grid in the format (width, height).
    """
    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
        assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
        # Expand the "(AxB),...,(CxD)" range notation into an inclusive grid of
        # (rows, cols) pairs, then scale each pair to pixel resolutions.
        found = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
        lo = tuple(map(int, found[0]))
        hi = tuple(map(int, found[-1]))
        grid_pinpoints = [
            [rows * patch_size, cols * patch_size]
            for rows in range(lo[0], hi[0] + 1)
            for cols in range(lo[1], hi[1] + 1)
        ]
    possible_resolutions = grid_pinpoints if type(grid_pinpoints) is list else ast.literal_eval(grid_pinpoints)
    width, height = select_best_resolution(image_size, possible_resolutions)
    return width // patch_size, height // patch_size
62
+
63
def select_best_resolution(original_size, possible_resolutions):
    """
    Select the best-fitting resolution for an image from a list of candidates.

    A candidate is preferred if it preserves more of the original pixels after
    an aspect-ratio-preserving downscale ("effective resolution"); ties are
    broken by wasting the least canvas area.

    Args:
        original_size (tuple): Original image size as (width, height).
        possible_resolutions (list): Candidates as [(w1, h1), (w2, h2), ...].
    Returns:
        tuple: The best fit resolution in the format (width, height).
    """
    orig_w, orig_h = original_size
    orig_area = orig_w * orig_h
    best_fit = None
    best_effective = 0
    best_wasted = float("inf")

    for cand_w, cand_h in possible_resolutions:
        # Aspect-ratio-preserving downscale into the candidate canvas.
        scale = min(cand_w / orig_w, cand_h / orig_h)
        down_w = int(orig_w * scale)
        down_h = int(orig_h * scale)

        # Pixels actually carrying image content, capped at the original area.
        effective = min(down_w * down_h, orig_area)
        wasted = (cand_w * cand_h) - effective

        if effective > best_effective or (effective == best_effective and wasted < best_wasted):
            best_effective = effective
            best_wasted = wasted
            best_fit = (cand_w, cand_h)

    return best_fit
94
+
95
+
96
def unpad_image(tensor, original_size):
    """
    Remove the symmetric padding from a padded-and-resized image tensor.

    Args:
        tensor (torch.Tensor): Image tensor in CxHxW layout.
        original_size (tuple): Pre-padding image size as (width, height).
    Returns:
        torch.Tensor: The tensor with the padded rows or columns cropped away.
    """
    orig_w, orig_h = original_size
    cur_h, cur_w = tensor.shape[1:]

    # Compare aspect ratios to decide which axis received the padding.
    if orig_w / orig_h > cur_w / cur_h:
        # Original is wider: padding was added above and below.
        scale = cur_w / orig_w
        content_h = int(orig_h * scale)
        margin = (cur_h - content_h) // 2
        return tensor[:, margin: cur_h - margin, :]

    # Original is taller (or equal): padding was added left and right.
    scale = cur_h / orig_h
    content_w = int(orig_w * scale)
    margin = (cur_w - content_w) // 2
    return tensor[:, :, margin: cur_w - margin]
127
+
128
+
129
def process_anyres_image(image, processor, grid_pinpoints):
    """
    Process an image with variable resolutions into a list of patch tensors.

    The image is resized/padded to the best candidate resolution, cut into
    patches of the processor's patch size, and a full-image resize is
    prepended; every piece is then run through ``processor.preprocess``.

    Args:
        image (PIL.Image.Image): The input image to be processed.
        processor: The image processor object (must expose ``size`` and
            ``preprocess``).
        grid_pinpoints (str | list): Possible resolutions — either a literal
            list (or its string repr) or a range string such as "(1x1),...,(3x3)".
    Returns:
        list[torch.Tensor]: The processed image patches (overview first).
    """
    # Convert grid_pinpoints from string to list
    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
        try:
            patch_size = processor.size["height"]
        except Exception:
            patch_size = processor.size["shortest_edge"]
        assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
        # Use regex to extract the range from the input string
        matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
        range_start = tuple(map(int, matches[0]))
        range_end = tuple(map(int, matches[-1]))
        # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
        grid_pinpoints = [
            (i, j)
            for i in range(range_start[0], range_end[0] + 1)
            for j in range(range_start[1], range_end[1] + 1)
        ]
        # Multiply all elements by patch_size
        grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]

    if type(grid_pinpoints) is list:
        possible_resolutions = grid_pinpoints
    else:
        possible_resolutions = ast.literal_eval(grid_pinpoints)
    best_resolution = select_best_resolution(image.size, possible_resolutions)
    image_padded = resize_and_pad_image(image, best_resolution)

    patches = divide_to_patches(image_padded, processor.size["height"])

    # FIXME: this seems to be a bug that it resizes instead of pad.
    # but to keep it consistent with previous, i will keep it as it is
    # TODO: uncomment below to ablate with the padding
    if isinstance(processor.size, dict):
        shortest_edge = processor.size["height"]
    else:
        shortest_edge = min(processor.size)
    # Overview image: a plain (aspect-distorting) square resize of the original.
    image_original_resize = image.resize((shortest_edge, shortest_edge))
    # image_padded_square = expand2square(image, tuple(int(x*255) for x in processor.image_mean))

    image_patches = [image_original_resize] + patches
    image_patches = [
        processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0]
        for image_patch in image_patches
    ]
    # return torch.stack(image_patches, dim=0)
    return image_patches
185
+
186
def resize_and_pad_image(image, target_resolution):
    """
    Resize an image to fit inside a target resolution (preserving aspect
    ratio) and center it on a black canvas of exactly that resolution.

    Args:
        image (PIL.Image.Image): The input image.
        target_resolution (tuple): Target (width, height) of the output.
    Returns:
        PIL.Image.Image: The resized and padded image.
    """
    src_w, src_h = image.size
    tgt_w, tgt_h = target_resolution

    # The limiting axis determines the uniform scale factor; the other axis is
    # rounded up and clamped so the content never exceeds the canvas.
    scale = min(tgt_w / src_w, tgt_h / src_h)
    new_w = min(math.ceil(src_w * scale), tgt_w)
    new_h = min(math.ceil(src_h * scale), tgt_h)

    resized = image.resize((new_w, new_h))

    # Center the resized content on a black target-sized canvas.
    canvas = Image.new("RGB", (tgt_w, tgt_h), (0, 0, 0))
    canvas.paste(resized, ((tgt_w - new_w) // 2, (tgt_h - new_h) // 2))
    return canvas
221
+
222
def divide_to_patches(image, patch_size):
    """
    Divide an image into square patches, scanning left-to-right, top-to-bottom.

    Args:
        image (PIL.Image.Image): The input image.
        patch_size (int): The side length of each patch.
    Returns:
        list: PIL.Image.Image patches (edge patches may extend past the image;
        PIL crops them as-is).
    """
    width, height = image.size
    boxes = [
        (left, top, left + patch_size, top + patch_size)
        for top in range(0, height, patch_size)
        for left in range(0, width, patch_size)
    ]
    return [image.crop(box) for box in boxes]
240
+
241
+
242
+ from typing import List
243
+ import PIL.Image
244
+ import torch
245
+ import transformers
246
+ IGNORE_ID = -100
247
+ IMAGE_TOKEN_ID = -200
248
+ IMAGE_TOKEN = "<image>"
249
+ IMAGE_ATOM_ID = -300
250
+ IMAGE_INDICATOR_IDS = [-301, -302, -303, -304, -305]
251
+
252
+
253
def construct_image_placeholders(grid):
    """Build the placeholder-id sequence for one image split into a (rows, cols) grid.

    The sequence always starts with begin/atom/separator indicator ids for the
    overview image; when the grid has more than one cell, one atom id per cell
    follows, with column separators inside a row and row separators between
    rows. A terminating indicator id closes the sequence.
    """
    rows, cols = grid
    placeholders = [IMAGE_INDICATOR_IDS[0], IMAGE_ATOM_ID, IMAGE_INDICATOR_IDS[1]]
    if rows * cols > 1:
        for r in range(rows):
            for c in range(cols):
                placeholders.append(IMAGE_ATOM_ID)
                if c < cols - 1:
                    placeholders.append(IMAGE_INDICATOR_IDS[2])
            if r < rows - 1:
                placeholders.append(IMAGE_INDICATOR_IDS[3])
    placeholders.append(IMAGE_INDICATOR_IDS[4])
    return placeholders
265
+
266
+
267
def preprocess_image_ovis(image: PIL.Image.Image, image_processor, crop_size, max_partition=9, covering_threshold=0.9, convert_to_rgb=True):
    """Preprocess one image Ovis-style: pick the best tiling grid, crop tiles,
    and preprocess each tile to a square of side ``crop_size``.

    Args:
        image: input PIL image.
        image_processor: HF-style processor exposing ``preprocess(..., size=...)``.
        crop_size: side length of each square tile.
        max_partition: maximum number of tiles (rows * cols <= max_partition).
        covering_threshold: minimum covering ratio for a grid to count as "good".
        convert_to_rgb: convert non-RGB inputs to RGB first.

    Returns:
        tuple: (list of per-tile pixel-value tensors, placeholder id sequence
        from ``construct_image_placeholders``).

    Raises:
        ValueError: if the derived tile size is non-square (cannot happen with
            the current ``[crop_size, crop_size]`` construction).
    """
    def _preprocess(img: PIL.Image.Image, side):
        # Resize so the longer edge equals `side`, preprocess, then center-pad
        # the result into a `side` x `side` square of zeros.
        # first resize and preprocess
        w, h = img.size
        if w == h:
            new_width = new_height = side
        elif w > h:
            new_width = side
            new_height = int(h / w * new_width)
        else:
            new_height = side
            new_width = int(w / h * new_height)
        new_size = dict(height=new_height, width=new_width)
        pixel_values = image_processor.preprocess(img, size=new_size, return_tensors='pt')['pixel_values']

        # then pad to square
        square_values = torch.zeros([1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device)
        new_height, new_width = pixel_values.shape[2:]
        if new_height == new_width:
            square_values[:, :, :, :] = pixel_values
        elif new_height > new_width:
            from_index = (side - new_width) // 2
            square_values[:, :, :, from_index:from_index + new_width] = pixel_values
        else:
            from_index = (side - new_height) // 2
            square_values[:, :, from_index:from_index + new_height, :] = pixel_values

        return square_values

    def _partition(img, grid):
        # Split the image into grid[0] x grid[1] crop boxes; the last row/column
        # absorbs any remainder so the whole image is covered.
        w, h = img.size
        row_height = h // grid[0]
        col_width = w // grid[1]

        partition = []
        for row in range(grid[0]):
            for col in range(grid[1]):
                left = col * col_width
                upper = row * row_height
                right = w if col == grid[1] - 1 else (col + 1) * col_width
                lower = h if row == grid[0] - 1 else (row + 1) * row_height
                partition.append((left, upper, right, lower))

        return partition

    def _covering_area(left, upper, right, lower, side):
        # Area of a crop after its longer edge is capped at `side`
        # (aspect-ratio-preserving shrink).
        w = right - left
        h = lower - upper
        w, h = max(w, h), min(w, h)
        if w > side:
            h = h / w * side
            w = side
        return w * h

    def _get_best_grid(img, side):
        # Score every (rows, cols) grid with rows*cols <= max_partition by how
        # much of the image its tiles cover at the target side length.
        img_area = img.size[0] * img.size[1]

        candidate_grids = []
        for i in range(1, max_partition + 1):
            for j in range(1, max_partition + 1):
                if i * j <= max_partition:
                    candidate_grids.append((i, j))

        all_grids = []
        good_grids = []
        for grid in candidate_grids:
            partition = _partition(img, grid)
            covering_ratio = sum([_covering_area(*p, side) for p in partition]) / img_area
            assert covering_ratio <= 1.0
            all_grids.append((grid, covering_ratio))
            if covering_ratio > covering_threshold:
                good_grids.append((grid, covering_ratio))

        if len(good_grids) > 0:
            # pick the good partition with minimum #sub_images and break the tie using covering_ratio
            return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][0]
        else:
            # pick the partition with maximum covering_ratio and break the tie using #sub_images
            return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]

    if convert_to_rgb and image.mode != 'RGB':
        image = image.convert('RGB')

    # sides = self.get_image_size()
    sides = [crop_size, crop_size]
    if sides[0] != sides[1]:
        raise ValueError('get_image_size() returns non-square size')
    side = sides[0]
    grid = _get_best_grid(image, side)
    partition = _partition(image, grid)
    crops = [image.crop(p) for p in partition]
    # When tiled, prepend the full image as an overview crop.
    if len(crops) > 1:
        crops.insert(0, image)
    # pixel_values = torch.cat([_preprocess(crop, side) for crop in crops], dim=0)
    pixel_values = [_preprocess(crop, side) for crop in crops] # cat in the outer function
    image_placeholders = construct_image_placeholders(grid)
    return pixel_values, image_placeholders
364
+
365
+
366
+
367
def ovis_template_process(data_dict):
    """Expand each IMAGE_TOKEN_ID in ``input_ids`` into its image's placeholder run.

    ``data_dict['images']`` is expected to be a list of
    (pixel_values, placeholder_ids) pairs as returned by
    ``preprocess_image_ovis``. After this call, ``data_dict['images']`` holds
    only the pixel values, and ``input_ids``/``labels`` are rebuilt with the
    placeholder ids spliced in; placeholder positions are masked with -100 in
    the labels so they do not contribute to the loss.
    """
    image = data_dict['images']
    input_ids = data_dict['input_ids']
    labels = data_dict['labels']
    placeholder = []
    new_input_ids = []
    new_labels = []
    # One placeholder-id sequence per image.
    for img in image:
        placeholder.append(img[1])

    # There must be exactly one image-token slot per image.
    indices = torch.nonzero(input_ids==IMAGE_TOKEN_ID).squeeze(1)
    assert len(placeholder) == len(indices)

    cnt = 0  # number of image slots expanded so far
    idx = 0  # read cursor into the original input_ids/labels
    for ids in input_ids:
        if ids == IMAGE_TOKEN_ID:
            # Replace the single image token with the full placeholder run,
            # masked out of the loss.
            for i in placeholder[cnt]:
                new_input_ids.append(i)
                new_labels.append(-100)
            cnt += 1
            idx += 1
        else:
            new_input_ids.append(input_ids[idx])
            new_labels.append(labels[idx])
            idx += 1

    assert len(new_input_ids) == len(new_labels)
    assert len(placeholder) == cnt

    data_dict['images'] = [img[0] for img in data_dict['images']] # (3,3,448,448)
    data_dict['input_ids'] = torch.tensor(new_input_ids)
    data_dict['labels'] = torch.tensor(new_labels)
    return data_dict
401
+
402
+
403
def pad_truncate_sequence(multimodal_max_length, sequences: List[torch.Tensor], batch_first: bool = True, padding_value: float = 0.0, left_padding: bool = False) -> torch.Tensor:
    """Pad a batch of 1-D sequences to equal length, then truncate to ``multimodal_max_length``.

    Args:
        multimodal_max_length: maximum sequence length to keep.
        sequences: list of 1-D tensors of (possibly) different lengths.
        batch_first: if True the result is (batch, seq); otherwise (seq, batch).
        padding_value: fill value for padded positions.
        left_padding: if True, pad on the left and keep the LAST
            ``multimodal_max_length`` positions of each sequence; otherwise pad
            on the right and keep the first ``multimodal_max_length`` positions.

    Returns:
        torch.Tensor: the padded and truncated batch.
    """
    if not left_padding:
        padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=batch_first, padding_value=padding_value)
        # Keep the head of each sequence, slicing the correct axis.
        return padded[:, :multimodal_max_length] if batch_first else padded[:multimodal_max_length]

    # Left padding: flip each sequence, right-pad, flip back so the padding
    # lands on the left, then keep the tail of each sequence.
    flipped = [seq.flip(dims=[0]) for seq in sequences]
    padded = torch.nn.utils.rnn.pad_sequence(flipped, batch_first=batch_first, padding_value=padding_value)
    if batch_first:
        # BUG FIX: the original sliced `[:, multimodal_max_length:]`, which
        # drops the head and yields an EMPTY tensor whenever the padded length
        # is <= multimodal_max_length; the intent is to keep the last
        # `multimodal_max_length` positions.
        return padded.flip(dims=[1])[:, -multimodal_max_length:]
    # BUG FIX: the original hard-coded batch_first=True in this branch,
    # ignoring the caller's `batch_first` argument.
    return padded.flip(dims=[0])[-multimodal_max_length:]