acsfid committed on
Commit
d00ea0a
·
verified ·
1 Parent(s): 2be8689

Upload PaddleOCR-VL split vision encoder artifacts

Browse files
README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: PaddleOCR
4
+ tags:
5
+ - PaddleOCR
6
+ - PaddleOCR-VL
7
+ - vision-encoder
8
+ - multimodal
9
+ - document-parsing
10
+ ---
11
+
12
+ # PaddleOCR-VL Split Vision Encoder
13
+
14
+ This repository contains the PaddleOCR-VL vision tower and projector, extracted from the full vision-language model (VLM) and uploaded as standalone artifacts.
15
+
16
+ ## Contents
17
+
18
+ - `vision_tower_config.json`
19
+ - `vision_tower.safetensors`
20
+ - `projector_config.json`
21
+ - `projector.safetensors`
22
+
23
+
24
+ ## Architecture
25
+
26
+ - Vision tower hidden size: `1152`
27
+ - Projector output hidden size: `1024`
28
+ - Target repo: `acsfid/PaddleOCR-VL-VisionEncoder`
29
+
30
+ ## Usage
31
+
32
+ ```python
33
+ from model.extracted_vision_encoder import PaddleOCRVLVisionTower, PaddleOCRVLProjector
34
+
35
+ artifact_dir = "."
36
+ vision_tower = PaddleOCRVLVisionTower.from_pretrained(artifact_dir)
37
+ projector = PaddleOCRVLProjector.from_pretrained(artifact_dir)
38
+ ```
39
+
40
+ The intended split flow is:
41
+
42
+ ```text
43
+ image_processor -> vision_tower -> projector -> decoder-ready image embeddings
44
+ ```
45
+
46
+ ## Included Python Source
47
+
48
+ This repo also includes the Python source files needed to load and use the split artifacts:
49
+
50
+ - `model/__init__.py`
51
+ - `model/configuration_paddleocr_vl.py`
52
+ - `model/image_processing_paddleocr_vl.py`
53
+ - `model/modeling_paddleocr_vl.py`
54
+ - `model/extracted_vision_encoder.py`
55
+ - `requirements.txt`
56
+
57
+ After cloning or downloading this repo, you can import the split classes directly for inference or for later training work.
58
+
59
+
model/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .extracted_vision_encoder import (
2
+ PaddleOCRVLProjector,
3
+ PaddleOCRVLVisionEncoder,
4
+ PaddleOCRVLVisionTower,
5
+ )
6
+
7
+ __all__ = [
8
+ "PaddleOCRVLVisionTower",
9
+ "PaddleOCRVLProjector",
10
+ "PaddleOCRVLVisionEncoder",
11
+ ]
model/configuration_paddleocr_vl.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from transformers.configuration_utils import PretrainedConfig
16
+ from transformers.modeling_rope_utils import rope_config_validation
17
+
18
class PaddleOCRVisionConfig(PretrainedConfig):
    """Vision sub-configuration of PaddleOCR-VL.

    It is nested under the ``vision_config`` key of the full model config
    (see ``base_config_key``). Every named constructor argument is stored
    verbatim as an attribute of the same name; any remaining keyword
    arguments are forwarded to ``PretrainedConfig``.
    """

    model_type = "paddleocr_vl"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size: int = 768,
        intermediate_size: int = 3072,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 12,
        num_channels: int = 3,
        image_size: int = 224,
        patch_size: int = 14,
        hidden_act: str = "gelu_pytorch_tanh",
        layer_norm_eps: float = 1e-6,
        attention_dropout: float = 0.0,
        spatial_merge_size: int = 2,
        temporal_patch_size: int = 2,
        tokens_per_second: int = 2,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Transformer dimensions.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Input image / patch geometry.
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size

        # Regularisation, normalisation and activation settings.
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

        # Patch merging / temporal settings.
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
54
+
55
+
56
+
57
class PaddleOCRVLConfig(PretrainedConfig):
    """
    Configuration class for the PaddleOCR-VL model.

    Stores the text-model hyperparameters together with a nested
    :class:`PaddleOCRVisionConfig` (attribute ``vision_config``).
    It inherits from PretrainedConfig and can be used to control model outputs.
    """

    model_type = "paddleocr_vl"
    keys_to_ignore_at_inference = ["past_key_values"]
    sub_configs = {"vision_config": PaddleOCRVisionConfig}

    # Default tensor parallel plan for base model `Qwen3`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=768,
        intermediate_size=11008,
        max_position_embeddings=32768,
        num_hidden_layers=2,
        num_attention_heads=2,
        image_token_id=101304,
        video_token_id=101305,
        vision_start_token_id=101306,
        rms_norm_eps=1e-6,
        use_cache=False,
        use_flash_attention=False,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        head_dim=128,
        hidden_act="silu",
        use_bias=False,
        rope_theta=10000,
        weight_share_add_bias=True,
        ignored_index=-100,
        attention_probs_dropout_prob=0.0,
        hidden_dropout_prob=0.0,
        compression_ratio: float = 1.0,
        num_key_value_heads=None,
        max_sequence_length=None,
        tie_word_embeddings=False,
        vision_config=None,
        rope_scaling=None,
        **kwargs,
    ):
        """
        Initialize configuration with default or specified parameters.

        Args:
            vocab_size (int): Size of the vocabulary (number of unique tokens)
            hidden_size (int): Dimensionality of the encoder layers and the pooler layer
            intermediate_size (int): Dimensionality of the "intermediate" (feed-forward) layer
            max_position_embeddings (int): Maximum sequence length the model can handle
            num_hidden_layers (int): Number of hidden layers in the Transformer encoder
            num_attention_heads (int): Number of attention heads for each attention layer
            image_token_id (int): Stored as ``image_token_id`` (presumably the image
                placeholder token; confirm against the tokenizer)
            video_token_id (int): Stored as ``video_token_id``
            vision_start_token_id (int): Stored as ``vision_start_token_id``
            rms_norm_eps (float): The epsilon used by the RMS normalization layers
            use_cache (bool): Whether to use caching for faster generation (decoding)
            use_flash_attention (bool): Whether to use FlashAttention for optimized attention computation
            pad_token_id (int): Token ID used for padding sequences
            bos_token_id (int): Token ID used for beginning-of-sequence
            eos_token_id (int): Token ID used for end-of-sequence
            head_dim (int): Stored as ``head_dim``
            hidden_act (str): Stored as ``hidden_act``
            use_bias (bool): Whether to use bias terms in linear layers
            rope_theta (float): The base period of the RoPE embeddings
            weight_share_add_bias (bool): Whether to share bias weights in certain layers
            ignored_index (int): Target value that is ignored during loss computation
            attention_probs_dropout_prob (float): Dropout probability for attention weights
            hidden_dropout_prob (float): Dropout probability for hidden layers
            compression_ratio (float): Ratio for KV cache compression (1.0 = no compression)
            num_key_value_heads (int): Number of key/value heads (for Grouped Query Attention)
            max_sequence_length (int): Maximum sequence length for positional embeddings
            tie_word_embeddings (bool): Forwarded to ``PretrainedConfig``
            vision_config (dict | PaddleOCRVisionConfig | None): Vision sub-config. A dict
                is expanded into ``PaddleOCRVisionConfig``, ``None`` yields the default
                vision config, and an existing config object is used as-is.
            rope_scaling (dict | None): RoPE scaling settings. A legacy ``"type"`` key is
                mirrored into ``"rope_type"`` (``"mrope"`` is mapped to ``"default"``).
            **kwargs: Additional keyword arguments passed to parent class
        """
        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()
        else:
            # An already-built config object used to be dropped silently,
            # leaving ``self.vision_config`` unset.
            self.vision_config = vision_config

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.use_flash_attention = use_flash_attention
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.vision_start_token_id = vision_start_token_id
        self.head_dim = head_dim
        self.hidden_act = hidden_act
        self.sliding_window = None
        self.use_bias = use_bias
        self.weight_share_add_bias = weight_share_add_bias
        self.rope_theta = rope_theta
        self.ignored_index = ignored_index
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.hidden_dropout_prob = hidden_dropout_prob
        self.compression_ratio = compression_ratio
        self.num_key_value_heads = num_key_value_heads
        self.max_sequence_length = max_sequence_length

        # Work on a copy so the caller's dict is not mutated by the
        # "type" -> "rope_type" normalisation below.
        self.rope_scaling = (
            dict(rope_scaling) if isinstance(rope_scaling, dict) else rope_scaling
        )
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            if self.rope_scaling["type"] == "mrope":
                self.rope_scaling["type"] = "default"
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self, ignore_keys={"mrope_section"})

        # Single parent initialisation. The previous code called
        # ``super().__init__`` twice, and the second call omitted the
        # pad/bos/eos ids, so the base class could reset them to its defaults.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
model/extracted_vision_encoder.py ADDED
@@ -0,0 +1,512 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
4
+
5
+ import numpy as np
6
+ import torch
7
+ from transformers.modeling_outputs import BaseModelOutputWithPooling
8
+ from transformers.processing_utils import BatchFeature
9
+
10
+ from .configuration_paddleocr_vl import PaddleOCRVLConfig
11
+ from .image_processing_paddleocr_vl import PaddleOCRVLImageProcessor
12
+ from .modeling_paddleocr_vl import PaddleOCRVisionModel, Projector
13
+
14
+
15
+ VISION_TOWER_CONFIG_NAME = "vision_tower_config.json"
16
+ VISION_TOWER_WEIGHTS_NAME = "vision_tower.safetensors"
17
+ PROJECTOR_CONFIG_NAME = "projector_config.json"
18
+ PROJECTOR_WEIGHTS_NAME = "projector.safetensors"
19
+ FULL_MODEL_CONFIG_NAME = "config.json"
20
+ FULL_MODEL_WEIGHTS_NAME = "model.safetensors"
21
+ FULL_VISUAL_PREFIX = "visual."
22
+ FULL_PROJECTOR_PREFIX = "mlp_AR."
23
+ STANDALONE_VISUAL_PREFIX = "visual."
24
+ STANDALONE_PROJECTOR_PREFIX = "projector."
25
+
26
+
27
+ def _read_json(path: Union[str, Path]) -> Dict[str, Any]:
28
+ with open(path, "r", encoding="utf-8") as f:
29
+ return json.load(f)
30
+
31
+
32
+ def _write_json(path: Union[str, Path], payload: Dict[str, Any]) -> None:
33
+ with open(path, "w", encoding="utf-8") as f:
34
+ json.dump(payload, f, indent=2, ensure_ascii=False)
35
+
36
+
37
+ def _normalize_image_grid_thw(
38
+ image_grid_thw: Union[torch.Tensor, Sequence[Any]]
39
+ ) -> List[Tuple[int, int, int]]:
40
+ if isinstance(image_grid_thw, torch.Tensor):
41
+ return [tuple(int(v) for v in row.tolist()) for row in image_grid_thw]
42
+
43
+ normalized: List[Tuple[int, int, int]] = []
44
+ for item in image_grid_thw:
45
+ if isinstance(item, torch.Tensor):
46
+ normalized.append(tuple(int(v) for v in item.tolist()))
47
+ else:
48
+ normalized.append(tuple(int(v) for v in item))
49
+ return normalized
50
+
51
+
52
def build_vision_encoder_export_config(
    full_config: Union[PaddleOCRVLConfig, Dict[str, Any]]
) -> Dict[str, Any]:
    """Build the export config dict for the combined vision encoder.

    ``full_config`` is either a ``PaddleOCRVLConfig`` (converted with
    ``to_dict()``) or a plain dict (shallow-copied). The result embeds the
    vision sub-config, the projector sizes, the weight prefixes and the
    complete source config under ``"full_model_config"``.

    Raises:
        KeyError: if ``"vision_config"`` or ``"hidden_size"`` is missing.
    """
    if isinstance(full_config, PaddleOCRVLConfig):
        source = full_config.to_dict()
    else:
        source = dict(full_config)

    vision_section = dict(source["vision_config"])
    source_model_type = source.get("model_type", "paddleocr_vl")
    text_width = source["hidden_size"]

    projector_section = {
        "merge_kernel_size": [2, 2],
        "input_hidden_size": vision_section["hidden_size"],
        "output_hidden_size": text_width,
    }

    export: Dict[str, Any] = {
        "model_type": "paddleocr_vl_vision_encoder",
        "architectures": ["PaddleOCRVLVisionEncoder"],
        "source_model_type": source_model_type,
        "source_architecture": "PaddleOCRVLForConditionalGeneration",
        "text_hidden_size": text_width,
        "image_token_id": source.get("image_token_id"),
        "vision_start_token_id": source.get("vision_start_token_id"),
        "vision_end_token_id": source.get("vision_end_token_id"),
        "torch_dtype": source.get("torch_dtype"),
        "vision_config": vision_section,
        "projector": projector_section,
        "required_weight_prefixes": [
            STANDALONE_VISUAL_PREFIX,
            STANDALONE_PROJECTOR_PREFIX,
        ],
        "source_weight_prefixes": {
            "visual": FULL_VISUAL_PREFIX,
            "projector": FULL_PROJECTOR_PREFIX,
        },
        "full_model_config": source,
    }
    return export
88
+
89
+
90
def build_vision_tower_export_config(
    full_config: Union[PaddleOCRVLConfig, Dict[str, Any]]
) -> Dict[str, Any]:
    """Build the export config dict for the standalone vision tower.

    It is derived from the combined vision-encoder export config and keeps
    only the entries relevant to the ``visual`` weights.
    """
    encoder_cfg = build_vision_encoder_export_config(full_config)

    tower_cfg: Dict[str, Any] = {
        "model_type": "paddleocr_vl_vision_tower",
        "architectures": ["PaddleOCRVLVisionTower"],
    }
    tower_cfg["torch_dtype"] = encoder_cfg.get("torch_dtype")
    tower_cfg["vision_config"] = encoder_cfg["vision_config"]
    tower_cfg["required_weight_prefixes"] = [STANDALONE_VISUAL_PREFIX]
    tower_cfg["source_weight_prefixes"] = {"visual": FULL_VISUAL_PREFIX}
    tower_cfg["full_model_config"] = encoder_cfg["full_model_config"]
    return tower_cfg
103
+
104
+
105
def build_projector_export_config(
    full_config: Union[PaddleOCRVLConfig, Dict[str, Any]]
) -> Dict[str, Any]:
    """Build the export config dict for the standalone projector.

    It is derived from the combined vision-encoder export config and keeps
    the vision sub-config, the text hidden size and the projector sizes.
    """
    encoder_cfg = build_vision_encoder_export_config(full_config)

    projector_cfg: Dict[str, Any] = {
        "model_type": "paddleocr_vl_projector",
        "architectures": ["PaddleOCRVLProjector"],
    }
    projector_cfg["torch_dtype"] = encoder_cfg.get("torch_dtype")
    # These three entries are mandatory in the combined config.
    for required_key in ("vision_config", "text_hidden_size", "projector"):
        projector_cfg[required_key] = encoder_cfg[required_key]
    projector_cfg["required_weight_prefixes"] = [STANDALONE_PROJECTOR_PREFIX]
    projector_cfg["source_weight_prefixes"] = {"projector": FULL_PROJECTOR_PREFIX}
    projector_cfg["full_model_config"] = encoder_cfg["full_model_config"]
    return projector_cfg
120
+
121
+
122
def remap_full_model_state_dict_to_vision_encoder_parts(
    full_state_dict: Dict[str, torch.Tensor]
) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], Dict[str, List[str]]]:
    """Split a full-model state dict into vision-tower and projector parts.

    Keys starting with ``FULL_VISUAL_PREFIX`` are re-prefixed with
    ``STANDALONE_VISUAL_PREFIX``; keys starting with ``FULL_PROJECTOR_PREFIX``
    are re-prefixed with ``STANDALONE_PROJECTOR_PREFIX``. All other keys are
    ignored.

    Returns:
        ``(visual_state_dict, projector_state_dict, consumed)`` where
        ``consumed`` maps ``"visual"`` / ``"projector"`` to the sorted list of
        original keys that were used.

    Raises:
        ValueError: if no visual keys or no projector keys were found.
    """
    visual_part: Dict[str, torch.Tensor] = {}
    projector_part: Dict[str, torch.Tensor] = {}
    visual_sources: List[str] = []
    projector_sources: List[str] = []

    for name, tensor in full_state_dict.items():
        if name.startswith(FULL_VISUAL_PREFIX):
            suffix = name[len(FULL_VISUAL_PREFIX) :]
            visual_part[STANDALONE_VISUAL_PREFIX + suffix] = tensor
            visual_sources.append(name)
            continue
        if name.startswith(FULL_PROJECTOR_PREFIX):
            suffix = name[len(FULL_PROJECTOR_PREFIX) :]
            projector_part[STANDALONE_PROJECTOR_PREFIX + suffix] = tensor
            projector_sources.append(name)

    if len(visual_sources) == 0:
        raise ValueError("No visual.* weights were found in the full model state dict.")
    if len(projector_sources) == 0:
        raise ValueError("No mlp_AR.* weights were found in the full model state dict.")

    consumed = {
        "visual": sorted(visual_sources),
        "projector": sorted(projector_sources),
    }
    return visual_part, projector_part, consumed
149
+
150
+
151
def remap_full_model_state_dict_to_vision_encoder(
    full_state_dict: Dict[str, torch.Tensor]
) -> Tuple[Dict[str, torch.Tensor], Dict[str, List[str]]]:
    """Return one merged state dict (visual entries first, then projector).

    Also returns the mapping of consumed original keys produced by
    ``remap_full_model_state_dict_to_vision_encoder_parts``.
    """
    parts = remap_full_model_state_dict_to_vision_encoder_parts(full_state_dict)
    visual_part, projector_part, consumed = parts
    # Later entries win on key clashes, matching sequential ``dict.update``.
    merged = {**visual_part, **projector_part}
    return merged, consumed
161
+
162
+
163
def _load_safetensors_state_dict(path: Union[str, Path]) -> Dict[str, torch.Tensor]:
    """Load a state dict from the safetensors file at ``path``.

    Raises:
        RuntimeError: if the ``safetensors`` package cannot be imported.
    """
    # Imported lazily so the module can be imported without safetensors.
    try:
        from safetensors.torch import load_file
    except ImportError as e:
        message = "Loading safetensors requires the `safetensors` package to be installed."
        raise RuntimeError(message) from e

    tensors = load_file(str(path))
    return tensors
172
+
173
+
174
def _save_safetensors_state_dict(
    state_dict: Dict[str, torch.Tensor], path: Union[str, Path]
) -> None:
    """Write ``state_dict`` to ``path`` in safetensors format.

    Raises:
        RuntimeError: if the ``safetensors`` package cannot be imported.
    """
    # Imported lazily so the module can be imported without safetensors.
    try:
        from safetensors.torch import save_file
    except ImportError as e:
        message = "Saving safetensors requires the `safetensors` package to be installed."
        raise RuntimeError(message) from e

    destination = str(path)
    save_file(state_dict, destination)
185
+
186
+
187
def extract_and_save_vision_encoder_artifacts(
    full_config: Union[PaddleOCRVLConfig, Dict[str, Any]],
    full_state_dict: Dict[str, torch.Tensor],
    output_dir: Union[str, Path],
) -> Dict[str, Any]:
    """Export the split vision artifacts of a full model to ``output_dir``.

    Writes the vision-tower and projector weights/configs into
    ``output_dir`` and a merged copy (``vision_encoder.safetensors`` plus
    ``vision_encoder_config.json``) into ``output_dir/combined``.

    Returns:
        A metadata dict with the written paths, the tensor counts and the
        full-model keys that were consumed.
    """
    root = Path(output_dir)
    root.mkdir(parents=True, exist_ok=True)

    tower_config_path = root / VISION_TOWER_CONFIG_NAME
    tower_weights_path = root / VISION_TOWER_WEIGHTS_NAME
    projector_config_path = root / PROJECTOR_CONFIG_NAME
    projector_weights_path = root / PROJECTOR_WEIGHTS_NAME

    tower_config = build_vision_tower_export_config(full_config)
    projector_config = build_projector_export_config(full_config)
    # Raises before anything is written if either weight group is missing.
    visual_state_dict, projector_state_dict, consumed = (
        remap_full_model_state_dict_to_vision_encoder_parts(full_state_dict)
    )

    # Standalone artifacts: weights first, then the matching config.
    _save_safetensors_state_dict(visual_state_dict, tower_weights_path)
    _write_json(tower_config_path, tower_config)
    _save_safetensors_state_dict(projector_state_dict, projector_weights_path)
    _write_json(projector_config_path, projector_config)

    # Combined artifact: visual entries followed by projector entries.
    combined_config = build_vision_encoder_export_config(full_config)
    combined_state_dict = {**visual_state_dict, **projector_state_dict}
    combined_dir = root / "combined"
    combined_dir.mkdir(parents=True, exist_ok=True)
    combined_weights_path = combined_dir / "vision_encoder.safetensors"
    combined_config_path = combined_dir / "vision_encoder_config.json"
    _save_safetensors_state_dict(combined_state_dict, combined_weights_path)
    _write_json(combined_config_path, combined_config)

    return {
        "vision_tower_config_path": str(tower_config_path),
        "vision_tower_weights_path": str(tower_weights_path),
        "projector_config_path": str(projector_config_path),
        "projector_weights_path": str(projector_weights_path),
        "combined_config_path": str(combined_config_path),
        "combined_weights_path": str(combined_weights_path),
        "num_exported_visual_tensors": len(visual_state_dict),
        "num_exported_projector_tensors": len(projector_state_dict),
        "consumed_full_model_keys": consumed,
    }
232
+
233
+
234
class PaddleOCRVLVisionTower(torch.nn.Module):
    """Standalone wrapper around the PaddleOCR-VL vision model.

    The wrapped ``PaddleOCRVisionModel`` lives in ``self.visual`` so that its
    parameters carry the ``visual.`` prefix used by the exported weights.
    """

    def __init__(self, config: PaddleOCRVLConfig):
        super().__init__()
        self.config = config
        self.visual = PaddleOCRVisionModel(config.vision_config)
        self.export_config = build_vision_tower_export_config(config)

    @staticmethod
    def _resolve_full_config(config_payload: Dict[str, Any]) -> PaddleOCRVLConfig:
        """Build a full ``PaddleOCRVLConfig`` from a config payload.

        A vision-tower export config is unwrapped to its embedded
        ``"full_model_config"``; any other payload is used as-is.
        """
        if config_payload.get("model_type") == "paddleocr_vl_vision_tower":
            config_payload = config_payload["full_model_config"]
        return PaddleOCRVLConfig(**config_payload)

    @classmethod
    def from_pretrained(cls, model_dir: Union[str, Path]) -> "PaddleOCRVLVisionTower":
        """Load the vision tower from ``model_dir``.

        Prefers the standalone ``vision_tower_config.json`` /
        ``vision_tower.safetensors`` files and falls back to the full-model
        ``config.json`` / ``model.safetensors`` when they are absent.

        Raises:
            RuntimeError: raised by ``load_state_dict`` when the weights do
                not match the module (missing or unexpected keys).
        """
        model_dir = Path(model_dir)
        config_path = model_dir / VISION_TOWER_CONFIG_NAME
        weights_path = model_dir / VISION_TOWER_WEIGHTS_NAME

        if config_path.exists():
            config_payload = _read_json(config_path)
        else:
            config_payload = _read_json(model_dir / FULL_MODEL_CONFIG_NAME)
        model = cls(cls._resolve_full_config(config_payload))

        if weights_path.exists():
            state_dict = _load_safetensors_state_dict(weights_path)
        else:
            full_state_dict = _load_safetensors_state_dict(model_dir / FULL_MODEL_WEIGHTS_NAME)
            state_dict, _, _ = remap_full_model_state_dict_to_vision_encoder_parts(
                full_state_dict
            )

        # strict=True already raises RuntimeError on any missing/unexpected
        # key, so no additional check of the returned key lists is needed.
        model.load_state_dict(state_dict, strict=True)
        return model

    def save_pretrained(self, output_dir: Union[str, Path]) -> None:
        """Write the tower weights and export config into ``output_dir``."""
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        _save_safetensors_state_dict(self.state_dict(), output_dir / VISION_TOWER_WEIGHTS_NAME)
        _write_json(output_dir / VISION_TOWER_CONFIG_NAME, self.export_config)

    @staticmethod
    def _build_visual_inputs(
        pixel_values: torch.Tensor,
        image_grid_thw: List[Tuple[int, int, int]],
        device: torch.device,
    ) -> Tuple[
        torch.Tensor,
        torch.Tensor,
        List[Tuple[int, int, int]],
        torch.Tensor,
        torch.Tensor,
    ]:
        """Derive the auxiliary index tensors the vision model is called with.

        Returns ``(pixel_values_5d, position_ids, image_grid_thw,
        sample_indices, cu_seqlens)``. Per image, position ids restart every
        ``h * w`` entries, ``sample_indices`` holds the image index for each
        of its ``t * h * w`` entries, and ``cu_seqlens`` is the running total
        of those counts (starting at 0).

        Raises:
            ValueError: if ``pixel_values`` is neither 4-D nor 5-D.
        """
        if pixel_values.dim() == 4:
            pixel_values = pixel_values.unsqueeze(0)
        elif pixel_values.dim() != 5:
            raise ValueError(
                "pixel_values must have shape [num_patches, C, H, W] or [1, num_patches, C, H, W]."
            )

        siglip_position_ids = []
        sample_indices = []
        cu_seqlens = [0]

        for idx, thw in enumerate(image_grid_thw):
            numel = int(np.prod(thw))
            image_position_ids = torch.arange(numel, device=device) % int(np.prod(thw[1:]))
            siglip_position_ids.append(image_position_ids)
            sample_indices.append(torch.full((numel,), idx, dtype=torch.int64, device=device))
            cu_seqlens.append(cu_seqlens[-1] + numel)

        if siglip_position_ids:
            siglip_position_ids = torch.cat(siglip_position_ids, dim=0)
            sample_indices = torch.cat(sample_indices, dim=0)
        else:
            # No images: keep tensor types so downstream code sees tensors.
            siglip_position_ids = torch.empty(0, dtype=torch.long, device=device)
            sample_indices = torch.empty(0, dtype=torch.long, device=device)

        cu_seqlens_tensor = torch.tensor(cu_seqlens, dtype=torch.int32, device=device)
        return pixel_values, siglip_position_ids, image_grid_thw, sample_indices, cu_seqlens_tensor

    def forward(
        self,
        pixel_values: torch.Tensor,
        image_grid_thw: Union[torch.Tensor, Sequence[Any]],
    ) -> Dict[str, Any]:
        """Run the vision model on pre-processed patches.

        Args:
            pixel_values: 4-D or 5-D patch tensor (see ``_build_visual_inputs``).
            image_grid_thw: one ``(t, h, w)`` triple per image.

        Returns:
            Dict with ``visual_embeds`` (the model's ``last_hidden_state``),
            the normalized ``image_grid_thw`` and the index tensors that were
            passed to the model.
        """
        image_grid_thw_list = _normalize_image_grid_thw(image_grid_thw)

        # Move the input to the model's device as well as its dtype: the image
        # processor returns CPU tensors, which would otherwise fail as soon as
        # the module lives on an accelerator.
        reference_param = next(self.visual.parameters())
        pixel_values = pixel_values.to(
            device=reference_param.device, dtype=reference_param.dtype
        )
        device = pixel_values.device

        (
            pixel_values_5d,
            siglip_position_ids,
            image_grid_hws,
            sample_indices,
            cu_seqlens,
        ) = self._build_visual_inputs(pixel_values, image_grid_thw_list, device)

        vision_outputs: BaseModelOutputWithPooling = self.visual(
            pixel_values=pixel_values_5d,
            image_grid_thw=image_grid_hws,
            position_ids=siglip_position_ids,
            vision_return_embed_list=True,
            interpolate_pos_encoding=True,
            sample_indices=sample_indices,
            cu_seqlens=cu_seqlens,
            return_pooler_output=False,
            use_rope=True,
            window_size=-1,
        )
        return {
            "visual_embeds": vision_outputs.last_hidden_state,
            "image_grid_thw": image_grid_thw_list,
            "siglip_position_ids": siglip_position_ids,
            "sample_indices": sample_indices,
            "cu_seqlens": cu_seqlens,
        }

    def encode_images(
        self,
        images: Any,
        image_processor: Optional[PaddleOCRVLImageProcessor] = None,
        **processor_kwargs: Any,
    ) -> Dict[str, Any]:
        """Pre-process ``images`` and run :meth:`forward` on the result.

        When ``image_processor`` is ``None`` a ``PaddleOCRVLImageProcessor``
        is built from the patch/merge sizes of the vision config.
        """
        if image_processor is None:
            vision_config = self.config.vision_config
            image_processor = PaddleOCRVLImageProcessor(
                patch_size=vision_config.patch_size,
                temporal_patch_size=vision_config.temporal_patch_size,
                merge_size=vision_config.spatial_merge_size,
            )
        encoded: BatchFeature = image_processor(
            images=images, return_tensors="pt", **processor_kwargs
        )
        return self.forward(
            pixel_values=encoded["pixel_values"], image_grid_thw=encoded["image_grid_thw"]
        )
372
+
373
+
374
class PaddleOCRVLProjector(torch.nn.Module):
    """Standalone wrapper around the PaddleOCR-VL ``Projector``.

    The wrapped module lives in ``self.projector`` so that its parameters
    carry the ``projector.`` prefix used by the exported weights.
    """

    def __init__(self, config: PaddleOCRVLConfig):
        super().__init__()
        self.config = config
        self.projector = Projector(config, config.vision_config)
        self.export_config = build_projector_export_config(config)

    @staticmethod
    def _resolve_full_config(config_payload: Dict[str, Any]) -> PaddleOCRVLConfig:
        """Build a full ``PaddleOCRVLConfig`` from a config payload.

        A projector export config is unwrapped to its embedded
        ``"full_model_config"``; any other payload is used as-is.
        """
        if config_payload.get("model_type") == "paddleocr_vl_projector":
            config_payload = config_payload["full_model_config"]
        return PaddleOCRVLConfig(**config_payload)

    @classmethod
    def from_pretrained(cls, model_dir: Union[str, Path]) -> "PaddleOCRVLProjector":
        """Load the projector from ``model_dir``.

        Prefers the standalone ``projector_config.json`` /
        ``projector.safetensors`` files and falls back to the full-model
        ``config.json`` / ``model.safetensors`` when they are absent.

        Raises:
            RuntimeError: raised by ``load_state_dict`` when the weights do
                not match the module (missing or unexpected keys).
        """
        model_dir = Path(model_dir)
        config_path = model_dir / PROJECTOR_CONFIG_NAME
        weights_path = model_dir / PROJECTOR_WEIGHTS_NAME

        if config_path.exists():
            config_payload = _read_json(config_path)
        else:
            config_payload = _read_json(model_dir / FULL_MODEL_CONFIG_NAME)

        model = cls(cls._resolve_full_config(config_payload))

        if weights_path.exists():
            state_dict = _load_safetensors_state_dict(weights_path)
        else:
            full_state_dict = _load_safetensors_state_dict(model_dir / FULL_MODEL_WEIGHTS_NAME)
            _, state_dict, _ = remap_full_model_state_dict_to_vision_encoder_parts(
                full_state_dict
            )

        # strict=True already raises RuntimeError on any missing/unexpected
        # key, so no additional check of the returned key lists is needed.
        model.load_state_dict(state_dict, strict=True)
        return model

    def save_pretrained(self, output_dir: Union[str, Path]) -> None:
        """Write the projector weights and export config into ``output_dir``."""
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        _save_safetensors_state_dict(self.state_dict(), output_dir / PROJECTOR_WEIGHTS_NAME)
        _write_json(output_dir / PROJECTOR_CONFIG_NAME, self.export_config)

    def forward(
        self,
        visual_embeds: Union[torch.Tensor, List[torch.Tensor], Tuple[torch.Tensor, ...]],
        image_grid_thw: Union[torch.Tensor, Sequence[Any]],
    ) -> Dict[str, Any]:
        """Project vision-tower embeddings.

        Returns:
            Dict with ``image_embeds`` (the raw projector output),
            ``concat_image_embeds`` (the outputs concatenated along dim 0, or
            an empty ``(0, hidden_size)`` tensor when there are none) and the
            normalized ``image_grid_thw``.
        """
        image_grid_thw_list = _normalize_image_grid_thw(image_grid_thw)
        image_embeds = self.projector(visual_embeds, image_grid_thw_list)

        if isinstance(image_embeds, torch.Tensor):
            # NOTE(review): ``Projector`` is defined in another module; if it
            # returns a bare tensor for tensor input, the truthiness test
            # below would raise for multi-element tensors, so use it as-is.
            concat_image_embeds = image_embeds
        elif image_embeds:
            concat_image_embeds = torch.cat(image_embeds, dim=0)
        else:
            # No images: return an empty tensor that matches the projector's
            # device and dtype.
            reference_param = next(self.projector.parameters())
            concat_image_embeds = torch.empty(
                0,
                self.config.hidden_size,
                device=reference_param.device,
                dtype=reference_param.dtype,
            )

        return {
            "image_embeds": image_embeds,
            "concat_image_embeds": concat_image_embeds,
            "image_grid_thw": image_grid_thw_list,
        }
445
+
446
class PaddleOCRVLVisionEncoder(torch.nn.Module):
    """Vision tower followed by the projector.

    Chains ``PaddleOCRVLVisionTower`` and ``PaddleOCRVLProjector`` so that a
    single call maps pre-processed patches (or raw images via
    :meth:`encode_images`) to projected image embeddings.
    """

    def __init__(
        self,
        config: PaddleOCRVLConfig,
        vision_tower: Optional["PaddleOCRVLVisionTower"] = None,
        projector: Optional["PaddleOCRVLProjector"] = None,
    ):
        """Create the encoder.

        Args:
            config: full model configuration.
            vision_tower: optional pre-built tower; a fresh one is created
                from ``config`` when omitted.
            projector: optional pre-built projector; a fresh one is created
                from ``config`` when omitted.
        """
        super().__init__()
        self.config = config
        self.vision_tower = (
            vision_tower if vision_tower is not None else PaddleOCRVLVisionTower(config)
        )
        self.projector = (
            projector if projector is not None else PaddleOCRVLProjector(config)
        )
        self.export_config = build_vision_encoder_export_config(config)

    @classmethod
    def from_pretrained(cls, model_dir: Union[str, Path]) -> "PaddleOCRVLVisionEncoder":
        """Load tower and projector from ``model_dir``.

        The encoder config is resolved from the first existing file among
        ``config.json``, ``vision_tower_config.json`` and
        ``projector_config.json``.

        Raises:
            FileNotFoundError: if none of the candidate config files exists.
        """
        model_dir = Path(model_dir)
        config_candidates = [
            model_dir / FULL_MODEL_CONFIG_NAME,
            model_dir / VISION_TOWER_CONFIG_NAME,
            model_dir / PROJECTOR_CONFIG_NAME,
        ]
        config_path = next((path for path in config_candidates if path.exists()), None)
        if config_path is None:
            raise FileNotFoundError(
                "Could not find config.json, vision_tower_config.json, or projector_config.json."
            )

        config_payload = _read_json(config_path)
        if config_payload.get("model_type") == "paddleocr_vl_vision_tower":
            config = PaddleOCRVLVisionTower._resolve_full_config(config_payload)
        else:
            # Covers both a projector export config (unwrapped by the
            # resolver) and a plain full-model config (used as-is).
            config = PaddleOCRVLProjector._resolve_full_config(config_payload)

        # Hand the loaded sub-modules to the constructor so no randomly
        # initialised tower/projector is built only to be thrown away.
        return cls(
            config,
            vision_tower=PaddleOCRVLVisionTower.from_pretrained(model_dir),
            projector=PaddleOCRVLProjector.from_pretrained(model_dir),
        )

    def _project(self, vision_outputs: Dict[str, Any]) -> Dict[str, Any]:
        """Run the projector on tower outputs and merge both result dicts."""
        projector_outputs = self.projector(
            visual_embeds=vision_outputs["visual_embeds"],
            image_grid_thw=vision_outputs["image_grid_thw"],
        )
        # Projector entries win on key clashes (e.g. ``image_grid_thw``).
        return {**vision_outputs, **projector_outputs}

    def forward(
        self,
        pixel_values: torch.Tensor,
        image_grid_thw: Union[torch.Tensor, Sequence[Any]],
    ) -> Dict[str, Any]:
        """Encode pre-processed patches; returns tower and projector outputs merged."""
        vision_outputs = self.vision_tower(
            pixel_values=pixel_values,
            image_grid_thw=image_grid_thw,
        )
        return self._project(vision_outputs)

    def encode_images(
        self,
        images: Any,
        image_processor: Optional[PaddleOCRVLImageProcessor] = None,
        **processor_kwargs: Any,
    ) -> Dict[str, Any]:
        """Pre-process raw ``images`` with the tower's processor, then project."""
        vision_outputs = self.vision_tower.encode_images(
            images=images,
            image_processor=image_processor,
            **processor_kwargs,
        )
        return self._project(vision_outputs)
model/image_processing_paddleocr_vl.py ADDED
@@ -0,0 +1,569 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Image processor class for PaddleOCR-VL."""
16
+
17
+ import math
18
+ from typing import Dict, List, Optional, Union
19
+
20
+ import numpy as np
21
+ import torch
22
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
23
+ from torchvision.transforms import functional as TF
24
+ from transformers.image_transforms import (
25
+ convert_to_rgb,
26
+ resize,
27
+ to_channel_dimension_format,
28
+ )
29
+ from transformers.image_utils import (
30
+ OPENAI_CLIP_MEAN,
31
+ OPENAI_CLIP_STD,
32
+ ChannelDimension,
33
+ PILImageResampling,
34
+ get_image_size,
35
+ infer_channel_dimension_format,
36
+ is_scaled_image,
37
+ is_valid_image,
38
+ make_list_of_images,
39
+ to_numpy_array,
40
+ valid_images,
41
+ validate_preprocess_arguments,
42
+ )
43
+ from transformers.utils import TensorType, is_vision_available, logging
44
+
45
+
46
logger = logging.get_logger(__name__)


if is_vision_available():
    # PIL is imported only when transformers reports vision support.
    from PIL import Image

# Accepted image inputs: a single image or a flat list of images.
ImageInput = Union[
    "PIL.Image.Image",
    np.ndarray,
    "torch.Tensor",
    List["PIL.Image.Image"],
    List[np.ndarray],
    List["torch.Tensor"],
]  # noqa


# Accepted video inputs: a frame sequence/array, or a batch of those.
VideoInput = Union[
    List["PIL.Image.Image"],
    "np.ndarray",
    "torch.Tensor",
    List["np.ndarray"],
    List["torch.Tensor"],
    List[List["PIL.Image.Image"]],
    List[List["np.ndarray"]],  # fixed misspelled forward reference "np.ndarrray"
    List[List["torch.Tensor"]],
]  # noqa
72
+
73
+
74
def make_batched_images(images) -> List[ImageInput]:
    """
    Accepts images in list or nested list format, and makes a flat list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image(s).

    Returns:
        list: A flat list of images. A nested list is flattened, a flat list/tuple is returned as-is,
        and a single image is wrapped in a one-element list.

    Raises:
        ValueError: If `images` is empty or is not a (nested) list of valid images / a single valid image.
    """
    is_sequence = isinstance(images, (list, tuple))

    # An empty sequence used to fail with IndexError on `images[0]`; report it
    # through the same ValueError as every other unsupported input.
    if is_sequence and len(images) == 0:
        raise ValueError(f"Could not make batched images from {images}")

    if (
        is_sequence
        and isinstance(images[0], (list, tuple))
        and len(images[0]) > 0  # guard `images[0][0]` against an empty inner list
        and is_valid_image(images[0][0])
    ):
        return [img for img_list in images for img in img_list]

    if is_sequence and is_valid_image(images[0]):
        return images

    if is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
99
+
100
+
101
def adjust_size(size, patch_size):
    """Round `size` down to a multiple of `patch_size` spanning an even number of patches.

    Args:
        size: Length along one image dimension.
        patch_size: Edge length of one patch.

    Returns:
        The largest value not exceeding `size` that equals an even patch
        count times `patch_size`.
    """
    patch_count = size // patch_size
    # Drop one patch when the count is odd so the result is always even.
    even_count = patch_count - 1 if patch_count % 2 else patch_count
    return even_count * patch_size
106
+
107
+
108
def make_batched_videos(videos) -> List[VideoInput]:
    """
    Normalizes video input into a list of videos, each being a sequence of frames.

    Args:
        videos (`VideoInput`):
            A nested list of frames (returned unchanged), a flat list of PIL frames (treated as one video),
            a list of 4-dimensional arrays (one video per array), or a single 4-dimensional array.

    Returns:
        list: A list of videos.

    Raises:
        ValueError: If `videos` is empty or has an unsupported structure.
    """
    is_sequence = isinstance(videos, (list, tuple))

    # An empty sequence used to fail with IndexError on `videos[0]`; report it
    # through the same ValueError as every other unsupported input.
    if is_sequence and len(videos) == 0:
        raise ValueError(f"Could not make batched video from {videos}")

    if (
        is_sequence
        and isinstance(videos[0], (list, tuple))
        and len(videos[0]) > 0  # guard `videos[0][0]` against an empty inner list
        and is_valid_image(videos[0][0])
    ):
        return videos

    if is_sequence and is_valid_image(videos[0]):
        if isinstance(videos[0], Image.Image):
            # A flat list of PIL frames is a single video.
            return [videos]
        if len(videos[0].shape) == 4:
            # Each 4-D array is one video; split it into its frames.
            return [list(video) for video in videos]
        # Any other list of valid images is unsupported and falls through to the error below.

    elif is_valid_image(videos) and len(videos.shape) == 4:
        return [list(videos)]

    raise ValueError(f"Could not make batched video from {videos}")
126
+
127
+
128
def smart_resize(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 28 * 28 * 130,
    max_pixels: int = 28 * 28 * 1280,
):
    """Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.

    2. The total number of pixels is within the range ['min_pixels', 'max_pixels']
       (each side is never reduced below 'factor', even if that exceeds 'max_pixels').

    3. The aspect ratio of the image is maintained as closely as possible.

    Args:
        height: Original image height in pixels; must be positive.
        width: Original image width in pixels; must be positive.
        factor: Value both output dimensions must be a multiple of.
        min_pixels: Lower bound on `h_bar * w_bar`.
        max_pixels: Upper bound on `h_bar * w_bar`.

    Returns:
        Tuple `(h_bar, w_bar)` with the target height and width.

    Raises:
        ValueError: If a dimension is not positive, or the aspect ratio
            (after small sides are raised to `factor`) exceeds 200.
    """
    if height <= 0 or width <= 0:
        # Previously surfaced as a ZeroDivisionError further down.
        raise ValueError(
            f"height and width must be positive, got height={height}, width={width}"
        )

    # A side shorter than `factor` is raised to `factor`; the other side is
    # scaled by the same ratio to keep the aspect ratio.
    if height < factor:
        logger.warning(
            f"smart_resize: height={height} < factor={factor}, reset height=factor"
        )
        width = round((width * factor) / height)
        height = factor

    if width < factor:
        logger.warning(
            f"smart_resize: width={width} < factor={factor}, reset width=factor"
        )
        height = round((height * factor) / width)
        width = factor

    if max(height, width) / min(height, width) > 200:
        raise ValueError(
            f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
        )
    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        # Clamp to `factor`: flooring alone can yield 0 for elongated images
        # when `max_pixels` is small, which is not a valid resize target.
        h_bar = max(factor, math.floor(height / beta / factor) * factor)
        w_bar = max(factor, math.floor(width / beta / factor) * factor)
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return h_bar, w_bar
174
+
175
+
176
class PaddleOCRVLImageProcessor(BaseImageProcessor):
    r"""
    Constructs a PaddleOCR-VL image processor that dynamically resizes images based on the original images.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use when resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
        image_std (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        min_pixels (`int`, *optional*, defaults to `28 * 28 * 130`):
            The min pixels of the image to resize the image.
        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
            The max pixels of the image to resize the image.
        patch_size (`int`, *optional*, defaults to 14):
            The spatial patch size of the vision encoder.
        temporal_patch_size (`int`, *optional*, defaults to 1):
            The temporal patch size of the vision encoder. `_preprocess` only supports the value 1.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
    """

    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 28 * 28 * 130,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_patch_size: int = 1,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        """Store the preprocessing defaults; see the class docstring for each argument."""
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}  # not used
        self.do_convert_rgb = do_convert_rgb

    def mvit_rescale(self, image: Image.Image, merge_size: int = 2) -> Image.Image:
        """Downscale, then pad or crop a PIL image to patch-aligned dimensions.

        NOTE(review): reads `self.in_token_limit` and `self.pad_input`, which
        `__init__` does not set explicitly; presumably they arrive through
        `**kwargs` / the saved processor config — confirm before relying on
        this method.

        Args:
            image: Object exposing a `(width, height)` `size` attribute and `resize`.
            merge_size: Number of patches per side that the padded size must be a multiple of.

        Returns:
            The rescaled and padded/cropped image.

        Raises:
            ValueError: If `image.size` cannot be unpacked into `(width, height)`.
        """
        try:
            w, h = image.size
        except (AttributeError, TypeError, ValueError) as exc:
            # Only failures of the size lookup/unpacking are translated.
            raise ValueError(str((type(image), image))) from exc
        patch_size = self.patch_size

        # Shrink the image when its patch count exceeds the token budget.
        if (w // patch_size) * (h // patch_size) > self.in_token_limit:
            scale = math.sqrt(
                self.in_token_limit / ((w // patch_size) * (h // patch_size))
            )
            new_w, new_h = int(w * scale), int(h * scale)

            image = image.resize((new_w, new_h), Image.Resampling.BICUBIC)
        if self.pad_input:
            # Pad up to the next multiple of `merge_size * patch_size`.
            new_w, new_h = image.size
            pad_size_h = merge_size * patch_size
            pad_size_w = merge_size * patch_size

            pad_h = (pad_size_h - new_h % pad_size_h) % pad_size_h
            pad_w = (pad_size_w - new_w % pad_size_w) % pad_size_w

            image = TF.pad(image, (0, 0, pad_w, pad_h))
        else:
            # Crop down to an even number of whole patches per side.
            new_w, new_h = image.size
            new_w = new_w - new_w % patch_size
            new_h = new_h - new_h % patch_size

            new_w = adjust_size(new_w, patch_size)
            new_h = adjust_size(new_h, patch_size)

            image = TF.center_crop(image, (new_h, new_w))

        w, h = image.size
        # Cap each side at 510 patches once either side reaches 512 patches.
        if w // patch_size >= 512 or h // patch_size >= 512:
            new_h = min(patch_size * 510, h)
            new_w = min(patch_size * 510, w)
            image = TF.center_crop(image, (new_h, new_w))
        return image

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        do_resize: Optional[bool] = None,
        resample: Optional[PILImageResampling] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Preprocess one image (or the frames of one video) into flattened patches.

        Unlike `preprocess`, this method does not fall back to the instance defaults:
        an argument left as `None` disables the corresponding step.

        Args:
            images (`ImageInput`):
                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
                The target size is derived from the first image and applied to all of them.
            do_resize (`bool`, *optional*):
                Whether to resize the image.
            resample (`PILImageResampling`, *optional*):
                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
            do_rescale (`bool`, *optional*):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*):
                Scale factor to use if rescaling the image.
            do_normalize (`bool`, *optional*):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*):
                Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            image_std (`float` or `List[float]`, *optional*):
                Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            do_convert_rgb (`bool`, *optional*):
                Whether to convert the image to RGB.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, it is inferred from the first image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            A tuple `(flatten_patches, (grid_t, grid_h, grid_w))` where `flatten_patches` has shape
            `(grid_t * grid_h * grid_w, channel, patch_size, patch_size)`.

        Raises:
            ValueError: If `self.temporal_patch_size` is not 1.
        """
        images = make_list_of_images(images)

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = height, width
        if do_resize:
            # The target size depends only on the first image's dimensions,
            # so compute it once instead of once per frame.
            resized_height, resized_width = smart_resize(
                height,
                width,
                factor=self.patch_size * self.merge_size,
                min_pixels=self.min_pixels,
                max_pixels=self.max_pixels,
            )

        processed_images = []
        for image in images:
            if do_resize:
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    input_data_format=input_data_format,
                )

            if do_rescale:
                image = self.rescale(
                    image, scale=rescale_factor, input_data_format=input_data_format
                )

            if do_normalize:
                image = self.normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    input_data_format=input_data_format,
                )
            image = to_channel_dimension_format(
                image, data_format, input_channel_dim=input_data_format
            )
            processed_images.append(image)

        patches = np.array(processed_images)
        if data_format == ChannelDimension.LAST:
            # Patch extraction below works on channels-first arrays.
            patches = patches.transpose(0, 3, 1, 2)
        if patches.shape[0] == 1:
            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
        channel = patches.shape[1]
        grid_t = patches.shape[0] // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        # Split into (grid_t, t_patch, C, grid_h, P, grid_w, P) and move the
        # grid axes to the front so each patch becomes contiguous.
        patches = patches.reshape(
            grid_t,
            self.temporal_patch_size,
            channel,
            grid_h,
            self.patch_size,
            grid_w,
            self.patch_size,
        )
        patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if self.temporal_patch_size != 1:
            raise ValueError(
                f"temporal_patch_size must be 1, got {self.temporal_patch_size}"
            )
        flatten_patches = patches.reshape(
            grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size
        )
        return flatten_patches, (grid_t, grid_h, grid_w)

    def preprocess(
        self,
        images: ImageInput,
        videos: VideoInput = None,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        resample: Optional[PILImageResampling] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Preprocess images and/or videos into flattened patches plus their patch grids.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`. May be `None` when
                `videos` is given.
            videos (`VideoInput`):
                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Only forwarded to argument validation; the resize target is derived from
                `self.min_pixels` / `self.max_pixels`.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `BatchFeature` with `pixel_values` / `image_grid_thw` when images are given and
            `pixel_values_videos` / `video_grid_thw` when videos are given.

        Raises:
            ValueError: If neither `images` nor `videos` is provided, or if `images` contains an invalid type.
        """
        if images is None and videos is None:
            # Previously this surfaced as an UnboundLocalError at the return.
            raise ValueError("At least one of `images` or `videos` must be provided.")

        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = (
            rescale_factor if rescale_factor is not None else self.rescale_factor
        )
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = (
            do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        )

        if images is not None:
            images = make_batched_images(images)
        if videos is not None:
            videos = make_batched_videos(videos)

        if images is not None and not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        validate_preprocess_arguments(
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_resize=do_resize,
            size=size,
            resample=resample,
        )

        # Identical per-item options for both the image and the video path.
        shared_kwargs = dict(
            do_resize=do_resize,
            resample=resample,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            data_format=data_format,
            do_convert_rgb=do_convert_rgb,
            input_data_format=input_data_format,
        )

        # Accumulate into one dict so image results are not discarded when
        # videos are supplied in the same call.
        data = {}

        if images is not None:
            pixel_values, vision_grid_thws = [], []
            for image in images:
                patches, image_grid_thw = self._preprocess(image, **shared_kwargs)
                pixel_values.extend(patches)
                vision_grid_thws.append(image_grid_thw)
            data.update(
                {
                    "pixel_values": np.array(pixel_values),
                    "image_grid_thw": np.array(vision_grid_thws),
                }
            )

        if videos is not None:
            pixel_values, vision_grid_thws = [], []
            for frames in videos:
                patches, video_grid_thw = self._preprocess(frames, **shared_kwargs)
                pixel_values.extend(patches)
                vision_grid_thws.append(video_grid_thw)
            data.update(
                {
                    "pixel_values_videos": np.array(pixel_values),
                    "video_grid_thw": np.array(vision_grid_thws),
                }
            )

        return BatchFeature(data=data, tensor_type=return_tensors)
model/modeling_paddleocr_vl.py ADDED
The diff for this file is too large to render. See raw diff
 
projector.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8ab6a35716b6b7d79a760b5653de4e7c17bd9146784c11fd92bde20d65e72be
3
+ size 51920952
projector_config.json ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "paddleocr_vl_projector",
3
+ "architectures": [
4
+ "PaddleOCRVLProjector"
5
+ ],
6
+ "torch_dtype": "bfloat16",
7
+ "vision_config": {
8
+ "architectures": [
9
+ "PaddleOCRVisionModel"
10
+ ],
11
+ "attention_dropout": 0.0,
12
+ "auto_map": {
13
+ "AutoConfig": "configuration_paddleocr_vl.PaddleOCRVLConfig",
14
+ "AutoModel": "modeling_paddleocr_vl.PaddleOCRVisionModel"
15
+ },
16
+ "hidden_act": "gelu_pytorch_tanh",
17
+ "hidden_size": 1152,
18
+ "image_size": 384,
19
+ "intermediate_size": 4304,
20
+ "layer_norm_eps": 1e-06,
21
+ "model_type": "paddleocr_vl",
22
+ "num_attention_heads": 16,
23
+ "num_channels": 3,
24
+ "num_hidden_layers": 27,
25
+ "pad_token_id": 0,
26
+ "patch_size": 14,
27
+ "spatial_merge_size": 2,
28
+ "temporal_patch_size": 2,
29
+ "tokens_per_second": 2,
30
+ "torch_dtype": "bfloat16"
31
+ },
32
+ "text_hidden_size": 1024,
33
+ "projector": {
34
+ "merge_kernel_size": [
35
+ 2,
36
+ 2
37
+ ],
38
+ "input_hidden_size": 1152,
39
+ "output_hidden_size": 1024
40
+ },
41
+ "required_weight_prefixes": [
42
+ "projector."
43
+ ],
44
+ "source_weight_prefixes": {
45
+ "projector": "mlp_AR."
46
+ },
47
+ "full_model_config": {
48
+ "architectures": [
49
+ "PaddleOCRVLForConditionalGeneration"
50
+ ],
51
+ "attention_probs_dropout_prob": 0.0,
52
+ "auto_map": {
53
+ "AutoConfig": "configuration_paddleocr_vl.PaddleOCRVLConfig",
54
+ "AutoModel": "modeling_paddleocr_vl.PaddleOCRVLForConditionalGeneration",
55
+ "AutoModelForCausalLM": "modeling_paddleocr_vl.PaddleOCRVLForConditionalGeneration"
56
+ },
57
+ "compression_ratio": 1.0,
58
+ "head_dim": 128,
59
+ "hidden_act": "silu",
60
+ "hidden_dropout_prob": 0.0,
61
+ "hidden_size": 1024,
62
+ "ignored_index": -100,
63
+ "image_token_id": 100295,
64
+ "intermediate_size": 3072,
65
+ "max_position_embeddings": 131072,
66
+ "max_sequence_length": null,
67
+ "model_type": "paddleocr_vl",
68
+ "num_attention_heads": 16,
69
+ "num_hidden_layers": 18,
70
+ "num_key_value_heads": 2,
71
+ "pad_token_id": 0,
72
+ "rms_norm_eps": 1e-05,
73
+ "rope_scaling": {
74
+ "mrope_section": [
75
+ 16,
76
+ 24,
77
+ 24
78
+ ],
79
+ "rope_type": "default",
80
+ "type": "default"
81
+ },
82
+ "rope_theta": 500000,
83
+ "sliding_window": null,
84
+ "tie_word_embeddings": false,
85
+ "torch_dtype": "bfloat16",
86
+ "transformers_version": "4.55.0",
87
+ "use_bias": false,
88
+ "use_cache": false,
89
+ "use_flash_attention": false,
90
+ "video_token_id": 101307,
91
+ "vision_config": {
92
+ "architectures": [
93
+ "PaddleOCRVisionModel"
94
+ ],
95
+ "attention_dropout": 0.0,
96
+ "auto_map": {
97
+ "AutoConfig": "configuration_paddleocr_vl.PaddleOCRVLConfig",
98
+ "AutoModel": "modeling_paddleocr_vl.PaddleOCRVisionModel"
99
+ },
100
+ "hidden_act": "gelu_pytorch_tanh",
101
+ "hidden_size": 1152,
102
+ "image_size": 384,
103
+ "intermediate_size": 4304,
104
+ "layer_norm_eps": 1e-06,
105
+ "model_type": "paddleocr_vl",
106
+ "num_attention_heads": 16,
107
+ "num_channels": 3,
108
+ "num_hidden_layers": 27,
109
+ "pad_token_id": 0,
110
+ "patch_size": 14,
111
+ "spatial_merge_size": 2,
112
+ "temporal_patch_size": 2,
113
+ "tokens_per_second": 2,
114
+ "torch_dtype": "bfloat16"
115
+ },
116
+ "vision_start_token_id": 101305,
117
+ "vision_end_token_id": 101306,
118
+ "vocab_size": 103424,
119
+ "weight_share_add_bias": true,
120
+ "use_3d_rope": true,
121
+ "rope_is_neox_style": true
122
+ }
123
+ }
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ safetensors
4
+ numpy
5
+ Pillow
6
+ torchvision
7
+ einops
vision_tower.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:560ed1b44203e3bb34023750848033d50a0b73fff8c571ffbcae0b5b18a42e5e
3
+ size 932006944
vision_tower_config.json ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "paddleocr_vl_vision_tower",
3
+ "architectures": [
4
+ "PaddleOCRVLVisionTower"
5
+ ],
6
+ "torch_dtype": "bfloat16",
7
+ "vision_config": {
8
+ "architectures": [
9
+ "PaddleOCRVisionModel"
10
+ ],
11
+ "attention_dropout": 0.0,
12
+ "auto_map": {
13
+ "AutoConfig": "configuration_paddleocr_vl.PaddleOCRVLConfig",
14
+ "AutoModel": "modeling_paddleocr_vl.PaddleOCRVisionModel"
15
+ },
16
+ "hidden_act": "gelu_pytorch_tanh",
17
+ "hidden_size": 1152,
18
+ "image_size": 384,
19
+ "intermediate_size": 4304,
20
+ "layer_norm_eps": 1e-06,
21
+ "model_type": "paddleocr_vl",
22
+ "num_attention_heads": 16,
23
+ "num_channels": 3,
24
+ "num_hidden_layers": 27,
25
+ "pad_token_id": 0,
26
+ "patch_size": 14,
27
+ "spatial_merge_size": 2,
28
+ "temporal_patch_size": 2,
29
+ "tokens_per_second": 2,
30
+ "torch_dtype": "bfloat16"
31
+ },
32
+ "required_weight_prefixes": [
33
+ "visual."
34
+ ],
35
+ "source_weight_prefixes": {
36
+ "visual": "visual."
37
+ },
38
+ "full_model_config": {
39
+ "architectures": [
40
+ "PaddleOCRVLForConditionalGeneration"
41
+ ],
42
+ "attention_probs_dropout_prob": 0.0,
43
+ "auto_map": {
44
+ "AutoConfig": "configuration_paddleocr_vl.PaddleOCRVLConfig",
45
+ "AutoModel": "modeling_paddleocr_vl.PaddleOCRVLForConditionalGeneration",
46
+ "AutoModelForCausalLM": "modeling_paddleocr_vl.PaddleOCRVLForConditionalGeneration"
47
+ },
48
+ "compression_ratio": 1.0,
49
+ "head_dim": 128,
50
+ "hidden_act": "silu",
51
+ "hidden_dropout_prob": 0.0,
52
+ "hidden_size": 1024,
53
+ "ignored_index": -100,
54
+ "image_token_id": 100295,
55
+ "intermediate_size": 3072,
56
+ "max_position_embeddings": 131072,
57
+ "max_sequence_length": null,
58
+ "model_type": "paddleocr_vl",
59
+ "num_attention_heads": 16,
60
+ "num_hidden_layers": 18,
61
+ "num_key_value_heads": 2,
62
+ "pad_token_id": 0,
63
+ "rms_norm_eps": 1e-05,
64
+ "rope_scaling": {
65
+ "mrope_section": [
66
+ 16,
67
+ 24,
68
+ 24
69
+ ],
70
+ "rope_type": "default",
71
+ "type": "default"
72
+ },
73
+ "rope_theta": 500000,
74
+ "sliding_window": null,
75
+ "tie_word_embeddings": false,
76
+ "torch_dtype": "bfloat16",
77
+ "transformers_version": "4.55.0",
78
+ "use_bias": false,
79
+ "use_cache": false,
80
+ "use_flash_attention": false,
81
+ "video_token_id": 101307,
82
+ "vision_config": {
83
+ "architectures": [
84
+ "PaddleOCRVisionModel"
85
+ ],
86
+ "attention_dropout": 0.0,
87
+ "auto_map": {
88
+ "AutoConfig": "configuration_paddleocr_vl.PaddleOCRVLConfig",
89
+ "AutoModel": "modeling_paddleocr_vl.PaddleOCRVisionModel"
90
+ },
91
+ "hidden_act": "gelu_pytorch_tanh",
92
+ "hidden_size": 1152,
93
+ "image_size": 384,
94
+ "intermediate_size": 4304,
95
+ "layer_norm_eps": 1e-06,
96
+ "model_type": "paddleocr_vl",
97
+ "num_attention_heads": 16,
98
+ "num_channels": 3,
99
+ "num_hidden_layers": 27,
100
+ "pad_token_id": 0,
101
+ "patch_size": 14,
102
+ "spatial_merge_size": 2,
103
+ "temporal_patch_size": 2,
104
+ "tokens_per_second": 2,
105
+ "torch_dtype": "bfloat16"
106
+ },
107
+ "vision_start_token_id": 101305,
108
+ "vision_end_token_id": 101306,
109
+ "vocab_size": 103424,
110
+ "weight_share_add_bias": true,
111
+ "use_3d_rope": true,
112
+ "rope_is_neox_style": true
113
+ }
114
+ }