aki-0421 committed
Commit 131c541 · verified · 1 Parent(s): 11b05a4

Upload folder using huggingface_hub
.gitignore ADDED
@@ -0,0 +1,9 @@
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ *.pyw
+ *.pyz
+ *.pywz
+ *.pyzw
+ *.pyzwz
README.md CHANGED
@@ -1,3 +1,115 @@
- ---
- license: cc-by-nc-4.0
- ---
+ ---
+ tags:
+ - sentence-transformers
+ - sentence-similarity
+ - feature-extraction
+ pipeline_tag: sentence-similarity
+ library_name: sentence-transformers
+ language:
+ - ja
+ base_model:
+ - cl-nagoya/ruri-v3-310m
+ - Qwen/Qwen2.5-VL-7B-Instruct
+ license: apache-2.0
+ ---
+
+ ### aki-0421/clip-anime-patch400-10k-v1
+
+ This is a CLIP-style model for anime character retrieval. It pairs the Japanese text encoder cl-nagoya/ruri-v3-310m with the vision tower of Qwen/Qwen2.5-VL-7B-Instruct, and it is loaded and used through the sentence-transformers library.
+
+ ### Example
+
+ ```python
+ import math
+ from PIL import Image
+ from sentence_transformers import SentenceTransformer
+
+ def resize_image_for_patch(image: Image.Image, patch_size: int = 14, max_patches: int = 400) -> Image.Image:
+     """Resize `image` so it spans roughly at most `max_patches` patches of `patch_size` x `patch_size` pixels, preserving aspect ratio."""
+     orig_width, orig_height = image.size
+     aspect_ratio = orig_width / orig_height
+
+     if aspect_ratio >= 1:
+         # Landscape or square orientation
+         target_width = patch_size * int(math.floor(math.sqrt(max_patches * aspect_ratio)))
+         target_height = int(target_width / aspect_ratio)
+     else:
+         # Portrait orientation
+         target_height = patch_size * int(math.floor(math.sqrt(max_patches / aspect_ratio)))
+         target_width = int(target_height * aspect_ratio)
+
+     # Ensure dimensions are multiples of patch_size
+     target_width -= target_width % patch_size
+     target_height -= target_height % patch_size
+
+     return image.resize((target_width, target_height), Image.BICUBIC)
+
+ # Init model from a local checkout of this repository
+ model = SentenceTransformer("./", device="cuda")
+
+ # Replace the path below with your own image
+ images = [
+     resize_image_for_patch(Image.open("/home/aki0421/Share/images/00085.png"))
+ ]
+ image_embeddings = model.encode(images, convert_to_tensor=True)
+
+ # Japanese text queries to score against the image
+ sentences = [
+     "女の子が悲しんでいる。",
+     "落ち込んでる人",
+     "泣いている",
+     "笑っている",
+     "ピンクの髪の女の子",
+     "赤い髪の女の子",
+     "茶色の髪の女の子",
+     "赤い目",
+     "青い目",
+     "曇っている",
+     "雨が降っている",
+     "晴れている",
+     "キッチンにいます。",
+     "学校にいる",
+     "魔法少女のようだ",
+     "戦闘しますか?",
+     "男性ですか?",
+     "茶色い髪の女の子が悲しんでいるシーン",
+     "ピンクの髪の女の子が笑っているシーン"
+ ]
+ text_embeddings = model.encode(sentences, convert_to_tensor=True)
+ similarities = model.similarity(text_embeddings, image_embeddings)
+
+ print(similarities)
+ ```
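+
+ As an illustrative follow-up (not part of the original script), you can rank the queries against the image directly from the similarity matrix; with the call order above, `similarities` has shape `[len(sentences), len(images)]`:
+
+ ```python
+ import torch
+
+ # Scores of every query against the first (and only) image
+ scores = similarities[:, 0]
+ top = torch.topk(scores, k=5)
+ for score, idx in zip(top.values.tolist(), top.indices.tolist()):
+     print(f"{score:.3f}  {sentences[idx]}")
+ ```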
+
+ ### Citation
+
+ ```bibtex
+ @misc{qwen2.5-VL,
+   title = {Qwen2.5-VL},
+   url = {https://qwenlm.github.io/blog/qwen2.5-vl/},
+   author = {Qwen Team},
+   month = {January},
+   year = {2025}
+ }
+
+ @misc{Ruri,
+   title = {{Ruri: Japanese General Text Embeddings}},
+   author = {Hayato Tsukagoshi and Ryohei Sasano},
+   year = {2024},
+   eprint = {2409.07737},
+   archivePrefix = {arXiv},
+   primaryClass = {cs.CL},
+   url = {https://arxiv.org/abs/2409.07737}
+ }
+
+ @misc{oshizo2024clipqwen,
+   author = {Oshizo},
+   title = {japanese-clip-qwen2\_vl},
+   year = {2024},
+   howpublished = {\url{https://github.com/oshizo/japanese-clip-qwen2_vl}},
+   note = {Accessed: 2025-06-08}
+ }
+ ```
added_tokens.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
chat_template.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
3
+ }
config.json ADDED
@@ -0,0 +1,80 @@
1
+ {
2
+ "architectures": [
3
+ "CLIPQwenVLModel"
4
+ ],
5
+ "logit_scale_init_value": 0.5,
6
+ "model_type": "clip_qwen_vl",
7
+ "projection_dim": 768,
8
+ "text_config": {
9
+ "_attn_implementation_autoset": true,
10
+ "_name_or_path": "cl-nagoya/ruri-v3-310m",
11
+ "architectures": [
12
+ "ModernBertModel"
13
+ ],
14
+ "attention_bias": false,
15
+ "attention_dropout": 0.0,
16
+ "bos_token_id": 1,
17
+ "classifier_activation": "gelu",
18
+ "classifier_bias": false,
19
+ "classifier_dropout": 0.0,
20
+ "classifier_pooling": "cls",
21
+ "cls_token_id": 6,
22
+ "decoder_bias": true,
23
+ "deterministic_flash_attn": false,
24
+ "embedding_dropout": 0.0,
25
+ "eos_token_id": 2,
26
+ "global_attn_every_n_layers": 3,
27
+ "global_rope_theta": 160000.0,
28
+ "gradient_checkpointing": false,
29
+ "hidden_activation": "gelu",
30
+ "hidden_size": 768,
31
+ "initializer_cutoff_factor": 2.0,
32
+ "initializer_range": 0.02,
33
+ "intermediate_size": 3072,
34
+ "layer_norm_eps": 1e-05,
35
+ "local_attention": 128,
36
+ "local_rope_theta": 10000.0,
37
+ "max_position_embeddings": 8192,
38
+ "mlp_bias": false,
39
+ "mlp_dropout": 0.0,
40
+ "model_type": "modernbert",
41
+ "norm_bias": false,
42
+ "norm_eps": 1e-05,
43
+ "num_attention_heads": 12,
44
+ "num_hidden_layers": 25,
45
+ "pad_token_id": 3,
46
+ "position_embedding_type": "rope",
47
+ "repad_logits_with_grad": false,
48
+ "sep_token_id": 4,
49
+ "sparse_pred_ignore_index": -100,
50
+ "sparse_prediction": false,
51
+ "torch_dtype": "float32",
52
+ "vocab_size": 102400
53
+ },
54
+ "torch_dtype": "bfloat16",
55
+ "transformers_version": "4.51.3",
56
+ "vision_config": {
57
+ "_attn_implementation_autoset": true,
58
+ "depth": 32,
59
+ "fullatt_block_indexes": [
60
+ 7,
61
+ 15,
62
+ 23,
63
+ 31
64
+ ],
65
+ "hidden_act": "silu",
66
+ "hidden_size": 1280,
67
+ "in_channels": 3,
68
+ "in_chans": 3,
69
+ "intermediate_size": 3420,
70
+ "model_type": "qwen2_5_vl",
71
+ "num_heads": 16,
72
+ "out_hidden_size": 3584,
73
+ "patch_size": 14,
74
+ "spatial_merge_size": 2,
75
+ "spatial_patch_size": 14,
76
+ "temporal_patch_size": 2,
77
+ "tokens_per_second": 2,
78
+ "window_size": 112
79
+ }
80
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "4.1.0",
4
+ "transformers": "4.51.3",
5
+ "pytorch": "2.8.0.dev20250530+cu128"
6
+ },
7
+ "prompts": {},
8
+ "default_prompt_name": null,
9
+ "similarity_fn_name": "cosine"
10
+ }
example.py ADDED
@@ -0,0 +1,59 @@
+ import math
+ from PIL import Image
+ from sentence_transformers import SentenceTransformer
+
+ def resize_image_for_patch(image: Image.Image, patch_size: int = 14, max_patches: int = 400) -> Image.Image:
+     """Resize `image` so it spans roughly at most `max_patches` patches of `patch_size` x `patch_size` pixels, preserving aspect ratio."""
+     orig_width, orig_height = image.size
+     aspect_ratio = orig_width / orig_height
+
+     if aspect_ratio >= 1:
+         # Landscape or square orientation
+         target_width = patch_size * int(math.floor(math.sqrt(max_patches * aspect_ratio)))
+         target_height = int(target_width / aspect_ratio)
+     else:
+         # Portrait orientation
+         target_height = patch_size * int(math.floor(math.sqrt(max_patches / aspect_ratio)))
+         target_width = int(target_height * aspect_ratio)
+
+     # Ensure dimensions are multiples of patch_size
+     target_width -= target_width % patch_size
+     target_height -= target_height % patch_size
+
+     return image.resize((target_width, target_height), Image.BICUBIC)
+
+ # Init model from a local checkout of this repository
+ model = SentenceTransformer("./", device="cuda")
+
+ # Replace the path below with your own image
+ images = [
+     resize_image_for_patch(Image.open("/home/aki0421/Share/images/00085.png"))
+ ]
+ image_embeddings = model.encode(images, convert_to_tensor=True)
+
+ # Japanese text queries to score against the image
+ sentences = [
+     "女の子が悲しんでいる。",
+     "落ち込んでる人",
+     "泣いている",
+     "笑っている",
+     "ピンクの髪の女の子",
+     "赤い髪の女の子",
+     "茶色の髪の女の子",
+     "赤い目",
+     "青い目",
+     "曇っている",
+     "雨が降っている",
+     "晴れている",
+     "キッチンにいます。",
+     "学校にいる",
+     "魔法少女のようだ",
+     "戦闘しますか?",
+     "男性ですか?",
+     "茶色い髪の女の子が悲しんでいるシーン",
+     "ピンクの髪の女の子が笑っているシーン"
+ ]
+ text_embeddings = model.encode(sentences, convert_to_tensor=True)
+ similarities = model.similarity(text_embeddings, image_embeddings)
+
+ print(similarities)
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8477a3483d0a3ca9109ec1aeb83f38c0ef2b363dd4fb2a79b7bd15ec67c2a1a0
3
+ size 1993531042
modeling_clip.py ADDED
@@ -0,0 +1,443 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List, Optional, Union
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import transformers
8
+ from PIL import Image
9
+ from torch import nn
10
+ from transformers import (
11
+ ModernBertConfig,
12
+ ModernBertModel,
13
+ PretrainedConfig,
14
+ PreTrainedModel,
15
+ )
16
+
17
+ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
18
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
19
+ Qwen2_5_VisionTransformerPretrainedModel,
20
+ )
21
+
22
+
23
+ # Constants
24
+ DEFAULT_PROJECTION_DIM = 768
25
+ DEFAULT_LOGIT_SCALE_INIT = 0.5
26
+ DEFAULT_MAX_LENGTH = 512
27
+ SPATIAL_MERGE_SIZE = 2
28
+ PROJECTION_INTERMEDIATE_DIM = 1280
29
+ PROJECTION_DROPOUT = 0.1
30
+ RURI_MODEL_NAME = "cl-nagoya/ruri-v3-310m"
31
+ QWEN_MODEL_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"
32
+
33
+ # Input type constants
34
+ IMAGE_INPUT_TYPE = 0
35
+ TEXT_INPUT_TYPE = 1
36
+
37
+
38
+ class CLIPQwenVLConfig(PretrainedConfig):
39
+ """Configuration class for CLIP-QwenVL model."""
40
+
41
+ model_type = "clip_qwen_vl"
42
+
43
+ def __init__(
44
+ self,
45
+ text_config: Optional[Dict[str, Any]] = None,
46
+ vision_config: Optional[Dict[str, Any]] = None,
47
+ projection_dim: int = DEFAULT_PROJECTION_DIM,
48
+ logit_scale_init_value: float = DEFAULT_LOGIT_SCALE_INIT,
49
+ **kwargs,
50
+ ):
51
+ super().__init__(**kwargs)
52
+
53
+ text_config = text_config or {}
54
+ vision_config = vision_config or {}
55
+
56
+ self.text_config = ModernBertConfig(**text_config)
57
+ self.vision_config = Qwen2_5_VLVisionConfig(**vision_config)
58
+
59
+ self.projection_dim = projection_dim
60
+ self.logit_scale_init_value = logit_scale_init_value
61
+
62
+
63
+ class CLIPQwenVLModel(PreTrainedModel):
64
+ """CLIP-QwenVL model for multi-modal embedding generation."""
65
+
66
+ config_class = CLIPQwenVLConfig
67
+
68
+ def __init__(self, config: CLIPQwenVLConfig):
69
+ super().__init__(config)
70
+
71
+ self.projection_dim = config.text_config.hidden_size
72
+ self.text_embed_dim = config.text_config.hidden_size
73
+ self.vision_embed_dim = config.vision_config.out_hidden_size
74
+
75
+ # Initialize text encoder
76
+ self.text_model = ModernBertModel(config.text_config)
77
+
78
+ # Initialize vision encoder
79
+ self.vision_model = Qwen2_5_VisionTransformerPretrainedModel(config.vision_config)
80
+
81
+ # Initialize vision projection layers
82
+ self.vision_projection = self._create_vision_projection()
83
+
84
+ # Initialize logit scale parameter
85
+ self.logit_scale = nn.Parameter(torch.ones([]) * config.logit_scale_init_value)
86
+
87
+ def _create_vision_projection(self) -> nn.Module:
88
+ """Create vision projection layers with dropout and activation."""
89
+ return nn.Sequential(
90
+ nn.Linear(self.vision_embed_dim, PROJECTION_INTERMEDIATE_DIM),
91
+ nn.GELU(),
92
+ nn.Dropout(PROJECTION_DROPOUT),
93
+ nn.Linear(PROJECTION_INTERMEDIATE_DIM, self.projection_dim),
94
+ nn.Tanh(),
95
+ )
96
+
97
+ def _apply_mean_pooling(
98
+ self,
99
+ last_hidden_state: torch.Tensor,
100
+ attention_mask: torch.Tensor
101
+ ) -> torch.Tensor:
102
+ """Apply mean pooling to text embeddings using attention mask."""
103
+ attention_mask = attention_mask.to(last_hidden_state.dtype)
104
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(
105
+ last_hidden_state.size()
106
+ )
107
+ sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
108
+ sum_mask = input_mask_expanded.sum(1)
109
+ sum_mask = torch.clamp(sum_mask, min=1e-9)
110
+ return sum_embeddings / sum_mask
111
+
112
+ def _normalize_embeddings(self, embeddings: torch.Tensor) -> torch.Tensor:
113
+ """Apply tanh constraint and L2 normalization to embeddings."""
114
+ # Constrain to [-1,1] range using tanh, then apply L2 normalization
115
+ embeddings = torch.tanh(embeddings)
116
+ return F.normalize(embeddings, p=2, dim=-1)
117
+
118
+ def get_text_features(
119
+ self,
120
+ input_ids: Optional[torch.Tensor] = None,
121
+ attention_mask: Optional[torch.Tensor] = None,
122
+ position_ids: Optional[torch.Tensor] = None,
123
+ output_attentions: Optional[bool] = None,
124
+ output_hidden_states: Optional[bool] = None,
125
+ ) -> torch.FloatTensor:
126
+ """
127
+ Extract and normalize text features from input tokens.
128
+
129
+ Args:
130
+ input_ids: Token ids of shape [batch_size, seq_len]
131
+ attention_mask: Attention mask of shape [batch_size, seq_len]
132
+ position_ids: Position ids of shape [batch_size, seq_len]
133
+ output_attentions: Whether to output attention weights
134
+ output_hidden_states: Whether to output hidden states
135
+
136
+ Returns:
137
+ Normalized text embeddings of shape [batch_size, hidden_size]
138
+ """
139
+ text_outputs = self.text_model(
140
+ input_ids=input_ids,
141
+ attention_mask=attention_mask,
142
+ position_ids=position_ids,
143
+ output_attentions=output_attentions,
144
+ output_hidden_states=output_hidden_states,
145
+ return_dict=True,
146
+ )
147
+
148
+ # Apply mean pooling to get sentence-level representations
149
+ text_embeds = self._apply_mean_pooling(
150
+ text_outputs.last_hidden_state, attention_mask
151
+ )
152
+
153
+ # Apply tanh constraint and L2 normalization
154
+ return self._normalize_embeddings(text_embeds)
155
+
156
+ def _compute_merged_patches_info(self, image_grid_thw: torch.LongTensor) -> torch.Tensor:
157
+ """Compute cumulative sequence lengths for merged image patches."""
158
+ t, h, w = image_grid_thw.unbind(dim=1)
159
+ merged_patches_per_image = (
160
+ (h // SPATIAL_MERGE_SIZE) * (w // SPATIAL_MERGE_SIZE) * t
161
+ )
162
+ return F.pad(merged_patches_per_image.cumsum(0), (1, 0), value=0)
163
+
164
+ def _aggregate_vision_features(
165
+ self,
166
+ vision_output: torch.Tensor,
167
+ merged_cu_seqlens: torch.Tensor
168
+ ) -> torch.Tensor:
169
+ """Aggregate vision features using mean pooling over patches."""
170
+ return torch.stack([
171
+ vision_output[start:end].mean(dim=0)
172
+ for start, end in zip(merged_cu_seqlens[:-1], merged_cu_seqlens[1:])
173
+ ])
174
+
175
+ def get_image_features(
176
+ self,
177
+ pixel_values: Optional[torch.FloatTensor] = None,
178
+ image_grid_thw: Optional[torch.LongTensor] = None,
179
+ ) -> torch.FloatTensor:
180
+ """
181
+ Extract and normalize image features from pixel values.
182
+
183
+ Args:
184
+ pixel_values: Image pixel values
185
+ image_grid_thw: Image grid dimensions [batch_size, 3] (time, height, width)
186
+
187
+ Returns:
188
+ Normalized image embeddings of shape [batch_size, projection_dim]
189
+ """
190
+ # Compute merged patch information
191
+ merged_cu_seqlens = self._compute_merged_patches_info(image_grid_thw)
192
+
193
+ # Extract vision features
194
+ vision_output = self.vision_model(
195
+ hidden_states=pixel_values, grid_thw=image_grid_thw
196
+ )
197
+
198
+ # Aggregate features using mean pooling
199
+ image_features = self._aggregate_vision_features(vision_output, merged_cu_seqlens)
200
+
201
+ # Apply projection layers (includes tanh activation)
202
+ image_embeds = self.vision_projection(image_features)
203
+
204
+ # Apply L2 normalization (tanh constraint is already applied in projection)
205
+ return F.normalize(image_embeds, p=2, dim=-1)
206
+
207
+ def compute_similarity(
208
+ self,
209
+ text_embeds: torch.FloatTensor,
210
+ image_embeds: torch.FloatTensor,
211
+ ) -> torch.FloatTensor:
212
+ """
213
+ Compute similarity between text and image embeddings.
214
+
215
+ Args:
216
+ text_embeds: Tanh + L2 normalized text embeddings [batch_size, embed_dim]
217
+ image_embeds: Tanh + L2 normalized image embeddings [batch_size, embed_dim]
218
+
219
+ Returns:
220
+ Similarity matrix [batch_size, batch_size] in range [0, 1]
221
+ """
222
+ # Embeddings are constrained to [-1,1] by tanh, dot product is in [-1,1] range
223
+ # Scale moderately with small logit_scale for stable training
224
+ logit_scale = self.logit_scale.exp()
225
+ similarity = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
226
+
227
+ # Apply sigmoid for more natural 0~1 mapping
228
+ # Tanh and logit_scale adjustment helps avoid extreme values and promotes stable learning
229
+ return torch.sigmoid(similarity)
230
+
231
+
232
+ class CLIPQwenVLWrapper(nn.Module):
233
+ """Wrapper class for CLIP-QwenVL model with tokenization and processing capabilities."""
234
+
235
+ save_in_root: bool = True
236
+
237
+ def __init__(
238
+ self,
239
+ model_name_or_path: str,
240
+ cache_dir: str = None,
241
+ backend: str = "torch",
242
+ enable_text_grad: bool = False,
243
+ **kwargs,
244
+ ) -> None:
245
+ super().__init__()
246
+
247
+ self.enable_text_grad = enable_text_grad
248
+
249
+ # Setup model arguments with default dtype
250
+ model_args = kwargs.get("model_args", {})
251
+ if "torch_dtype" not in model_args:
252
+ model_args["torch_dtype"] = torch.bfloat16
253
+
254
+ # Initialize model components
255
+ self.model = CLIPQwenVLModel.from_pretrained(
256
+ model_name_or_path, cache_dir=cache_dir, **model_args
257
+ )
258
+ self.tokenizer = transformers.AutoTokenizer.from_pretrained(RURI_MODEL_NAME)
259
+ self.processor = transformers.AutoProcessor.from_pretrained(
260
+ QWEN_MODEL_NAME, use_fast=False
261
+ )
262
+
263
+ def __repr__(self) -> str:
264
+ return "CLIPQwenVLWrapper()"
265
+
266
+ def _extract_embeddings_by_type(
267
+ self,
268
+ features: dict[str, torch.Tensor]
269
+ ) -> tuple[torch.Tensor, torch.Tensor]:
270
+ """Extract image and text embeddings from features."""
271
+ image_embeds = []
272
+ text_embeds = []
273
+
274
+ if "pixel_values" in features:
275
+ image_embeds = self.model.get_image_features(
276
+ pixel_values=features["pixel_values"],
277
+ image_grid_thw=features["image_grid_thw"],
278
+ )
279
+
280
+ if "input_ids" in features:
281
+ text_embeds = self.model.get_text_features(
282
+ input_ids=features["input_ids"],
283
+ attention_mask=features.get("attention_mask", None),
284
+ position_ids=features.get("position_ids", None),
285
+ output_attentions=features.get("output_attentions", None),
286
+ output_hidden_states=features.get("output_hidden_states", None),
287
+ )
288
+
289
+ if self.enable_text_grad:
290
+ # Avoid errors when not specifying text model layers during PEFT training
291
+ text_embeds = text_embeds.detach().requires_grad_()
292
+
293
+ return image_embeds, text_embeds
294
+
295
+ def _build_sentence_embeddings(
296
+ self,
297
+ image_embeds: torch.Tensor,
298
+ text_embeds: torch.Tensor,
299
+ image_text_info: List[int],
300
+ ) -> torch.Tensor:
301
+ """Build sentence embeddings by selecting appropriate embeddings based on input type."""
302
+ sentence_embedding = []
303
+ image_features = iter(image_embeds)
304
+ text_features = iter(text_embeds)
305
+
306
+ for input_type in image_text_info:
307
+ if input_type == IMAGE_INPUT_TYPE:
308
+ sentence_embedding.append(next(image_features))
309
+ else:
310
+ sentence_embedding.append(next(text_features))
311
+
312
+ return torch.stack(sentence_embedding).float()
313
+
314
+ def forward(self, features: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
315
+ """
316
+ Forward pass to generate embeddings for mixed image and text inputs.
317
+
318
+ Args:
319
+ features: Dictionary containing input features
320
+
321
+ Returns:
322
+ Dictionary with sentence embeddings added
323
+ """
324
+ # Extract embeddings by modality
325
+ image_embeds, text_embeds = self._extract_embeddings_by_type(features)
326
+
327
+ # Build combined sentence embeddings
328
+ features["sentence_embedding"] = self._build_sentence_embeddings(
329
+ image_embeds, text_embeds, features["image_text_info"]
330
+ )
331
+
332
+ return features
333
+
334
+ def _separate_inputs_by_type(
335
+ self,
336
+ texts: List[Union[str, Image.Image]]
337
+ ) -> tuple[List[Image.Image], List[str], List[int]]:
338
+ """Separate mixed inputs into images, texts, and type information."""
339
+ images = []
340
+ texts_values = []
341
+ image_text_info = []
342
+
343
+ for data in texts:
344
+ if isinstance(data, Image.Image):
345
+ images.append(data)
346
+ image_text_info.append(IMAGE_INPUT_TYPE)
347
+ else:
348
+ texts_values.append(data)
349
+ image_text_info.append(TEXT_INPUT_TYPE)
350
+
351
+ return images, texts_values, image_text_info
352
+
353
+ def _tokenize_texts(
354
+ self,
355
+ texts_values: List[str],
356
+ padding: str | bool
357
+ ) -> dict[str, torch.Tensor]:
358
+ """Tokenize text inputs."""
359
+ if not texts_values:
360
+ return {}
361
+
362
+ return self.tokenizer(
363
+ texts_values,
364
+ return_tensors="pt",
365
+ padding=padding,
366
+ truncation=True,
367
+ max_length=DEFAULT_MAX_LENGTH,
368
+ )
369
+
370
+ def _process_images(self, images: List[Image.Image]) -> dict[str, torch.Tensor]:
371
+ """Process image inputs."""
372
+ if not images:
373
+ return {}
374
+
375
+ return self.processor.image_processor(images, return_tensors="pt")
376
+
377
+ def tokenize(
378
+ self,
379
+ texts: List[Union[str, Image.Image]],
380
+ padding: str | bool = True
381
+ ) -> dict[str, torch.Tensor]:
382
+ """
383
+ Tokenize mixed text and image inputs.
384
+
385
+ Args:
386
+ texts: List of text strings and/or PIL Images
387
+ padding: Whether to pad sequences
388
+
389
+ Returns:
390
+ Dictionary containing tokenized features
391
+ """
392
+ # Separate inputs by type
393
+ images, texts_values, image_text_info = self._separate_inputs_by_type(texts)
394
+
395
+ # Process each modality
396
+ encoding = {}
397
+
398
+ # Tokenize texts
399
+ text_encoding = self._tokenize_texts(texts_values, padding)
400
+ encoding.update(text_encoding)
401
+
402
+ # Process images
403
+ image_encoding = self._process_images(images)
404
+ encoding.update(image_encoding)
405
+
406
+ # Add type information
407
+ encoding["image_text_info"] = image_text_info
408
+
409
+ return dict(encoding)
410
+
411
+ @property
412
+ def processor(self) -> transformers.PreTrainedModel:
413
+ """Get the image processor."""
414
+ return self._processor
415
+
416
+ @processor.setter
417
+ def processor(self, processor):
418
+ """Set the image processor."""
419
+ self._processor = processor
420
+
421
+ def save(self, output_path: str) -> None:
422
+ """
423
+ Save model, tokenizer, and processor to the specified path.
424
+
425
+ Args:
426
+ output_path: Directory path to save the components
427
+ """
428
+ self.model.save_pretrained(output_path)
429
+ self.tokenizer.save_pretrained(output_path)
430
+ self.processor.save_pretrained(output_path)
431
+
432
+ @staticmethod
433
+ def load(input_path: str) -> CLIPQwenVLWrapper:
434
+ """
435
+ Load model from the specified path.
436
+
437
+ Args:
438
+ input_path: Directory path containing the saved model
439
+
440
+ Returns:
441
+ Loaded CLIPQwenVLWrapper instance
442
+ """
443
+ return CLIPQwenVLWrapper(model_name_or_path=input_path)
modules.json ADDED
@@ -0,0 +1,8 @@
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "modeling_clip.CLIPQwenVLWrapper"
7
+ }
8
+ ]
preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "Qwen2VLImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "max_pixels": 12845056,
18
+ "merge_size": 2,
19
+ "min_pixels": 3136,
20
+ "patch_size": 14,
21
+ "processor_class": "Qwen2_5_VLProcessor",
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "longest_edge": 12845056,
26
+ "shortest_edge": 3136
27
+ },
28
+ "temporal_patch_size": 2
29
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:008293028e1a9d9a1038d9b63d989a2319797dfeaa03f171093a57b33a3a8277
3
+ size 1831879
tokenizer_config.json ADDED
@@ -0,0 +1,210 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "extra_special_tokens": {},
203
+ "model_max_length": 131072,
204
+ "pad_token": "<|endoftext|>",
205
+ "processor_class": "Qwen2_5_VLProcessor",
206
+ "split_special_tokens": false,
207
+ "tokenizer_class": "Qwen2Tokenizer",
208
+ "unk_token": null,
209
+ "use_fast": false
210
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff