aki-0421 committed
Commit 131c541 · verified · 1 Parent(s): 11b05a4

Upload folder using huggingface_hub
.gitignore ADDED
@@ -0,0 +1,9 @@
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ *.pyw
+ *.pyz
+ *.pywz
+ *.pyzw
+ *.pyzwz
README.md CHANGED
@@ -1,3 +1,115 @@
- ---
- license: cc-by-nc-4.0
- ---
+ ---
+ tags:
+ - sentence-transformers
+ - sentence-similarity
+ - feature-extraction
+ pipeline_tag: sentence-similarity
+ library_name: sentence-transformers
+ language:
+ - ja
+ base_model:
+ - cl-nagoya/ruri-v3-310m
+ - Qwen/Qwen2.5-VL-7B-Instruct
+ license: apache-2.0
+ ---
+
+ ### aki-0421/clip-anime-patch400-10k-v1
+
+ This is a CLIP-style model for anime character retrieval. It pairs the Japanese text encoder cl-nagoya/ruri-v3-310m with the vision tower of Qwen/Qwen2.5-VL-7B-Instruct, and it is loaded and used through the sentence-transformers library.
+
+ ### Example
+
+ ```python
+ import math
+ from PIL import Image
+ from sentence_transformers import SentenceTransformer
+
+ def resize_image_for_patch(image: Image.Image, patch_size: int = 14, max_patches: int = 400) -> Image.Image:
+     """Resize `image` so it spans roughly at most `max_patches` patches of `patch_size` x `patch_size` pixels, preserving aspect ratio."""
+     orig_width, orig_height = image.size
+     aspect_ratio = orig_width / orig_height
+
+     if aspect_ratio >= 1:
+         # Landscape or square orientation
+         target_width = patch_size * int(math.floor(math.sqrt(max_patches * aspect_ratio)))
+         target_height = int(target_width / aspect_ratio)
+     else:
+         # Portrait orientation
+         target_height = patch_size * int(math.floor(math.sqrt(max_patches / aspect_ratio)))
+         target_width = int(target_height * aspect_ratio)
+
+     # Ensure dimensions are multiples of patch_size
+     target_width -= target_width % patch_size
+     target_height -= target_height % patch_size
+
+     return image.resize((target_width, target_height), Image.BICUBIC)
+
+ # Init model from a local checkout of this repository
+ model = SentenceTransformer("./", device="cuda")
+
+ # Replace the path below with your own image
+ images = [
+     resize_image_for_patch(Image.open("/home/aki0421/Share/images/00085.png"))
+ ]
+ image_embeddings = model.encode(images, convert_to_tensor=True)
+
+ # Japanese text queries to score against the image
+ sentences = [
+     "女の子が悲しんでいる。",
+     "落ち込んでる人",
+     "泣いている",
+     "笑っている",
+     "ピンクの髪の女の子",
+     "赤い髪の女の子",
+     "茶色の髪の女の子",
+     "赤い目",
+     "青い目",
+     "曇っている",
+     "雨が降っている",
+     "晴れている",
+     "キッチンにいます。",
+     "学校にいる",
+     "魔法少女のようだ",
+     "戦闘しますか?",
+     "男性ですか?",
+     "茶色い髪の女の子が悲しんでいるシーン",
+     "ピンクの髪の女の子が笑っているシーン"
+ ]
+ text_embeddings = model.encode(sentences, convert_to_tensor=True)
+ similarities = model.similarity(text_embeddings, image_embeddings)
+
+ print(similarities)
+ ```
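+
+ As an illustrative follow-up (not part of the original script), you can rank the queries against the image directly from the similarity matrix; with the call order above, `similarities` has shape `[len(sentences), len(images)]`:
+
+ ```python
+ import torch
+
+ # Scores of every query against the first (and only) image
+ scores = similarities[:, 0]
+ top = torch.topk(scores, k=5)
+ for score, idx in zip(top.values.tolist(), top.indices.tolist()):
+     print(f"{score:.3f}  {sentences[idx]}")
+ ```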
+
+ ### Citation
+
+ ```bibtex
+ @misc{qwen2.5-VL,
+   title = {Qwen2.5-VL},
+   url = {https://qwenlm.github.io/blog/qwen2.5-vl/},
+   author = {Qwen Team},
+   month = {January},
+   year = {2025}
+ }
+
+ @misc{Ruri,
+   title = {{Ruri: Japanese General Text Embeddings}},
+   author = {Hayato Tsukagoshi and Ryohei Sasano},
+   year = {2024},
+   eprint = {2409.07737},
+   archivePrefix = {arXiv},
+   primaryClass = {cs.CL},
+   url = {https://arxiv.org/abs/2409.07737}
+ }
+
+ @misc{oshizo2024clipqwen,
+   author = {Oshizo},
+   title = {japanese-clip-qwen2\_vl},
+   year = {2024},
+   howpublished = {\url{https://github.com/oshizo/japanese-clip-qwen2_vl}},
+   note = {Accessed: 2025-06-08}
+ }
+ ```
added_tokens.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
chat_template.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
3
+ }
config.json ADDED
@@ -0,0 +1,80 @@
1
+ {
2
+ "architectures": [
3
+ "CLIPQwenVLModel"
4
+ ],
5
+ "logit_scale_init_value": 0.5,
6
+ "model_type": "clip_qwen_vl",
7
+ "projection_dim": 768,
8
+ "text_config": {
9
+ "_attn_implementation_autoset": true,
10
+ "_name_or_path": "cl-nagoya/ruri-v3-310m",
11
+ "architectures": [
12
+ "ModernBertModel"
13
+ ],
14
+ "attention_bias": false,
15
+ "attention_dropout": 0.0,
16
+ "bos_token_id": 1,
17
+ "classifier_activation": "gelu",
18
+ "classifier_bias": false,
19
+ "classifier_dropout": 0.0,
20
+ "classifier_pooling": "cls",
21
+ "cls_token_id": 6,
22
+ "decoder_bias": true,
23
+ "deterministic_flash_attn": false,
24
+ "embedding_dropout": 0.0,
25
+ "eos_token_id": 2,
26
+ "global_attn_every_n_layers": 3,
27
+ "global_rope_theta": 160000.0,
28
+ "gradient_checkpointing": false,
29
+ "hidden_activation": "gelu",
30
+ "hidden_size": 768,
31
+ "initializer_cutoff_factor": 2.0,
32
+ "initializer_range": 0.02,
33
+ "intermediate_size": 3072,
34
+ "layer_norm_eps": 1e-05,
35
+ "local_attention": 128,
36
+ "local_rope_theta": 10000.0,
37
+ "max_position_embeddings": 8192,
38
+ "mlp_bias": false,
39
+ "mlp_dropout": 0.0,
40
+ "model_type": "modernbert",
41
+ "norm_bias": false,
42
+ "norm_eps": 1e-05,
43
+ "num_attention_heads": 12,
44
+ "num_hidden_layers": 25,
45
+ "pad_token_id": 3,
46
+ "position_embedding_type": "rope",
47
+ "repad_logits_with_grad": false,
48
+ "sep_token_id": 4,
49
+ "sparse_pred_ignore_index": -100,
50
+ "sparse_prediction": false,
51
+ "torch_dtype": "float32",
52
+ "vocab_size": 102400
53
+ },
54
+ "torch_dtype": "bfloat16",
55
+ "transformers_version": "4.51.3",
56
+ "vision_config": {
57
+ "_attn_implementation_autoset": true,
58
+ "depth": 32,
59
+ "fullatt_block_indexes": [
60
+ 7,
61
+ 15,
62
+ 23,
63
+ 31
64
+ ],
65
+ "hidden_act": "silu",
66
+ "hidden_size": 1280,
67
+ "in_channels": 3,
68
+ "in_chans": 3,
69
+ "intermediate_size": 3420,
70
+ "model_type": "qwen2_5_vl",
71
+ "num_heads": 16,
72
+ "out_hidden_size": 3584,
73
+ "patch_size": 14,
74
+ "spatial_merge_size": 2,
75
+ "spatial_patch_size": 14,
76
+ "temporal_patch_size": 2,
77
+ "tokens_per_second": 2,
78
+ "window_size": 112
79
+ }
80
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "4.1.0",
4
+ "transformers": "4.51.3",
5
+ "pytorch": "2.8.0.dev20250530+cu128"
6
+ },
7
+ "prompts": {},
8
+ "default_prompt_name": null,
9
+ "similarity_fn_name": "cosine"
10
+ }
example.py ADDED
@@ -0,0 +1,59 @@
+ import math
+ from PIL import Image
+ from sentence_transformers import SentenceTransformer
+
+ def resize_image_for_patch(image: Image.Image, patch_size: int = 14, max_patches: int = 400) -> Image.Image:
+     """Resize `image` so it spans roughly at most `max_patches` patches of `patch_size` x `patch_size` pixels, preserving aspect ratio."""
+     orig_width, orig_height = image.size
+     aspect_ratio = orig_width / orig_height
+
+     if aspect_ratio >= 1:
+         # Landscape or square orientation
+         target_width = patch_size * int(math.floor(math.sqrt(max_patches * aspect_ratio)))
+         target_height = int(target_width / aspect_ratio)
+     else:
+         # Portrait orientation
+         target_height = patch_size * int(math.floor(math.sqrt(max_patches / aspect_ratio)))
+         target_width = int(target_height * aspect_ratio)
+
+     # Ensure dimensions are multiples of patch_size
+     target_width -= target_width % patch_size
+     target_height -= target_height % patch_size
+
+     return image.resize((target_width, target_height), Image.BICUBIC)
+
+ # Init model from a local checkout of this repository
+ model = SentenceTransformer("./", device="cuda")
+
+ # Replace the path below with your own image
+ images = [
+     resize_image_for_patch(Image.open("/home/aki0421/Share/images/00085.png"))
+ ]
+ image_embeddings = model.encode(images, convert_to_tensor=True)
+
+ # Japanese text queries to score against the image
+ sentences = [
+     "女の子が悲しんでいる。",
+     "落ち込んでる人",
+     "泣いている",
+     "笑っている",
+     "ピンクの髪の女の子",
+     "赤い髪の女の子",
+     "茶色の髪の女の子",
+     "赤い目",
+     "青い目",
+     "曇っている",
+     "雨が降っている",
+     "晴れている",
+     "キッチンにいます。",
+     "学校にいる",
+     "魔法少女のようだ",
+     "戦闘しますか?",
+     "男性ですか?",
+     "茶色い髪の女の子が悲しんでいるシーン",
+     "ピンクの髪の女の子が笑っているシーン"
+ ]
+ text_embeddings = model.encode(sentences, convert_to_tensor=True)
+ similarities = model.similarity(text_embeddings, image_embeddings)
+
+ print(similarities)
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8477a3483d0a3ca9109ec1aeb83f38c0ef2b363dd4fb2a79b7bd15ec67c2a1a0
3
+ size 1993531042
modeling_clip.py ADDED
@@ -0,0 +1,443 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List, Optional, Union
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import transformers
8
+ from PIL import Image
9
+ from torch import nn
10
+ from transformers import (
11
+ ModernBertConfig,
12
+ ModernBertModel,
13
+ PretrainedConfig,
14
+ PreTrainedModel,
15
+ )
16
+
17
+ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
18
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
19
+ Qwen2_5_VisionTransformerPretrainedModel,
20
+ )
21
+
22
+
23
+ # Constants
24
+ DEFAULT_PROJECTION_DIM = 768
25
+ DEFAULT_LOGIT_SCALE_INIT = 0.5
26
+ DEFAULT_MAX_LENGTH = 512
27
+ SPATIAL_MERGE_SIZE = 2
28
+ PROJECTION_INTERMEDIATE_DIM = 1280
29
+ PROJECTION_DROPOUT = 0.1
30
+ RURI_MODEL_NAME = "cl-nagoya/ruri-v3-310m"
31
+ QWEN_MODEL_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"
32
+
33
+ # Input type constants
34
+ IMAGE_INPUT_TYPE = 0
35
+ TEXT_INPUT_TYPE = 1
36
+
37
+
38
+ class CLIPQwenVLConfig(PretrainedConfig):
39
+ """Configuration class for CLIP-QwenVL model."""
40
+
41
+ model_type = "clip_qwen_vl"
42
+
43
+ def __init__(
44
+ self,
45
+ text_config: Optional[Dict[str, Any]] = None,
46
+ vision_config: Optional[Dict[str, Any]] = None,
47
+ projection_dim: int = DEFAULT_PROJECTION_DIM,
48
+ logit_scale_init_value: float = DEFAULT_LOGIT_SCALE_INIT,
49
+ **kwargs,
50
+ ):
51
+ super().__init__(**kwargs)
52
+
53
+ text_config = text_config or {}
54
+ vision_config = vision_config or {}
55
+
56
+ self.text_config = ModernBertConfig(**text_config)
57
+ self.vision_config = Qwen2_5_VLVisionConfig(**vision_config)
58
+
59
+ self.projection_dim = projection_dim
60
+ self.logit_scale_init_value = logit_scale_init_value
61
+
62
+
63
+ class CLIPQwenVLModel(PreTrainedModel):
64
+ """CLIP-QwenVL model for multi-modal embedding generation."""
65
+
66
+ config_class = CLIPQwenVLConfig
67
+
68
+ def __init__(self, config: CLIPQwenVLConfig):
69
+ super().__init__(config)
70
+
71
+ self.projection_dim = config.text_config.hidden_size
72
+ self.text_embed_dim = config.text_config.hidden_size
73
+ self.vision_embed_dim = config.vision_config.out_hidden_size
74
+
75
+ # Initialize text encoder
76
+ self.text_model = ModernBertModel(config.text_config)
77
+
78
+ # Initialize vision encoder
79
+ self.vision_model = Qwen2_5_VisionTransformerPretrainedModel(config.vision_config)
80
+
81
+ # Initialize vision projection layers
82
+ self.vision_projection = self._create_vision_projection()
83
+
84
+ # Initialize logit scale parameter
85
+ self.logit_scale = nn.Parameter(torch.ones([]) * config.logit_scale_init_value)
86
+
87
+ def _create_vision_projection(self) -> nn.Module:
88
+ """Create vision projection layers with dropout and activation."""
89
+ return nn.Sequential(
90
+ nn.Linear(self.vision_embed_dim, PROJECTION_INTERMEDIATE_DIM),
91
+ nn.GELU(),
92
+ nn.Dropout(PROJECTION_DROPOUT),
93
+ nn.Linear(PROJECTION_INTERMEDIATE_DIM, self.projection_dim),
94
+ nn.Tanh(),
95
+ )
96
+
97
+ def _apply_mean_pooling(
98
+ self,
99
+ last_hidden_state: torch.Tensor,
100
+ attention_mask: torch.Tensor
101
+ ) -> torch.Tensor:
102
+ """Apply mean pooling to text embeddings using attention mask."""
103
+ attention_mask = attention_mask.to(last_hidden_state.dtype)
104
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(
105
+ last_hidden_state.size()
106
+ )
107
+ sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
108
+ sum_mask = input_mask_expanded.sum(1)
109
+ sum_mask = torch.clamp(sum_mask, min=1e-9)
110
+ return sum_embeddings / sum_mask
111
+
112
+ def _normalize_embeddings(self, embeddings: torch.Tensor) -> torch.Tensor:
113
+ """Apply tanh constraint and L2 normalization to embeddings."""
114
+ # Constrain to [-1,1] range using tanh, then apply L2 normalization
115
+ embeddings = torch.tanh(embeddings)
116
+ return F.normalize(embeddings, p=2, dim=-1)
117
+
118
+ def get_text_features(
119
+ self,
120
+ input_ids: Optional[torch.Tensor] = None,
121
+ attention_mask: Optional[torch.Tensor] = None,
122
+ position_ids: Optional[torch.Tensor] = None,
123
+ output_attentions: Optional[bool] = None,
124
+ output_hidden_states: Optional[bool] = None,
125
+ ) -> torch.FloatTensor:
126
+ """
127
+ Extract and normalize text features from input tokens.
128
+
129
+ Args:
130
+ input_ids: Token ids of shape [batch_size, seq_len]
131
+ attention_mask: Attention mask of shape [batch_size, seq_len]
132
+ position_ids: Position ids of shape [batch_size, seq_len]
133
+ output_attentions: Whether to output attention weights
134
+ output_hidden_states: Whether to output hidden states
135
+
136
+ Returns:
137
+ Normalized text embeddings of shape [batch_size, hidden_size]
138
+ """
139
+ text_outputs = self.text_model(
140
+ input_ids=input_ids,
141
+ attention_mask=attention_mask,
142
+ position_ids=position_ids,
143
+ output_attentions=output_attentions,
144
+ output_hidden_states=output_hidden_states,
145
+ return_dict=True,
146
+ )
147
+
148
+ # Apply mean pooling to get sentence-level representations
149
+ text_embeds = self._apply_mean_pooling(
150
+ text_outputs.last_hidden_state, attention_mask
151
+ )
152
+
153
+ # Apply tanh constraint and L2 normalization
154
+ return self._normalize_embeddings(text_embeds)
155
+
156
+ def _compute_merged_patches_info(self, image_grid_thw: torch.LongTensor) -> torch.Tensor:
157
+ """Compute cumulative sequence lengths for merged image patches."""
158
+ t, h, w = image_grid_thw.unbind(dim=1)
159
+ merged_patches_per_image = (
160
+ (h // SPATIAL_MERGE_SIZE) * (w // SPATIAL_MERGE_SIZE) * t
161
+ )
162
+ return F.pad(merged_patches_per_image.cumsum(0), (1, 0), value=0)
163
+
164
+ def _aggregate_vision_features(
165
+ self,
166
+ vision_output: torch.Tensor,
167
+ merged_cu_seqlens: torch.Tensor
168
+ ) -> torch.Tensor:
169
+ """Aggregate vision features using mean pooling over patches."""
170
+ return torch.stack([
171
+ vision_output[start:end].mean(dim=0)
172
+ for start, end in zip(merged_cu_seqlens[:-1], merged_cu_seqlens[1:])
173
+ ])
174
+
175
+ def get_image_features(
176
+ self,
177
+ pixel_values: Optional[torch.FloatTensor] = None,
178
+ image_grid_thw: Optional[torch.LongTensor] = None,
179
+ ) -> torch.FloatTensor:
180
+ """
181
+ Extract and normalize image features from pixel values.
182
+
183
+ Args:
184
+ pixel_values: Image pixel values
185
+ image_grid_thw: Image grid dimensions [batch_size, 3] (time, height, width)
186
+
187
+ Returns:
188
+ Normalized image embeddings of shape [batch_size, projection_dim]
189
+ """
190
+ # Compute merged patch information
191
+ merged_cu_seqlens = self._compute_merged_patches_info(image_grid_thw)
192
+
193
+ # Extract vision features
194
+ vision_output = self.vision_model(
195
+ hidden_states=pixel_values, grid_thw=image_grid_thw
196
+ )
197
+
198
+ # Aggregate features using mean pooling
199
+ image_features = self._aggregate_vision_features(vision_output, merged_cu_seqlens)
200
+
201
+ # Apply projection layers (includes tanh activation)
202
+ image_embeds = self.vision_projection(image_features)
203
+
204
+ # Apply L2 normalization (tanh constraint is already applied in projection)
205
+ return F.normalize(image_embeds, p=2, dim=-1)
206
+
207
+ def compute_similarity(
208
+ self,
209
+ text_embeds: torch.FloatTensor,
210
+ image_embeds: torch.FloatTensor,
211
+ ) -> torch.FloatTensor:
212
+ """
213
+ Compute similarity between text and image embeddings.
214
+
215
+ Args:
216
+ text_embeds: Tanh + L2 normalized text embeddings [batch_size, embed_dim]
217
+ image_embeds: Tanh + L2 normalized image embeddings [batch_size, embed_dim]
218
+
219
+ Returns:
220
+ Similarity matrix [batch_size, batch_size] in range [0, 1]
221
+ """
222
+ # Embeddings are constrained to [-1,1] by tanh, dot product is in [-1,1] range
223
+ # Scale moderately with small logit_scale for stable training
224
+ logit_scale = self.logit_scale.exp()
225
+ similarity = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
226
+
227
+ # Apply sigmoid for more natural 0~1 mapping
228
+ # Tanh and logit_scale adjustment helps avoid extreme values and promotes stable learning
229
+ return torch.sigmoid(similarity)
230
+
231
+
232
+ class CLIPQwenVLWrapper(nn.Module):
233
+ """Wrapper class for CLIP-QwenVL model with tokenization and processing capabilities."""
234
+
235
+ save_in_root: bool = True
236
+
237
+ def __init__(
238
+ self,
239
+ model_name_or_path: str,
240
+ cache_dir: str = None,
241
+ backend: str = "torch",
242
+ enable_text_grad: bool = False,
243
+ **kwargs,
244
+ ) -> None:
245
+ super().__init__()
246
+
247
+ self.enable_text_grad = enable_text_grad
248
+
249
+ # Setup model arguments with default dtype
250
+ model_args = kwargs.get("model_args", {})
251
+ if "torch_dtype" not in model_args:
252
+ model_args["torch_dtype"] = torch.bfloat16
253
+
254
+ # Initialize model components
255
+ self.model = CLIPQwenVLModel.from_pretrained(
256
+ model_name_or_path, cache_dir=cache_dir, **model_args
257
+ )
258
+ self.tokenizer = transformers.AutoTokenizer.from_pretrained(RURI_MODEL_NAME)
259
+ self.processor = transformers.AutoProcessor.from_pretrained(
260
+ QWEN_MODEL_NAME, use_fast=False
261
+ )
262
+
263
+ def __repr__(self) -> str:
264
+ return "CLIPQwenVLWrapper()"
265
+
266
+ def _extract_embeddings_by_type(
267
+ self,
268
+ features: dict[str, torch.Tensor]
269
+ ) -> tuple[torch.Tensor, torch.Tensor]:
270
+ """Extract image and text embeddings from features."""
271
+ image_embeds = []
272
+ text_embeds = []
273
+
274
+ if "pixel_values" in features:
275
+ image_embeds = self.model.get_image_features(
276
+ pixel_values=features["pixel_values"],
277
+ image_grid_thw=features["image_grid_thw"],
278
+ )
279
+
280
+ if "input_ids" in features:
281
+ text_embeds = self.model.get_text_features(
282
+ input_ids=features["input_ids"],
283
+ attention_mask=features.get("attention_mask", None),
284
+ position_ids=features.get("position_ids", None),
285
+ output_attentions=features.get("output_attentions", None),
286
+ output_hidden_states=features.get("output_hidden_states", None),
287
+ )
288
+
289
+ if self.enable_text_grad:
290
+ # Avoid errors when not specifying text model layers during PEFT training
291
+ text_embeds = text_embeds.detach().requires_grad_()
292
+
293
+ return image_embeds, text_embeds
294
+
295
+ def _build_sentence_embeddings(
296
+ self,
297
+ image_embeds: torch.Tensor,
298
+ text_embeds: torch.Tensor,
299
+ image_text_info: List[int],
300
+ ) -> torch.Tensor:
301
+ """Build sentence embeddings by selecting appropriate embeddings based on input type."""
302
+ sentence_embedding = []
303
+ image_features = iter(image_embeds)
304
+ text_features = iter(text_embeds)
305
+
306
+ for input_type in image_text_info:
307
+ if input_type == IMAGE_INPUT_TYPE:
308
+ sentence_embedding.append(next(image_features))
309
+ else:
310
+ sentence_embedding.append(next(text_features))
311
+
312
+ return torch.stack(sentence_embedding).float()
313
+
314
+ def forward(self, features: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
315
+ """
316
+ Forward pass to generate embeddings for mixed image and text inputs.
317
+
318
+ Args:
319
+ features: Dictionary containing input features
320
+
321
+ Returns:
322
+ Dictionary with sentence embeddings added
323
+ """
324
+ # Extract embeddings by modality
325
+ image_embeds, text_embeds = self._extract_embeddings_by_type(features)
326
+
327
+ # Build combined sentence embeddings
328
+ features["sentence_embedding"] = self._build_sentence_embeddings(
329
+ image_embeds, text_embeds, features["image_text_info"]
330
+ )
331
+
332
+ return features
333
+
334
+ def _separate_inputs_by_type(
335
+ self,
336
+ texts: List[Union[str, Image.Image]]
337
+ ) -> tuple[List[Image.Image], List[str], List[int]]:
338
+ """Separate mixed inputs into images, texts, and type information."""
339
+ images = []
340
+ texts_values = []
341
+ image_text_info = []
342
+
343
+ for data in texts:
344
+ if isinstance(data, Image.Image):
345
+ images.append(data)
346
+ image_text_info.append(IMAGE_INPUT_TYPE)
347
+ else:
348
+ texts_values.append(data)
349
+ image_text_info.append(TEXT_INPUT_TYPE)
350
+
351
+ return images, texts_values, image_text_info
352
+
353
+ def _tokenize_texts(
354
+ self,
355
+ texts_values: List[str],
356
+ padding: str | bool
357
+ ) -> dict[str, torch.Tensor]:
358
+ """Tokenize text inputs."""
359
+ if not texts_values:
360
+ return {}
361
+
362
+ return self.tokenizer(
363
+ texts_values,
364
+ return_tensors="pt",
365
+ padding=padding,
366
+ truncation=True,
367
+ max_length=DEFAULT_MAX_LENGTH,
368
+ )
369
+
370
+ def _process_images(self, images: List[Image.Image]) -> dict[str, torch.Tensor]:
371
+ """Process image inputs."""
372
+ if not images:
373
+ return {}
374
+
375
+ return self.processor.image_processor(images, return_tensors="pt")
376
+
377
+ def tokenize(
378
+ self,
379
+ texts: List[Union[str, Image.Image]],
380
+ padding: str | bool = True
381
+ ) -> dict[str, torch.Tensor]:
382
+ """
383
+ Tokenize mixed text and image inputs.
384
+
385
+ Args:
386
+ texts: List of text strings and/or PIL Images
387
+ padding: Whether to pad sequences
388
+
389
+ Returns:
390
+ Dictionary containing tokenized features
391
+ """
392
+ # Separate inputs by type
393
+ images, texts_values, image_text_info = self._separate_inputs_by_type(texts)
394
+
395
+ # Process each modality
396
+ encoding = {}
397
+
398
+ # Tokenize texts
399
+ text_encoding = self._tokenize_texts(texts_values, padding)
400
+ encoding.update(text_encoding)
401
+
402
+ # Process images
403
+ image_encoding = self._process_images(images)
404
+ encoding.update(image_encoding)
405
+
406
+ # Add type information
407
+ encoding["image_text_info"] = image_text_info
408
+
409
+ return dict(encoding)
410
+
411
+ @property
412
+ def processor(self) -> transformers.PreTrainedModel:
413
+ """Get the image processor."""
414
+ return self._processor
415
+
416
+ @processor.setter
417
+ def processor(self, processor):
418
+ """Set the image processor."""
419
+ self._processor = processor
420
+
421
+ def save(self, output_path: str) -> None:
422
+ """
423
+ Save model, tokenizer, and processor to the specified path.
424
+
425
+ Args:
426
+ output_path: Directory path to save the components
427
+ """
428
+ self.model.save_pretrained(output_path)
429
+ self.tokenizer.save_pretrained(output_path)
430
+ self.processor.save_pretrained(output_path)
431
+
432
+ @staticmethod
433
+ def load(input_path: str) -> CLIPQwenVLWrapper:
434
+ """
435
+ Load model from the specified path.
436
+
437
+ Args:
438
+ input_path: Directory path containing the saved model
439
+
440
+ Returns:
441
+ Loaded CLIPQwenVLWrapper instance
442
+ """
443
+ return CLIPQwenVLWrapper(model_name_or_path=input_path)
modules.json ADDED
@@ -0,0 +1,8 @@
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "modeling_clip.CLIPQwenVLWrapper"
7
+ }
8
+ ]
preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "Qwen2VLImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "max_pixels": 12845056,
18
+ "merge_size": 2,
19
+ "min_pixels": 3136,
20
+ "patch_size": 14,
21
+ "processor_class": "Qwen2_5_VLProcessor",
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "longest_edge": 12845056,
26
+ "shortest_edge": 3136
27
+ },
28
+ "temporal_patch_size": 2
29
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:008293028e1a9d9a1038d9b63d989a2319797dfeaa03f171093a57b33a3a8277
3
+ size 1831879
tokenizer_config.json ADDED
@@ -0,0 +1,210 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "extra_special_tokens": {},
203
+ "model_max_length": 131072,
204
+ "pad_token": "<|endoftext|>",
205
+ "processor_class": "Qwen2_5_VLProcessor",
206
+ "split_special_tokens": false,
207
+ "tokenizer_class": "Qwen2Tokenizer",
208
+ "unk_token": null,
209
+ "use_fast": false
210
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff