frozenc committed on
Commit
4894b7d
·
verified ·
1 Parent(s): 785e185

update usage

Browse files
README.md CHANGED
@@ -32,33 +32,35 @@ The model is trained using a multi-stage strategy that combines large-scale text
32
 
33
  **Requirements**
34
  ```
 
35
  transformers>=4.57.0
36
  qwen-vl-utils>=0.0.14
37
  torch==2.8.0
38
- colpali_engine==0.3.12
39
  ```
40
 
41
  **Basic Usage**
42
 
43
  ```python
 
44
  from PIL import Image
45
  from scripts.ops_colqwen3_embedder import OpsColQwen3Embedder
46
 
47
  images = [Image.new("RGB", (32, 32), color="white"), Image.new("RGB", (16, 16), color="black")]
48
-
49
  queries = ["Is attention really all you need?", "What is the amount of bananas farmed in Salvador?"]
50
 
51
- encoder = OpsColQwen3Embedder(
52
  model_name="OpenSearch-AI/Ops-Colqwen3-4B",
53
- dims=320,
54
  dtype=torch.float16,
55
  attn_implementation="flash_attention_2",
56
  )
57
 
58
- query_embeddings = encoder.encode_texts(queries, batch_size=2)
59
- image_embeddings = encoder.encode_images(images, batch_size=2)
 
 
 
60
 
61
- scores = encoder.compute_scores(query_embeddings, image_embeddings)
62
  print(f"Scores:\n{scores}")
63
  ```
64
 
 
32
 
33
  **Requirements**
34
  ```
35
+ pillow
36
  transformers>=4.57.0
37
  qwen-vl-utils>=0.0.14
38
  torch==2.8.0
 
39
  ```
40
 
41
  **Basic Usage**
42
 
43
  ```python
44
+ import torch
45
  from PIL import Image
46
  from scripts.ops_colqwen3_embedder import OpsColQwen3Embedder
47
 
48
  images = [Image.new("RGB", (32, 32), color="white"), Image.new("RGB", (16, 16), color="black")]
 
49
  queries = ["Is attention really all you need?", "What is the amount of bananas farmed in Salvador?"]
50
 
51
+ embedder = OpsColQwen3Embedder(
52
  model_name="OpenSearch-AI/Ops-Colqwen3-4B",
53
+ dims=2560,
54
  dtype=torch.float16,
55
  attn_implementation="flash_attention_2",
56
  )
57
 
58
+ query_embeddings = embedder.encode_queries(queries)
59
+ image_embeddings = embedder.encode_images(images)
60
+ print(query_embeddings[0].shape, image_embeddings[0].shape) # (23, 2560) (18, 2560)
61
+
62
+ scores = embedder.compute_scores(query_embeddings, image_embeddings)
63
 
 
64
  print(f"Scores:\n{scores}")
65
  ```
66
 
__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from .configuration_ops_colqwen3 import OpsColQwen3Config
2
+ from .modeling_ops_colqwen3 import OpsColQwen3Model, OpsColQwen3PreTrainedModel
3
+ from .processing_ops_colqwen3 import OpsColQwen3Processor
4
+
5
+ __all__ = [
6
+ "OpsColQwen3Config",
7
+ "OpsColQwen3Model",
8
+ "OpsColQwen3PreTrainedModel",
9
+ "OpsColQwen3Processor",
10
+ ]
config.json CHANGED
@@ -1,10 +1,17 @@
1
  {
2
  "architectures": [
3
- "ColQwen3VLModel"
4
  ],
 
 
 
 
 
 
 
5
  "dtype": "float32",
6
  "image_token_id": 151655,
7
- "model_type": "qwen3_vl",
8
  "text_config": {
9
  "attention_bias": false,
10
  "attention_dropout": 0.0,
 
1
  {
2
  "architectures": [
3
+ "OpsColQwen3Model"
4
  ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_ops_colqwen3.OpsColQwen3Config",
7
+ "AutoModel": "modeling_ops_colqwen3.OpsColQwen3Model",
8
+ "AutoModelForVision2Seq": "modeling_ops_colqwen3.OpsColQwen3Model",
9
+ "AutoProcessor": "processing_ops_colqwen3.OpsColQwen3Processor"
10
+ },
11
+ "dims": 2560,
12
  "dtype": "float32",
13
  "image_token_id": 151655,
14
+ "model_type": "ops_colqwen3",
15
  "text_config": {
16
  "attention_bias": false,
17
  "attention_dropout": 0.0,
configuration_ops_colqwen3.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from transformers import Qwen3VLConfig


class OpsColQwen3Config(Qwen3VLConfig):
    """Configuration for the OpsColQwen3 retrieval model.

    Extends ``Qwen3VLConfig`` with two retrieval-specific fields:

    * ``dims`` — dimensionality of the per-token output embeddings
      (the model truncates its projection to this width when it is
      smaller than the text hidden size).
    * ``mask_non_image_embeddings`` — when True, embeddings for image
      inputs keep only image-token positions.
    """

    model_type = "ops_colqwen3"

    def __init__(
        self,
        dims: int = 2560,
        mask_non_image_embeddings: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Output embedding dimensionality used for truncation in the model.
        self.dims = dims
        # Zero out non-image token embeddings for image batches when True.
        self.mask_non_image_embeddings = mask_non_image_embeddings
modeling_ops_colqwen3.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ import torch
3
+ from torch import nn
4
+ from transformers import PreTrainedModel
5
+ from transformers.models.qwen3_vl import Qwen3VLModel
6
+ from transformers.utils import logging
7
+
8
+ from .configuration_ops_colqwen3 import OpsColQwen3Config
9
+
10
+ logger = logging.get_logger(__name__)
11
+
12
+
13
class OpsColQwen3PreTrainedModel(PreTrainedModel):
    """Base class wiring OpsColQwen3 into the transformers loading machinery.

    Declares the config class, the weight-name prefix, and the backend
    capabilities (FlashAttention-2, SDPA, cache classes) supported by the
    underlying Qwen3-VL stack.
    """

    config_class = OpsColQwen3Config
    base_model_prefix = "ops_colqwen3"
    supports_gradient_checkpointing = True
    # Modules that `device_map="auto"` must keep on a single device.
    _no_split_modules = ["Qwen3VLVisionBlock", "Qwen3DecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
22
+
23
+
24
class OpsColQwen3Model(OpsColQwen3PreTrainedModel):
    """ColBERT-style multi-vector embedding model built on Qwen3-VL.

    ``forward`` returns one embedding per input token: last hidden states
    are passed through ``custom_text_proj``, optionally truncated to
    ``self.dims`` columns, L2-normalized, and zeroed at padding positions.
    """

    # Remap plain Qwen3-VL checkpoints into the `qwen3vl.` sub-module namespace.
    _checkpoint_conversion_mapping = {
        r"^language_model": r"qwen3vl.language_model",
        r"^visual": "qwen3vl.visual",
    }

    def __init__(self, config: OpsColQwen3Config):
        super().__init__(config)
        self.config = config

        self.qwen3vl = Qwen3VLModel(config)
        # Fix: honour a truncation dimension declared on the config.
        # Previously this was hard-wired to `text_config.hidden_size`, so a
        # config-declared `dims` (e.g. 320) was silently ignored unless the
        # caller also passed `dims=` to `from_pretrained`.
        self.dims = getattr(config, "dims", None) or config.text_config.hidden_size
        # The projection stays full-width (hidden -> hidden) so checkpoint
        # weight shapes are unchanged; truncation happens in forward().
        self.custom_text_proj = nn.Linear(
            config.text_config.hidden_size, config.text_config.hidden_size
        )

        self.mask_non_image_embeddings = config.mask_non_image_embeddings
        self.post_init()

    @classmethod
    def from_pretrained(cls, *args, config: Optional[OpsColQwen3Config] = None, **kwargs):
        """Load weights, applying the checkpoint key remapping and an
        optional ``dims=`` override (kwarg wins over the config value)."""
        key_mapping = kwargs.pop("key_mapping", None)
        if key_mapping is None:
            key_mapping = getattr(cls, "_checkpoint_conversion_mapping", None)
        dims = None
        if 'dims' in kwargs:
            dims = kwargs.pop('dims')
        elif config is not None:
            dims = config.dims

        model = super().from_pretrained(*args, config=config, **kwargs, key_mapping=key_mapping)
        if dims is not None:
            model.dims = dims
        return model

    def forward(self, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, pixel_values: Optional[torch.Tensor] = None, image_grid_thw: Optional[torch.Tensor] = None, **kwargs) -> torch.Tensor:
        """Return normalized per-token embeddings of shape (batch, seq, dims)."""
        has_pixel_values = pixel_values is not None

        if has_pixel_values:
            if image_grid_thw is None:
                raise ValueError("`image_grid_thw` must be provided when `pixel_values` is passed.")
            if not torch.is_tensor(image_grid_thw):
                image_grid_thw = torch.as_tensor(image_grid_thw, device=pixel_values.device)

            # The processor right-pads per-image patch sequences; strip that
            # padding and re-concatenate into the flat layout Qwen3-VL expects.
            offsets = image_grid_thw.prod(dim=1)
            unpadded = [pixel_sequence[: int(offset.item())] for pixel_sequence, offset in zip(pixel_values, offsets)]
            pixel_values = torch.cat(unpadded, dim=0) if unpadded else None

        outputs = self.qwen3vl(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            image_grid_thw=image_grid_thw,
            use_cache=False,
            output_hidden_states=True,
            return_dict=True,
        )

        last_hidden_states = outputs.last_hidden_state
        proj = self.custom_text_proj(last_hidden_states)

        # Matryoshka-style truncation to the configured embedding width.
        if self.dims < self.config.text_config.hidden_size:
            proj = proj[..., : self.dims]

        # L2-normalize each token embedding, then zero padded positions.
        proj = proj / proj.norm(dim=-1, keepdim=True)

        if attention_mask is not None:
            proj = proj * attention_mask.unsqueeze(-1)

        # Optionally keep only image-token embeddings for image batches.
        if has_pixel_values and self.mask_non_image_embeddings and input_ids is not None:
            image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
            proj = proj * image_mask

        return proj

    @property
    def patch_size(self) -> int:
        return self.qwen3vl.visual.config.patch_size

    @property
    def spatial_merge_size(self) -> int:
        return self.qwen3vl.visual.config.spatial_merge_size
preprocessor_config.json CHANGED
@@ -1,4 +1,7 @@
1
  {
 
 
 
2
  "crop_size": null,
3
  "data_format": "channels_first",
4
  "default_to_square": true,
 
1
  {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_ops_colqwen3.OpsColQwen3Processor"
4
+ },
5
  "crop_size": null,
6
  "data_format": "channels_first",
7
  "default_to_square": true,
processing_ops_colqwen3.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ from typing import List, Optional, Union
4
+
5
+ import torch
6
+ from PIL import Image
7
+ from transformers import BatchEncoding, BatchFeature
8
+ from transformers.models.qwen3_vl import Qwen3VLProcessor
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def get_torch_device(device: str = "auto") -> str:
    """Resolve the torch device string to use.

    With the default ``"auto"``, picks the first available backend in the
    order "cuda:0", "mps" (Apple Silicon), then "cpu", and logs the choice.
    Any explicit device string is returned unchanged.
    """
    if device != "auto":
        return device

    if torch.cuda.is_available():
        resolved = "cuda:0"
    elif torch.backends.mps.is_available():  # Apple Silicon fallback
        resolved = "mps"
    else:
        resolved = "cpu"
    logger.info(f"Using device: {resolved}")

    return resolved
33
+
34
+
35
class OpsColQwen3Processor(Qwen3VLProcessor):
    """
    Processor for the OpsColQwen3 model.

    Wraps the standard Qwen3-VL processor and adds retrieval helpers:
    `process_images` / `process_queries` build model inputs, and
    `score_multi_vector` computes late-interaction (MaxSim) scores.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    # Prompt templates / special tokens used when building model inputs.
    query_prefix: str = "Query: "
    visual_prompt_prefix: str = "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|im_start|>assistant\n<|endoftext|>"
    query_augmentation_token: str = "<|endoftext|>"
    image_token: str = "<|image_pad|>"

    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
        """
        Initialize the processor.

        Args:
            image_processor: Image processor instance
            tokenizer: Tokenizer instance
            chat_template: Optional chat template
            **kwargs: Additional arguments
        """
        super().__init__(image_processor=image_processor, tokenizer=tokenizer, chat_template=chat_template, **kwargs)

        # Left padding keeps real tokens right-aligned across a batch.
        if self.tokenizer is not None:
            self.tokenizer.padding_side = "left"

    def process_images(self, images: List[Image.Image], return_tensors: str = "pt", **kwargs) -> Union[BatchFeature, BatchEncoding]:
        """
        Process a batch of PIL images for the model.
        """
        images = [image.convert("RGB") for image in images]

        batch_doc = self(text=[self.visual_prompt_prefix] * len(images), images=images, padding="longest", return_tensors=return_tensors, **kwargs)

        if batch_doc["pixel_values"].numel() == 0:
            return batch_doc

        # Split the flat patch sequence per image (t*h*w patches each) and
        # right-pad so the batch can be stacked into one tensor; the model
        # undoes this padding in forward().
        offsets = batch_doc["image_grid_thw"].prod(dim=1)
        pixel_values = list(torch.split(batch_doc["pixel_values"], offsets.tolist()))
        batch_doc["pixel_values"] = torch.nn.utils.rnn.pad_sequence(pixel_values, batch_first=True)

        return batch_doc

    def process_queries(self, queries: List[str], return_tensors: str = "pt", **kwargs) -> Union[BatchFeature, BatchEncoding]:
        """
        Process a list of text queries.
        """
        # Prefix each query and append 10 augmentation tokens as in ColPali.
        processed_queries = [self.query_prefix + q + self.query_augmentation_token * 10 for q in queries]
        return self(text=processed_queries, return_tensors=return_tensors, padding="longest", **kwargs)

    @staticmethod
    def score_multi_vector(
        qs: Union[torch.Tensor, List[torch.Tensor]],
        ps: Union[torch.Tensor, List[torch.Tensor]],
        batch_size: int = 128,
        device: Optional[Union[str, torch.device]] = None,
    ) -> torch.Tensor:
        """
        Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
        query embeddings (`qs`) and passage embeddings (`ps`). For ColPali, a passage is the
        image of a document page.

        Because the embedding tensors are multi-vector and can thus have different shapes, they
        should be fed as:
        (1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim)
        (2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually
            obtained by padding the list of tensors.

        Args:
            qs (`Union[torch.Tensor, List[torch.Tensor]`): Query embeddings.
            ps (`Union[torch.Tensor, List[torch.Tensor]`): Passage embeddings.
            batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores.
            device (`Union[str, torch.device]`, *optional*): Device to use for computation. If not
                provided, uses `get_torch_device("auto")`.

        Returns:
            `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score
            tensor is saved on the "cpu" device.
        """
        device = device or get_torch_device("auto")

        if len(qs) == 0:
            raise ValueError("No queries provided")
        if len(ps) == 0:
            raise ValueError("No passages provided")

        scores_list: List[torch.Tensor] = []

        # Double-batched MaxSim: for each (query, passage) pair, take the max
        # similarity over passage tokens, then sum over query tokens.
        for i in range(0, len(qs), batch_size):
            scores_batch = []
            qs_batch = torch.nn.utils.rnn.pad_sequence(qs[i : i + batch_size], batch_first=True, padding_value=0).to(device)
            for j in range(0, len(ps), batch_size):
                ps_batch = torch.nn.utils.rnn.pad_sequence(ps[j : j + batch_size], batch_first=True, padding_value=0).to(device)
                # einsum -> (b queries, c passages, n q-tokens, s p-tokens)
                scores_batch.append(torch.einsum("bnd,csd->bcns", qs_batch, ps_batch).max(dim=3)[0].sum(dim=2))
            scores_batch = torch.cat(scores_batch, dim=1).cpu()
            scores_list.append(scores_batch)

        scores = torch.cat(scores_list, dim=0)
        assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"

        scores = scores.to(torch.float32)
        return scores
scripts/ops_colqwen3_embedder.py CHANGED
@@ -1,338 +1,167 @@
1
- from typing import List, Union, Optional, Tuple
2
  import torch
3
- from torch import nn
4
  from PIL import Image
5
- from tqdm.auto import tqdm
6
- from transformers.models.qwen3_vl import Qwen3VLConfig, Qwen3VLModel, Qwen3VLProcessor
7
- from colpali_engine.utils.processing_utils import BaseVisualRetrieverProcessor
8
- from transformers import BatchEncoding, BatchFeature
9
- from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
10
-
11
-
12
- class OpsColQwen3(Qwen3VLModel):
13
- """
14
- OpsColQwen3 model implementation for multi-vector document retrieval.
15
- """
16
-
17
- def __init__(self, config: Qwen3VLConfig, dims: int = 320, mask_non_image_embeddings: bool = False):
18
- super().__init__(config=config)
19
- self.custom_text_proj = nn.Linear(self.config.text_config.hidden_size, self.config.text_config.hidden_size)
20
- self.dims = dims
21
- self.padding_side = "left"
22
- self.mask_non_image_embeddings = mask_non_image_embeddings
23
- self.post_init()
24
-
25
- @classmethod
26
- def from_pretrained(cls, *args, **kwargs):
27
- key_mapping = kwargs.pop("key_mapping", None)
28
- if key_mapping is None:
29
- key_mapping = {
30
- r"^base_model\.model\.(.*)": r"\1",
31
- r"^model\.(.*)": r"\1",
32
- }
33
-
34
- return super().from_pretrained(*args, **kwargs, key_mapping=key_mapping)
35
-
36
- def forward(self, *args, **kwargs) -> torch.Tensor:
37
- attention_mask = kwargs.get("attention_mask")
38
- has_pixel_values = "pixel_values" in kwargs and kwargs["pixel_values"] is not None
39
-
40
- if has_pixel_values:
41
- image_grid_thw = kwargs.get("image_grid_thw")
42
- if image_grid_thw is None:
43
- raise ValueError("`image_grid_thw` must be provided when `pixel_values` is passed.")
44
-
45
- if not torch.is_tensor(image_grid_thw):
46
- image_grid_thw = torch.as_tensor(image_grid_thw, device=kwargs["pixel_values"].device)
47
-
48
- offsets = image_grid_thw.prod(dim=1)
49
- unpadded = [pixel_sequence[: int(offset.item())] for pixel_sequence, offset in zip(kwargs["pixel_values"], offsets)]
50
-
51
- if unpadded:
52
- kwargs["pixel_values"] = torch.cat(unpadded, dim=0)
53
- else:
54
- kwargs["pixel_values"] = None
55
-
56
- kwargs.pop("return_dict", True)
57
- kwargs.pop("output_hidden_states", None)
58
- kwargs.pop("use_cache", None)
59
-
60
- last_hidden_states = super().forward(*args, **kwargs, use_cache=False, output_hidden_states=True, return_dict=True).last_hidden_state
61
-
62
- proj = self.custom_text_proj(last_hidden_states)
63
- if self.dims < self.config.text_config.hidden_size:
64
- proj = proj[..., : self.dims]
65
- proj = proj / proj.norm(dim=-1, keepdim=True)
66
-
67
- if attention_mask is not None:
68
- proj = proj * attention_mask.unsqueeze(-1)
69
-
70
- if has_pixel_values and self.mask_non_image_embeddings and kwargs.get("input_ids") is not None:
71
- image_mask = (kwargs["input_ids"] == self.config.image_token_id).unsqueeze(-1)
72
- proj = proj * image_mask
73
-
74
- return proj
75
-
76
- @property
77
- def patch_size(self) -> int:
78
- return self.visual.config.patch_size
79
-
80
- @property
81
- def spatial_merge_size(self) -> int:
82
- return self.visual.config.spatial_merge_size
83
-
84
- @property
85
- def temporal_patch_size(self) -> int:
86
- return getattr(self.visual.config, "temporal_patch_size", 1)
87
-
88
-
89
- class OpsColQwen3Processor(BaseVisualRetrieverProcessor, Qwen3VLProcessor):
90
- """
91
- Processor for OpsColQwen3.
92
- """
93
-
94
- query_prefix: str = "Query: "
95
- visual_prompt_prefix: str = "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|im_start|>assistant\n<|endoftext|>"
96
- query_augmentation_token: str = "<|endoftext|>"
97
- image_token: str = "<|image_pad|>"
98
-
99
- def __init__(self, *args, **kwargs) -> None:
100
- super().__init__(*args, **kwargs)
101
- self.tokenizer.padding_side = "left"
102
-
103
- @classmethod
104
- def from_pretrained(cls, *args, device_map: Optional[str] = None, **kwargs):
105
- instance = super().from_pretrained(*args, device_map=device_map, **kwargs)
106
-
107
- if "max_num_visual_tokens" in kwargs:
108
- instance.image_processor.max_pixels = kwargs["max_num_visual_tokens"] * 32 * 32
109
- instance.image_processor.size["longest_edge"] = instance.image_processor.max_pixels
110
-
111
- return instance
112
-
113
- def process_images(self, images: List[Image.Image]) -> Union[BatchFeature, BatchEncoding]:
114
- """Process a batch of PIL images."""
115
- images = [image.convert("RGB") for image in images]
116
-
117
- batch_doc = self.__call__(
118
- text=[self.visual_prompt_prefix] * len(images),
119
- images=images,
120
- padding="longest",
121
- return_tensors="pt",
122
- )
123
-
124
- if batch_doc["pixel_values"].numel() == 0:
125
- return batch_doc
126
-
127
- offsets = batch_doc["image_grid_thw"].prod(dim=1)
128
- pixel_values = list(torch.split(batch_doc["pixel_values"], offsets.tolist()))
129
- batch_doc["pixel_values"] = torch.nn.utils.rnn.pad_sequence(pixel_values, batch_first=True)
130
-
131
- return batch_doc
132
-
133
- def process_texts(self, texts: List[str]) -> Union[BatchFeature, BatchEncoding]:
134
- """Process a list of texts."""
135
- return self(text=texts, return_tensors="pt", padding="longest")
136
-
137
- def score(
138
- self,
139
- qs: Union[torch.Tensor, List[torch.Tensor]],
140
- ps: Union[torch.Tensor, List[torch.Tensor]],
141
- device: Optional[Union[str, torch.device]] = None,
142
- **kwargs,
143
- ) -> torch.Tensor:
144
- """Compute the MaxSim score (ColBERT-like) for query and passage embeddings."""
145
- return self.score_multi_vector(qs, ps, device=device, **kwargs)
146
-
147
- def get_n_patches(
148
- self,
149
- image_size: Tuple[int, int],
150
- spatial_merge_size: int,
151
- ) -> Tuple[int, int]:
152
- """
153
- Compute the number of patches (n_patches_x, n_patches_y) for an image.
154
- """
155
- patch_size = self.image_processor.patch_size
156
- merge_size = getattr(self.image_processor, "merge_size", 1)
157
-
158
- height_new, width_new = smart_resize(
159
- width=image_size[0],
160
- height=image_size[1],
161
- factor=patch_size * merge_size,
162
- min_pixels=self.image_processor.size["shortest_edge"],
163
- max_pixels=self.image_processor.size["longest_edge"],
164
- )
165
-
166
- n_patches_x = width_new // patch_size // spatial_merge_size
167
- n_patches_y = height_new // patch_size // spatial_merge_size
168
-
169
- return n_patches_x, n_patches_y
170
-
171
- def get_image_mask(self, batch_images: BatchFeature) -> torch.Tensor:
172
- """Return a boolean tensor identifying image tokens."""
173
- return batch_images.input_ids == self.image_token_id
174
 
175
 
176
  class OpsColQwen3Embedder:
177
  """
178
- Simple embedder wrapper for OpsColQwen3 model.
179
-
180
- Args:
181
- model_name: HuggingFace model name or local path
182
- dims: Embedding dimension after projection
183
- device: Device to run the model on
184
- attn_implementation: Attention implementation
185
  """
186
 
187
  def __init__(
188
  self,
189
- model_name: str = "OpenSearch-AI/Ops-ColQwen3-4B",
190
  dims: int = 2560,
191
  device: Optional[str] = None,
192
- attn_implementation: Optional[str] = None,
193
- **kwargs,
194
  ):
195
- self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
196
- self.dims = dims
197
 
198
- if attn_implementation is None:
199
- try:
200
- from transformers.utils.import_utils import is_flash_attn_2_available
 
 
 
201
 
202
- attn_implementation = "flash_attention_2" if is_flash_attn_2_available() else None
203
- except ImportError:
204
- attn_implementation = None
 
 
 
 
 
 
 
205
 
206
- load_kwargs = {"dims": dims, "device_map": self.device, **kwargs}
207
- if attn_implementation:
208
- load_kwargs["attn_implementation"] = attn_implementation
209
 
210
- self.model = OpsColQwen3.from_pretrained(model_name, **load_kwargs)
 
 
 
 
 
 
 
211
  self.model.eval()
212
 
213
- self.processor = OpsColQwen3Processor.from_pretrained(model_name)
 
 
 
 
 
 
 
214
 
215
- def encode_texts(
216
  self,
217
- texts: List[str],
218
- batch_size: int = 32,
219
- show_progress: bool = False,
220
  ) -> List[torch.Tensor]:
221
  """
222
  Encode a list of text queries.
223
 
224
  Args:
225
- texts: List of text strings to encode
226
- batch_size: Batch size for processing
227
- show_progress: Whether to show progress bar
228
 
229
  Returns:
230
- List of embedding tensors
231
  """
232
- all_embeddings = []
233
-
234
- iterator = range(0, len(texts), batch_size)
235
- if show_progress:
236
- iterator = tqdm(iterator, desc="Encoding texts")
237
 
238
  with torch.no_grad():
239
- for i in iterator:
240
- batch_texts = texts[i : i + batch_size]
241
 
242
- batch_texts = [self.processor.query_prefix + t + self.processor.query_augmentation_token * 10 for t in batch_texts]
243
-
244
- inputs = self.processor.process_texts(batch_texts)
245
- inputs = {k: v.to(self.device) for k, v in inputs.items()}
246
-
247
- embeddings = self.model(**inputs)
248
- all_embeddings.extend(embeddings.cpu().to(torch.float32))
249
-
250
- return all_embeddings
251
 
252
  def encode_images(
253
  self,
254
- images: List[Union[str, Image.Image]],
255
- batch_size: int = 32,
256
- show_progress: bool = False,
257
  ) -> List[torch.Tensor]:
258
  """
259
  Encode a list of images.
260
 
261
  Args:
262
  images: List of image paths or PIL Images
263
- batch_size: Batch size for processing
264
- show_progress: Whether to show progress bar
265
 
266
  Returns:
267
- List of embedding tensors
268
  """
269
- image_list = []
270
  for img in images:
271
  if isinstance(img, str):
272
- image_list.append(Image.open(img).convert("RGB"))
273
  elif isinstance(img, Image.Image):
274
- image_list.append(img.convert("RGB"))
275
  else:
276
  raise ValueError(f"Unsupported image type: {type(img)}")
277
 
278
- all_embeddings = []
279
-
280
- iterator = range(0, len(image_list), batch_size)
281
- if show_progress:
282
- iterator = tqdm(iterator, desc="Encoding images")
283
 
284
  with torch.no_grad():
285
- for i in iterator:
286
- batch_images = image_list[i : i + batch_size]
287
-
288
- inputs = self.processor.process_images(batch_images)
289
- inputs = {k: v.to(self.device) for k, v in inputs.items()}
290
 
291
- embeddings = self.model(**inputs)
292
- all_embeddings.extend(embeddings.cpu().to(torch.float32))
293
-
294
- return all_embeddings
295
 
296
  def compute_scores(
297
  self,
298
  query_embeddings: List[torch.Tensor],
299
- image_embeddings: List[torch.Tensor],
300
- batch_size: int = 128,
301
  ) -> torch.Tensor:
302
  """
303
- Compute relevance scores between queries and images using MaxSim.
304
 
305
  Args:
306
- query_embeddings: List of query embedding tensors
307
- image_embeddings: List of image embedding tensors
308
- batch_size: Batch size for score computation
309
 
310
  Returns:
311
- Score matrix of shape (num_queries, num_images)
312
  """
313
- return self.processor.score_multi_vector(
314
- query_embeddings,
315
- image_embeddings,
316
- batch_size=batch_size,
317
- device=self.device,
318
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
320
 
321
  # Example usage
322
  if __name__ == "__main__":
323
  images = [Image.new("RGB", (32, 32), color="white"), Image.new("RGB", (16, 16), color="black")]
324
-
325
  queries = ["Is attention really all you need?", "What is the amount of bananas farmed in Salvador?"]
326
 
327
- encoder = OpsColQwen3Embedder(
328
  model_name="OpenSearch-AI/Ops-Colqwen3-4B",
329
- dims=320,
330
  dtype=torch.float16,
331
  attn_implementation="flash_attention_2",
332
  )
333
 
334
- query_embeddings = encoder.encode_texts(queries, batch_size=2)
335
- image_embeddings = encoder.encode_images(images, batch_size=2)
 
 
 
336
 
337
- scores = encoder.compute_scores(query_embeddings, image_embeddings)
338
- print(f"Scores:\n{scores}")
 
 
1
  import torch
 
2
  from PIL import Image
3
+ from transformers import AutoModel, AutoProcessor
4
+ from typing import List, Union, Optional
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
 
7
class OpsColQwen3Embedder:
    """
    Embedder for OpsColQwen3-4B model.

    Thin convenience wrapper: loads the model/processor via the Auto classes
    (with `trust_remote_code`) and exposes query/image encoding plus
    late-interaction scoring.
    """

    def __init__(
        self,
        model_name: str = "OpenSearch-AI/Ops-Colqwen3-4B",
        dims: int = 2560,
        device: Optional[str] = None,
        **kwargs
    ):
        """
        Initialize the embedder.

        Args:
            model_name: Model path or hub name
            dims: Embedding dimensions
            device: Device to use for inference ('mps', 'cuda', or 'cpu')
            **kwargs: Additional arguments passed to from_pretrained
        """
        # Device resolution: explicit device_map kwarg wins, then the
        # `device` argument, then CUDA -> MPS -> CPU auto-detection.
        target = kwargs.pop('device_map', None)
        if not target:
            if device:
                target = device
            elif torch.cuda.is_available():
                target = "cuda"
            elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
                target = "mps"  # Apple Silicon
            else:
                target = "cpu"

        # Half precision on accelerators, float32 on CPU, unless overridden.
        model_dtype = kwargs.pop('dtype', torch.float16 if target != "cpu" else torch.float32)

        self.model = AutoModel.from_pretrained(
            model_name,
            dims=dims,
            trust_remote_code=True,
            dtype=model_dtype,
            device_map=target,
            **kwargs
        )
        self.model.eval()

        self.processor = AutoProcessor.from_pretrained(
            model_name,
            trust_remote_code=True,
            **kwargs
        )

        self.device = target
        self.dims = dims

    def encode_queries(
        self,
        queries: List[str]
    ) -> List[torch.Tensor]:
        """
        Encode a list of text queries.

        Args:
            queries: List of query texts

        Returns:
            List of query embeddings
        """
        batch = self.processor.process_queries(queries)
        batch = {name: tensor.to(self.device) for name, tensor in batch.items()}

        with torch.no_grad():
            embeddings = self.model(**batch)

        return [embedding.cpu() for embedding in embeddings]

    def encode_images(
        self,
        images: List[Union[str, Image.Image]]
    ) -> List[torch.Tensor]:
        """
        Encode a list of images.

        Args:
            images: List of image paths or PIL Images

        Returns:
            List of image embeddings
        """
        # Accept both file paths and already-loaded PIL images.
        loaded = []
        for img in images:
            if isinstance(img, str):
                loaded.append(Image.open(img).convert("RGB"))
            elif isinstance(img, Image.Image):
                loaded.append(img)
            else:
                raise ValueError(f"Unsupported image type: {type(img)}")

        batch = self.processor.process_images(loaded)
        batch = {name: tensor.to(self.device) for name, tensor in batch.items()}

        with torch.no_grad():
            embeddings = self.model(**batch)

        return [embedding.cpu() for embedding in embeddings]

    def compute_scores(
        self,
        query_embeddings: List[torch.Tensor],
        image_embeddings: List[torch.Tensor]
    ) -> torch.Tensor:
        """
        Compute similarity scores between queries and images.

        Args:
            query_embeddings: List of query embeddings
            image_embeddings: List of image embeddings

        Returns:
            Similarity scores matrix
        """
        return self.processor.score_multi_vector(query_embeddings, image_embeddings)

    def encode_and_score(
        self,
        queries: List[str],
        images: List[Union[str, Image.Image]]
    ):
        """
        Convenience method to encode queries and images and compute scores.

        Args:
            queries: List of query texts
            images: List of images (paths or PIL objects)

        Returns:
            Similarity scores between queries and images
        """
        return self.compute_scores(self.encode_queries(queries), self.encode_images(images))
147
 
148
 
149
# Example usage
if __name__ == "__main__":
    # Tiny synthetic images so the demo runs without any files on disk.
    images = [Image.new("RGB", (32, 32), color="white"), Image.new("RGB", (16, 16), color="black")]
    queries = ["Is attention really all you need?", "What is the amount of bananas farmed in Salvador?"]

    embedder = OpsColQwen3Embedder(
        model_name="OpenSearch-AI/Ops-Colqwen3-4B",
        dims=2560,
        dtype=torch.float16,
        attn_implementation="flash_attention_2",  # requires flash-attn; omit on unsupported hardware
    )

    # Each result is a list of (num_tokens, dims) tensors, one per input.
    query_embeddings = embedder.encode_queries(queries)
    image_embeddings = embedder.encode_images(images)
    print(query_embeddings[0].shape, image_embeddings[0].shape) # (23, 2560) (18, 2560)

    scores = embedder.compute_scores(query_embeddings, image_embeddings)

    print(f"Scores:\n{scores}")