"""Argus: Region-Aware Query-Conditioned Mixture of Experts for Visual Document Retrieval.

Self-contained processor for Argus-Colqwen3.5-9B. Wraps the Qwen3-VL processor
(image processor + Qwen2 tokenizer + optional video processor) and adds ColPali-
style ``process_images`` / ``process_texts`` / ``score_multi_vector`` helpers.
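
Minimal usage sketch (the hub id below is illustrative, not a confirmed repo)::

    from PIL import Image

    processor = ArgusProcessor.from_pretrained("org/Argus-Colqwen3.5-9B")
    batch_images = processor.process_images([Image.open("page.png")])
    batch_queries = processor.process_queries(["total revenue 2023"])
    # Run both batches through the retriever to get multi-vector embeddings,
    # then: scores = processor.score_multi_vector(query_embs, doc_embs)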
"""
from __future__ import annotations

from pathlib import Path
from typing import ClassVar, List, Optional, Tuple, Union

import torch
from PIL import Image
from transformers import BatchEncoding, BatchFeature
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
from transformers.models.qwen3_vl import Qwen3VLProcessor


class ArgusProcessor(Qwen3VLProcessor):
    """Processor for Argus-Colqwen3.5-9B.

    Subclasses ``Qwen3VLProcessor`` (the Qwen3.5-9B hub repo ships that
    processor class even though the backbone LLM is Qwen3.5, not Qwen3-VL).
    Adds:

    - ``process_images``: batch-encode PIL images into the exact dict the
      retriever forward expects (``pixel_values``, ``image_grid_thw``,
      ``input_ids``, ``attention_mask``).
    - ``process_texts``: batch-encode query strings.
    - ``score`` / ``score_multi_vector``: MaxSim scoring helper.
    - ``max_num_visual_tokens`` knob: caps the total pixel budget per image
      (written to ``max_pixels`` and the ``longest_edge`` size key) so large
      document pages don't blow up the vision encoder.
    """

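    # Chat-format prompt wrapped around each document image; the single
    # <|image_pad|> placeholder is expanded by the processor call into one
    # placeholder token per post-merge visual patch.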
    visual_prompt_prefix: ClassVar[str] = (
        "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|endoftext|>"
    )
    query_augmentation_token: ClassVar[str] = "<|endoftext|>"
    query_prefix: ClassVar[str] = ""
    image_token: ClassVar[str] = "<|image_pad|>"
    # Number of <|endoftext|> tokens appended to every query — matches the
    # training-time collator (``colpali_novel/data/layout_collator.py``).
    # Removing or changing this number measurably hurts retrieval scores.
    n_query_augmentation_tokens: ClassVar[int] = 10

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        video_processor=None,
        chat_template=None,
        **kwargs,
    ):
        # Explicit signature matters for ``ProcessorMixin``: it inspects
        # __init__.__code__ to decide which modality attributes to set. A
        # *args,**kwargs signature silently drops tokenizer/image_processor.
        super().__init__(
            image_processor=image_processor,
            tokenizer=tokenizer,
            video_processor=video_processor,
            chat_template=chat_template,
            **kwargs,
        )
        if getattr(self, "tokenizer", None) is not None:
            self.tokenizer.padding_side = "left"

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        *args,
        device_map: Optional[str] = None,
        max_num_visual_tokens: Optional[int] = None,
        **kwargs,
    ):
        """Load the processor from a local folder or HF repo id.

        The Qwen3.5-9B hub repo declares ``processor_class=Qwen3VLProcessor``
        but ``tokenizer_class=Qwen2Tokenizer``. In that case the stock
        ``Qwen3VLProcessor.from_pretrained`` returns ``tokenizer=None`` and
        then crashes on ``tokenizer.convert_tokens_to_ids(self.image_token)``.
        We load tokenizer + image processor via the Auto* registry
        explicitly so both are real objects before ``__init__`` runs.
        """
        from transformers import AutoImageProcessor, AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
        image_processor = AutoImageProcessor.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)

        video_processor = None
        try:
            from transformers import AutoVideoProcessor
            video_processor = AutoVideoProcessor.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
        except Exception:  # noqa: BLE001 — video processing is optional
            video_processor = None

        chat_template = None
        try:
            candidate = Path(str(pretrained_model_name_or_path)) / "chat_template.jinja"
            if candidate.is_file():
                chat_template = candidate.read_text()
        except Exception:  # noqa: BLE001
            chat_template = None

        instance = cls(
            image_processor=image_processor,
            tokenizer=tokenizer,
            video_processor=video_processor,
            chat_template=chat_template,
        )

        if max_num_visual_tokens is not None:
            patch_size = getattr(instance.image_processor, "patch_size", None)
            merge_size = getattr(instance.image_processor, "merge_size", None)
            if patch_size is None or merge_size is None:
                raise ValueError("Argus image processor missing patch_size or merge_size.")
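            # One merged visual token covers a tile of
            # (patch_size * merge_size) x (patch_size * merge_size) pixels, so
            # capping max_pixels at tokens * tile * tile bounds the post-merge
            # visual-token count per image. Illustrative numbers (not the
            # checkpoint's actual config): patch_size=16, merge_size=2 gives
            # tile=32, so max_num_visual_tokens=768 caps images at
            # 768 * 32 * 32 = 786,432 pixels.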
            tile = patch_size * merge_size
            instance.image_processor.max_pixels = max_num_visual_tokens * tile * tile
            instance.image_processor.size["longest_edge"] = instance.image_processor.max_pixels
        return instance

    # ------------------------------------------------------------------ #
    # Encoding
    # ------------------------------------------------------------------ #

    def process_images(self, images: List[Image.Image]) -> Union[BatchFeature, BatchEncoding]:
        """Encode PIL images into the backbone's expected input dict."""
        images = [img.convert("RGB") for img in images]
        batch_doc = self(
            text=[self.visual_prompt_prefix] * len(images),
            images=images,
            padding="longest",
            return_tensors="pt",
        )
        # Pack pixel_values so the forward can scatter them per image via
        # image_grid_thw offsets. This mirrors the training-time collator.
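        # grid_thw rows are (t, h, w); t == 1 for still images, so each image
        # owns exactly h * w rows of the flattened pixel_values tensor.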
        offsets = batch_doc["image_grid_thw"][:, 1] * batch_doc["image_grid_thw"][:, 2]
        pixel_values = list(torch.split(batch_doc["pixel_values"], offsets.tolist()))
        batch_doc["pixel_values"] = torch.nn.utils.rnn.pad_sequence(pixel_values, batch_first=True)
        return batch_doc

    def process_texts(
        self,
        texts: List[str],
        max_length: Optional[int] = None,
    ) -> Union[BatchFeature, BatchEncoding]:
        """Encode query strings into the backbone's expected input dict."""
        kwargs = {"text": texts, "return_tensors": "pt", "padding": "longest"}
        if max_length is not None:
            kwargs["max_length"] = max_length
            kwargs["truncation"] = True
        return self(**kwargs)

    def process_queries(
        self,
        queries: Optional[List[str]] = None,
        texts: Optional[List[str]] = None,
        max_length: Optional[int] = None,
        suffix: Optional[str] = None,
    ) -> Union[BatchFeature, BatchEncoding]:
        """Encode queries with the training-time augmentation:
        ``query_prefix + query + query_augmentation_token * n_query_augmentation_tokens``.

        Mirrors ``colpali_engine.utils.processing_utils.BaseVisualRetrieverProcessor
        .process_queries`` and the Argus training collator. The default 10 trailing
        ``<|endoftext|>`` tokens are not optional — without them, MaxSim scoring
        drops several nDCG points because the query has fewer active multi-vectors.
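
        For example, with the class defaults the query ``"total revenue 2023"``
        is encoded as ``"total revenue 2023" + "<|endoftext|>" * 10``.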
        """
        if texts is not None and queries is not None:
            raise ValueError("Only one of 'texts' or 'queries' should be provided.")
        if queries is None:
            queries = texts
        if queries is None:
            raise ValueError("No queries provided.")

        if suffix is None:
            suffix = self.query_augmentation_token * self.n_query_augmentation_tokens

        wrapped = [self.query_prefix + q + suffix for q in queries]
        return self.process_texts(wrapped, max_length=max_length)

    # ------------------------------------------------------------------ #
    # Scoring
    # ------------------------------------------------------------------ #

    def score(
        self,
        qs: List[torch.Tensor],
        ps: List[torch.Tensor],
        device: Optional[Union[str, torch.device]] = None,
        **kwargs,
    ) -> torch.Tensor:
        """Alias for ``score_multi_vector`` (MaxSim over multi-vectors)."""
        return self.score_multi_vector(qs, ps, device=device, **kwargs)

    def score_multi_vector(
        self,
        qs: List[torch.Tensor],
        ps: List[torch.Tensor],
        batch_size: int = 128,
        device: Optional[Union[str, torch.device]] = None,
    ) -> torch.Tensor:
        """Compute an [N_q, N_p] score matrix via MaxSim (ColBERT scoring).

        For each (q, p) pair: ``sum_i max_j <q_i, p_j>`` over query tokens
        ``i`` and passage tokens ``j``. Inputs are the raw (potentially
        ragged) per-sample multi-vector tensors returned by
        :meth:`encode_queries` / :meth:`encode_images`.
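
        A toy shape-check with random embeddings (``processor`` is any
        ``ArgusProcessor`` instance; sizes are illustrative)::

            qs = [torch.randn(5, 128) for _ in range(3)]    # 3 queries
            ps = [torch.randn(700, 128) for _ in range(4)]  # 4 pages
            scores = processor.score_multi_vector(qs, ps)   # shape (3, 4)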
        """
        dev = torch.device(device) if device is not None else torch.device("cpu")
        n_q, n_p = len(qs), len(ps)
        scores = torch.zeros(n_q, n_p, device=dev)

        for qi in range(0, n_q, batch_size):
            q_slice = qs[qi : qi + batch_size]
            q_len = max(x.size(0) for x in q_slice)
            q_pad = torch.zeros(len(q_slice), q_len, q_slice[0].size(-1), device=dev)
            q_mask = torch.zeros(len(q_slice), q_len, device=dev, dtype=torch.bool)
            for i, t in enumerate(q_slice):
                q_pad[i, : t.size(0)] = t.to(dev)
                q_mask[i, : t.size(0)] = t.abs().sum(dim=-1) > 0

            for pi in range(0, n_p, batch_size):
                p_slice = ps[pi : pi + batch_size]
                p_len = max(x.size(0) for x in p_slice)
                p_pad = torch.zeros(len(p_slice), p_len, p_slice[0].size(-1), device=dev)
                for j, t in enumerate(p_slice):
                    p_pad[j, : t.size(0)] = t.to(dev)

                sim = torch.einsum("qld,pkd->qplk", q_pad, p_pad)
                maxsim = sim.max(dim=-1).values
                maxsim = (maxsim * q_mask.unsqueeze(1).to(maxsim.dtype)).sum(dim=-1)
                scores[qi : qi + len(q_slice), pi : pi + len(p_slice)] = maxsim

        return scores

    # ------------------------------------------------------------------ #
    # Misc helpers (match colpali-engine BaseVisualRetrieverProcessor API)
    # ------------------------------------------------------------------ #

    def get_n_patches(
        self,
        image_size: Tuple[int, int],
        spatial_merge_size: int,
    ) -> Tuple[int, int]:
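        """Compute the post-merge patch grid for an image.

        ``image_size`` is ``(width, height)`` in pixels; the return value is
        ``(n_patches_x, n_patches_y)`` after ``smart_resize`` and spatial
        merging.
        """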
        patch_size = self.image_processor.patch_size
        height_new, width_new = smart_resize(
            width=image_size[0],
            height=image_size[1],
            factor=patch_size * self.image_processor.merge_size,
            min_pixels=self.image_processor.size["shortest_edge"],
            max_pixels=self.image_processor.size["longest_edge"],
        )
        n_patches_x = width_new // patch_size // spatial_merge_size
        n_patches_y = height_new // patch_size // spatial_merge_size
        return n_patches_x, n_patches_y

    def get_image_mask(self, batch_images: BatchFeature) -> torch.Tensor:
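        """Boolean mask over ``input_ids`` marking ``<|image_pad|>`` positions."""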
        return batch_images.input_ids == self.image_token_id


__all__ = ["ArgusProcessor"]