Update README.md
#3 by zyznull · opened
- README.md +2 -2
- scripts/qwen3_vl_reranker.py +88 -59
README.md
CHANGED
@@ -57,7 +57,7 @@ We utilize retrieval task datasets from various subtasks of [MMEB-v2](https://hu
 | Model | Size | MMEB-v2(Retrieval) - Avg | MMEB-v2(Retrieval) - Image | MMEB-v2(Retrieval) - Video | MMEB-v2(Retrieval) - VisDoc | MMTEB(Retrieval) | JinaVDR | ViDoRe(v3) |
 |-------|------|--------------------------|----------------------------|------------------------------|------------------------------|------------------|---------|------------|
-| Qwen3-VL-Embedding-2B | 2B | 73.
+| Qwen3-VL-Embedding-2B | 2B | 73.4 | 74.8 | 53.6 | 79.2 | 68.1 | 71.0 | 52.9 |
 | jina-reranker-m0 | 2B | - | 68.2 | - | 85.2 | - | 82.2 | 57.8 |
 | Qwen3-VL-Reranker-2B | 2B | 75.1 | 73.8 | 52.1 | 83.4 | 70.0 | 80.9 | 60.8 |
 | Qwen3-VL-Reranker-8B | 8B | 79.2 | 80.7 | 55.8 | 86.3 | 74.9 | 83.6 | 66.7 |

@@ -98,7 +98,7 @@ inputs = {
 scores = model.process(inputs)
 print(scores)
-# [0.
+# [0.7838293313980103, 0.585621178150177, 0.6147719025611877]
 ```
 For more usage examples, please visit our [GitHub repository](https://github.com/QwenLM/Qwen3-VL-Embedding).
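Note on the `@@ -98,7 +98,7 @@ inputs = {` hunk above: the README excerpt only shows the tail of the usage snippet. Below is a minimal sketch of how the full call could look, assuming the input schema that `Qwen3VLReranker.process()` in this PR appears to expect (a `query` dict, a `documents` list, an optional `instruction`, and per-call `fps`/`max_frames`); the model id, import path, and example data are placeholders rather than verified README content.

```python
# Hedged sketch only: field names are inferred from scripts/qwen3_vl_reranker.py in this PR;
# the model id and example documents are hypothetical placeholders.
from qwen3_vl_reranker import Qwen3VLReranker  # assuming scripts/ is on the import path

model = Qwen3VLReranker("Qwen/Qwen3-VL-Reranker-2B")  # assumed model id

inputs = {
    "instruction": "Given a search query, retrieve relevant candidates that answer the query.",
    "query": {"text": "A woman playing with her dog on the beach at sunset."},
    "documents": [
        {"text": "A golden retriever chases a ball along the shoreline at dusk."},
        {"image": "examples/dog_beach.jpg"},   # local file path (hypothetical)
        {"text": "Quarterly earnings report for a logistics company."},
    ],
}

scores = model.process(inputs)   # one relevance score per document
print(scores)                    # e.g. [0.78..., 0.58..., 0.61...] as in the README hunk
```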
scripts/qwen3_vl_reranker.py
CHANGED
@@ -10,6 +10,7 @@ from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

 logger = logging.getLogger(__name__)

+MAX_LENGTH = 8192
 IMAGE_BASE_FACTOR = 16
 IMAGE_FACTOR = IMAGE_BASE_FACTOR * 2
 MIN_PIXELS = 4 * IMAGE_FACTOR * IMAGE_FACTOR  # 4 tokens

@@ -37,33 +38,37 @@ def sample_frames(frames, num_segments, max_segments):
         except:
             break
         sampled_frames.append(single_frame_path)
-    #
+    # Pad with last frame if total frames less than num_segments
     while len(sampled_frames) < num_segments:
         sampled_frames.append(frames[last_frame_id])
     return sampled_frames[:max_segments]

-
-
 class Qwen3VLReranker():
     def __init__(
         self,
         model_name_or_path: str,
+        max_length: int = MAX_LENGTH,
+        min_pixels: int = MIN_PIXELS,
+        max_pixels: int = MAX_PIXELS,
+        total_pixels: int = MAX_TOTAL_PIXELS,
+        fps: float = FPS,
+        num_frames: int = MAX_FRAMES,
+        max_frames: int = MAX_FRAMES,
+        default_instruction: str = "Given a search query, retrieve relevant candidates that answer the query.",
         **kwargs,
     ):

         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

-        self.max_length =
-        self.
-        self.
-        self.
-        self.
-        self.fps = kwargs.pop('fps', FPS)
-        self.num_frames = kwargs.pop('num_frames', None)
-        self.max_frames = kwargs.pop('max_frames', None)
+        self.max_length = max_length
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+        self.total_pixels = total_pixels
+        self.fps = fps
+        self.num_frames = num_frames
+        self.max_frames = max_frames

+        self.default_instruction = default_instruction

         lm = Qwen3VLForConditionalGeneration.from_pretrained(
             model_name_or_path,

@@ -71,16 +76,15 @@ class Qwen3VLReranker():
         ).to(self.device)

         self.model = lm.model
-
         self.processor = AutoProcessor.from_pretrained(
             model_name_or_path, trust_remote_code=True,
             padding_side='left'
         )
+        self.model.eval()

         token_true_id = self.processor.tokenizer.get_vocab()["yes"]
         token_false_id = self.processor.tokenizer.get_vocab()["no"]
         self.score_linear = self.get_binary_linear(lm, token_true_id, token_false_id)
-        self.model.eval()
         self.score_linear.eval()
         self.score_linear.to(self.device).to(self.model.dtype)
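The `@@ -71,16 +76,15 @@` hunk keeps the "yes"/"no" scoring-head setup while moving `self.model.eval()` next to the processor. `get_binary_linear` itself is defined elsewhere in the script and is not shown in this diff, so the following is only a plausible sketch of the idea it names: take the LM-head rows for the "no" and "yes" tokens, project the final hidden state onto them, and read the softmax probability of "yes" as a relevance score. The function name, shapes, and usage here are assumptions, not the script's verified implementation.

```python
import torch
import torch.nn as nn

def build_binary_linear(lm_head: nn.Linear, token_true_id: int, token_false_id: int) -> nn.Linear:
    """Sketch: a 2-way head made from the 'no' and 'yes' rows of the full LM head."""
    hidden_size = lm_head.in_features
    binary = nn.Linear(hidden_size, 2, bias=False)
    with torch.no_grad():
        binary.weight[0] = lm_head.weight[token_false_id]  # logit for "no"
        binary.weight[1] = lm_head.weight[token_true_id]   # logit for "yes"
    return binary

def relevance_score(last_hidden: torch.Tensor, binary: nn.Linear) -> torch.Tensor:
    """Project the last token's hidden state and read P("yes")."""
    logits = binary(last_hidden)                 # (batch, 2)
    return torch.softmax(logits, dim=-1)[:, 1]   # probability mass on "yes"
```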
@@ -115,24 +119,19 @@ class Qwen3VLReranker():

         special_tokens_set = set(special_tokens)

-        #
+        # Calculate budget: how many non-special tokens we can keep
         num_special = sum(1 for token in tokens if token in special_tokens_set)
-
-        # Since the number of special tokens is guaranteed to be < max_length, this value is always non-negative
         num_non_special_to_keep = max_length - num_special

-        #
+        # Build final list according to budget
         final_tokens = []
         non_special_kept_count = 0
         for token in tokens:
-            # Special tokens are always kept
             if token in special_tokens_set:
                 final_tokens.append(token)
-            # Non-special tokens are kept while there is budget left
             elif non_special_kept_count < num_non_special_to_keep:
                 final_tokens.append(token)
                 non_special_kept_count += 1
-            # Non-special tokens beyond the budget are dropped (i.e. do nothing)

         return final_tokens
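A toy walk-through of the budget-based truncation in the hunk above; the helper is copied standalone so it can run by itself, and the token ids and special-token set are made-up values rather than the real tokenizer's.

```python
# Same loop as truncate_tokens_optimized above, copied standalone for illustration.
def truncate_tokens_optimized(tokens, max_length, special_tokens):
    special_tokens_set = set(special_tokens)
    num_special = sum(1 for token in tokens if token in special_tokens_set)
    num_non_special_to_keep = max_length - num_special

    final_tokens = []
    non_special_kept_count = 0
    for token in tokens:
        if token in special_tokens_set:               # special tokens are always kept
            final_tokens.append(token)
        elif non_special_kept_count < num_non_special_to_keep:
            final_tokens.append(token)                # non-special tokens consume the budget
            non_special_kept_count += 1
    return final_tokens

tokens = [101, 5, 6, 7, 8, 9, 102]                    # 101/102 play the role of special ids
print(truncate_tokens_optimized(tokens, 5, {101, 102}))
# -> [101, 5, 6, 7, 102]: both specials survive, only 3 non-special tokens fit the budget of 5
```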
@@ -142,10 +141,11 @@ class Qwen3VLReranker():
         try:
             images, videos, video_kwargs = process_vision_info(
                 pairs, image_patch_size=16,
-                return_video_kwargs=True,
+                return_video_kwargs=True,
+                return_video_metadata=True
             )
         except Exception as e:
-            logger.
+            logger.error(f"Error in processing vision info: {e}")
             images = None
             videos = None
             video_kwargs = {'do_sample_frames': False}

@@ -159,60 +159,80 @@ class Qwen3VLReranker():
             videos, video_metadatas = list(videos), list(video_metadatas)
         else:
             video_metadatas = None
-        inputs = self.processor(
+        inputs = self.processor(
+            text=text,
+            images=images,
+            videos=videos,
+            video_metadata=video_metadatas,
+            truncation=False,
+            padding=False,
+            do_resize=False,
+            **video_kwargs
+        )
         for i, ele in enumerate(inputs['input_ids']):
-            inputs['input_ids'][i] = self.truncate_tokens_optimized(
+            inputs['input_ids'][i] = self.truncate_tokens_optimized(
+                inputs['input_ids'][i][:-5], max_length,
+                self.processor.tokenizer.all_special_ids
+            ) + inputs['input_ids'][i][-5:]
+        temp_inputs = self.processor.tokenizer.pad(
+            {'input_ids': inputs['input_ids']}, padding=True,
+            return_tensors="pt", max_length=self.max_length
+        )
         for key in temp_inputs:
             inputs[key] = temp_inputs[key]
         return inputs

-    def format_mm_content(
+    def format_mm_content(
+        self,
+        text, image, video,
+        prefix='Query:',
+        fps=None, max_frames=None,
+    ):
         content = []

         content.append({'type': 'text', 'text': prefix})
         if not text and not image and not video:
-            content.append({'type': 'text', 'text': ""})
+            content.append({'type': 'text', 'text': "NULL"})
             return content
+
         if video:
             video_content = None
+            video_kwargs = { 'total_pixels': self.total_pixels }
             if isinstance(video, list):
                 video_content = video
                 if self.num_frames is not None or self.max_frames is not None:
-                    video_content =
-                video_content = [
+                    video_content = self._sample_frames(video_content, self.num_frames, self.max_frames)
+                video_content = [
+                    ('file://' + ele if isinstance(ele, str) else ele)
+                    for ele in video_content
+                ]
             elif isinstance(video, str):
-                video_content = 'file://' + video
+                video_content = video if video.startswith(('http://', 'https://')) else 'file://' + video
+                video_kwargs = {'fps': fps or self.fps, 'max_frames': max_frames or self.max_frames,}
+            else:
+                raise TypeError(f"Unrecognized video type: {type(video)}")
+
             if video_content:
-                content.append({
+                content.append({
+                    'type': 'video', 'video': video_content,
+                    **video_kwargs
+                })

         if image:
             image_content = None
             if isinstance(image, Image.Image):
                 image_content = image
-
-            elif image.startswith('http') or image.startswith('oss'):
-                image_content = image
             elif isinstance(image, str):
-                image_content = 'file://' + image
+                image_content = image if image.startswith(('http', 'oss')) else 'file://' + image
             else:
-
+                raise TypeError(f"Unrecognized image type: {type(image)}")
+
             if image_content:
-                content.append({
+                content.append({
+                    'type': 'image', 'image': image_content,
+                    "min_pixels": self.min_pixels,
+                    "max_pixels": self.max_pixels
+                })

         if text:
             content.append({'type': 'text', 'text': text})

@@ -222,7 +242,8 @@ class Qwen3VLReranker():
         self,
         query_text, query_image, query_video,
         doc_text, doc_image, doc_video,
-        instruction=None,
+        instruction=None,
+        fps=None, max_frames=None
     ):
         inputs = []
         inputs.append({

@@ -242,9 +263,15 @@ class Qwen3VLReranker():
             "type": "text",
             "text": '<Instruct>: ' + instruct
         })
-        query_content = self.format_mm_content(
+        query_content = self.format_mm_content(
+            query_text, query_image, query_video, prefix='<Query>:',
+            fps=fps, max_frames=max_frames
+        )
         contents.extend(query_content)
-        doc_content = self.format_mm_content(
+        doc_content = self.format_mm_content(
+            doc_text, doc_image, doc_video, prefix='\n<Document>:',
+            fps=fps, max_frames=max_frames
+        )
         contents.extend(doc_content)
         inputs.append({
             "role": "user",

@@ -271,12 +298,14 @@ class Qwen3VLReranker():
             document.get('image', None),
             document.get('video', None),
             instruction=instruction,
-            fps=inputs.get('fps', self.fps)
+            fps=inputs.get('fps', self.fps),
+            max_frames=inputs.get('max_frames', self.max_frames)
+        ) for document in documents]
+
         final_scores = []
         for pair in pairs:
             inputs = self.tokenize([pair])
             inputs = inputs.to(self.model.device)
             scores = self.compute_scores(inputs)
             final_scores.extend(scores)
-        return final_scores
+        return final_scores
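Putting the constructor changes together: the old `kwargs.pop(...)` defaults become explicit keyword arguments, and `fps`/`max_frames` can now also be overridden per call to `process()`. A usage sketch under those assumptions follows; the model id, import path, file paths, and concrete values are hypothetical examples, not part of this PR.

```python
# Sketch of the new explicit constructor arguments introduced in this PR; the model id,
# video path, and per-call fps/max_frames overrides below are hypothetical examples.
from qwen3_vl_reranker import Qwen3VLReranker  # assuming scripts/ is on the import path

model = Qwen3VLReranker(
    "Qwen/Qwen3-VL-Reranker-2B",   # assumed model id
    max_length=8192,               # token budget enforced by truncate_tokens_optimized
    max_pixels=1280 * 28 * 28,     # per-image pixel cap (example value)
    fps=1.0,                       # default frame rate for video documents
    max_frames=32,                 # default cap on sampled frames
)

scores = model.process({
    "query": {"text": "How do I assemble the bookshelf?"},
    "documents": [
        {"video": "manuals/assembly_demo.mp4"},   # hypothetical local video
        {"text": "Care instructions for wooden furniture."},
    ],
    "fps": 2.0,          # per-call override, falls back to the constructor default
    "max_frames": 16,
})
print(scores)
```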