KinetoLabs Claude Opus 4.5 committed
Commit 455c786 · 1 Parent(s): f3ebc82

Fix embedding/reranker loading with official Qwen3-VL classes

Root cause: AutoModel.from_pretrained() loads base transformer
instead of specialized embedding/reranking variants.

Changes:
- Vendor official scripts from QwenLM/Qwen3-VL-Embedding repo
- Replace AutoModel with Qwen3VLEmbedder for embedding model
- Replace AutoModel with Qwen3VLReranker for reranker model
- Update embed()/rerank() methods to use official process() API

The official loaders handle:
- Proper last-token pooling and L2 normalization (embedding)
- Yes/no binary scoring from LM head weights (reranker)

This eliminates the fallback L2 norm heuristic scoring that was
producing "less accurate" results.
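
For reference, the new loading path is exercised roughly like this (a
minimal sketch based on the vendored scripts below; the model names are
the 8B checkpoints from the vendored docstrings):

    from scripts.qwen3_vl import Qwen3VLEmbedder, Qwen3VLReranker

    # Embedder.process() returns L2-normalized embeddings, one row per input
    embedder = Qwen3VLEmbedder(model_name_or_path="Qwen/Qwen3-VL-Embedding-8B")
    vecs = embedder.process([{"text": "hello world"}])

    # Reranker.process() returns a sigmoid score in [0, 1] per document
    reranker = Qwen3VLReranker(model_name_or_path="Qwen/Qwen3-VL-Reranker-8B")
    scores = reranker.process({
        "query": {"text": "greeting"},
        "documents": [{"text": "hello"}, {"text": "stock report"}],
    })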

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

models/real.py CHANGED
@@ -2,6 +2,11 @@
 
 This module loads the actual Qwen3-VL models for production use.
 Requires ~90GB VRAM (4xL4 with 96GB total).
+
+Model Loading:
+- Vision: Qwen3VLMoeForConditionalGeneration (standard transformers)
+- Embedding: Qwen3VLEmbedder (official scripts from QwenLM/Qwen3-VL-Embedding)
+- Reranker: Qwen3VLReranker (official scripts from QwenLM/Qwen3-VL-Embedding)
 """
 
 import json
@@ -28,7 +33,7 @@ class RealModelStack:
 
     def load_all(self) -> "RealModelStack":
         """Load all models with device_map='auto' for multi-GPU distribution."""
-        from transformers import AutoModel, AutoProcessor
+        from transformers import AutoProcessor
 
         device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
         logger.info(f"Loading models on {device_type}")
@@ -71,34 +76,30 @@ class RealModelStack:
         )
         logger.info(f"Fallback vision model loaded in {time.time() - vision_start:.2f}s")
 
-        # Embedding model (~16GB in BF16)
+        # Embedding model (~16GB in BF16) - Using official Qwen3VLEmbedder
         logger.info(f"Loading embedding model: {settings.embedding_model}")
         embed_start = time.time()
-        self.models["embedding"] = AutoModel.from_pretrained(
-            settings.embedding_model,
+        from scripts.qwen3_vl import Qwen3VLEmbedder
+
+        self.models["embedding"] = Qwen3VLEmbedder(
+            model_name_or_path=settings.embedding_model,
             torch_dtype=torch.bfloat16,
-            device_map="auto",
-            trust_remote_code=True,
-        )
-        self.processors["embedding"] = AutoProcessor.from_pretrained(
-            settings.embedding_model,
-            trust_remote_code=True,
         )
+        # Processor is internal to Qwen3VLEmbedder, but store reference for compatibility
+        self.processors["embedding"] = self.models["embedding"].processor
         logger.info(f"Embedding model loaded in {time.time() - embed_start:.2f}s")
 
-        # Reranker model (~16GB in BF16)
+        # Reranker model (~16GB in BF16) - Using official Qwen3VLReranker
        logger.info(f"Loading reranker model: {settings.reranker_model}")
         reranker_start = time.time()
-        self.models["reranker"] = AutoModel.from_pretrained(
-            settings.reranker_model,
+        from scripts.qwen3_vl import Qwen3VLReranker
+
+        self.models["reranker"] = Qwen3VLReranker(
+            model_name_or_path=settings.reranker_model,
             torch_dtype=torch.bfloat16,
-            device_map="auto",
-            trust_remote_code=True,
-        )
-        self.processors["reranker"] = AutoProcessor.from_pretrained(
-            settings.reranker_model,
-            trust_remote_code=True,
         )
+        # Processor is internal to Qwen3VLReranker, but store reference for compatibility
+        self.processors["reranker"] = self.models["reranker"].processor
         logger.info(f"Reranker model loaded in {time.time() - reranker_start:.2f}s")
 
         self.loaded = True
@@ -370,80 +371,68 @@ IMPORTANT: Return ONLY valid JSON, no additional text."""
 class RealEmbeddingModel:
     """Wrapper for real embedding model inference.
 
-    Uses last-token pooling per official Qwen3-VL-Embedding implementation:
-    https://github.com/QwenLM/Qwen3-VL-Embedding
+    Uses the official Qwen3VLEmbedder from QwenLM/Qwen3-VL-Embedding.
+    The model handles last-token pooling and L2 normalization internally.
     """
 
     def __init__(self, model, processor):
+        """Initialize with Qwen3VLEmbedder instance.
+
+        Args:
+            model: Qwen3VLEmbedder instance (official loader)
+            processor: Processor (stored for compatibility, but model has its own)
+        """
         self.model = model
         self.processor = processor
 
-    @staticmethod
-    def _pooling_last(hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-        """Extract the last valid token's hidden state based on attention mask.
-
-        This is the official pooling method from Qwen3-VL-Embedding.
-        It finds the last position where attention_mask == 1 and extracts that token.
-        """
-        # Flip attention mask to find last 1 position
-        flipped_tensor = attention_mask.flip(dims=[1])
-        last_one_positions = flipped_tensor.argmax(dim=1)
-        col = attention_mask.shape[1] - last_one_positions - 1
-        row = torch.arange(hidden_state.shape[0], device=hidden_state.device)
-        return hidden_state[row, col]
-
-    def embed(self, text: str) -> list[float]:
-        """Generate embedding for text using last-token pooling.
-
-        Per Qwen3-VL-Embedding: extracts the hidden state of the last valid token,
-        then applies L2 normalization.
+    def embed(self, text: str) -> list[float]:
+        """Generate embedding for text using official Qwen3VLEmbedder.
+
+        The official model.process() handles:
+        - Tokenization and preprocessing
+        - Last-token pooling
+        - L2 normalization
+
+        Args:
+            text: Input text to embed
+
+        Returns:
+            List of floats representing the embedding (4096-dim for 8B model)
         """
         try:
-            # Tokenize input
-            inputs = self.processor(
-                text,
-                return_tensors="pt",
-                padding=True,
-                truncation=True,
-                max_length=512,
-            )
-
-            # Move to model device
-            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
-
-            # Generate embeddings
-            with torch.no_grad():
-                outputs = self.model(**inputs)
-
-            # Use last-token pooling (official Qwen3-VL-Embedding method)
-            # outputs.last_hidden_state shape: (batch, seq_len, hidden_dim)
-            attention_mask = inputs.get("attention_mask")
-            if attention_mask is not None:
-                embeddings = self._pooling_last(outputs.last_hidden_state, attention_mask)
-            else:
-                # Fallback: use last token if no attention mask
-                embeddings = outputs.last_hidden_state[:, -1, :]
-
-            # L2 normalize (per official implementation)
-            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=-1)
+            # Use official process() API - expects list of dicts
+            inputs = [{"text": text}]
+            embeddings = self.model.process(inputs, normalize=True)
 
+            # embeddings is a tensor of shape (1, hidden_dim)
             return embeddings[0].cpu().tolist()
 
         except Exception as e:
             logger.error(f"Embedding generation failed: {e}")
             # Return zero vector as fallback (4096-dim per Qwen3-VL-Embedding-8B)
-            hidden_size = getattr(self.model.config, "hidden_size", 4096)
+            hidden_size = getattr(self.model.model.config, "hidden_size", 4096)
             return [0.0] * hidden_size
 
     def embed_batch(self, texts: list[str]) -> list[list[float]]:
-        """Generate embeddings for a batch of texts."""
-        return [self.embed(text) for text in texts]
+        """Generate embeddings for a batch of texts.
+
+        Uses official batch processing for efficiency.
+        """
+        try:
+            inputs = [{"text": text} for text in texts]
+            embeddings = self.model.process(inputs, normalize=True)
+            return [emb.cpu().tolist() for emb in embeddings]
+        except Exception as e:
+            logger.error(f"Batch embedding generation failed: {e}")
+            hidden_size = getattr(self.model.model.config, "hidden_size", 4096)
+            return [[0.0] * hidden_size for _ in texts]
 
 
 class RealRerankerModel:
     """Wrapper for real reranker model inference.
 
-    Uses the official Qwen3-VL-Reranker scoring method:
+    Uses the official Qwen3VLReranker from QwenLM/Qwen3-VL-Embedding.
+    The model handles yes/no scoring internally via:
     - Extracts "yes" and "no" token weights from the LM head
     - Creates a binary linear layer: weight = yes_weight - no_weight
     - Scores = sigmoid(linear(last_token_hidden_state))
@@ -452,118 +441,48 @@ class RealRerankerModel:
     """
 
     def __init__(self, model, processor):
-        self.model = model
-        self.processor = processor
-        self.score_linear = None
-        self._initialize_score_linear()
-
-    def _initialize_score_linear(self):
-        """Initialize the binary scoring linear layer from LM head weights.
-
-        Per Qwen3-VL-Reranker: the scoring layer uses the difference between
-        "yes" and "no" token embeddings from the language model head.
+        """Initialize with Qwen3VLReranker instance.
+
+        Args:
+            model: Qwen3VLReranker instance (official loader)
+            processor: Processor (stored for compatibility, but model has its own)
         """
-        try:
-            # Get tokenizer vocab to find yes/no token IDs
-            tokenizer = self.processor.tokenizer if hasattr(self.processor, 'tokenizer') else self.processor
-            vocab = tokenizer.get_vocab()
-
-            # Find yes/no token IDs
-            token_yes_id = vocab.get("yes")
-            token_no_id = vocab.get("no")
-
-            if token_yes_id is None or token_no_id is None:
-                logger.warning("Could not find 'yes'/'no' tokens in vocab, using fallback scoring")
-                return
-
-            # Get LM head weights
-            if not hasattr(self.model, 'lm_head'):
-                logger.warning("Model does not have lm_head, using fallback scoring")
-                return
-
-            lm_head_weights = self.model.lm_head.weight.data
-
-            # Extract yes/no weights
-            weight_yes = lm_head_weights[token_yes_id]
-            weight_no = lm_head_weights[token_no_id]
-
-            # Create binary linear layer: weight = yes - no
-            hidden_size = weight_yes.shape[0]
-            self.score_linear = torch.nn.Linear(hidden_size, 1, bias=False)
-            self.score_linear.weight.data[0] = weight_yes - weight_no
-            self.score_linear = self.score_linear.to(self.model.device)
-            self.score_linear.eval()
-
-            logger.info(f"Initialized reranker score linear from yes/no LM head weights (hidden_size={hidden_size})")
-
-        except Exception as e:
-            logger.warning(f"Failed to initialize score linear from LM head: {e}, using fallback scoring")
-            self.score_linear = None
-
-    def rerank(self, query: str, documents: list[str]) -> list[float]:
-        """Rerank documents by relevance to query.
-
-        Returns a list of relevance scores (0-1) for each document.
-        Higher scores indicate more relevant documents.
+        self.model = model
+        self.processor = processor
+
+    def rerank(self, query: str, documents: list[str]) -> list[float]:
+        """Rerank documents by relevance to query using official Qwen3VLReranker.
+
+        The official model.process() handles:
+        - Proper message formatting
+        - Tokenization
+        - Yes/no scoring with LM head weights
+        - Sigmoid normalization
+
+        Args:
+            query: The search query
+            documents: List of documents to rerank
+
+        Returns:
+            List of relevance scores (0-1) for each document.
+            Higher scores indicate more relevant documents.
         """
         if not documents:
             return []
 
-        scores = []
-        for doc in documents:
-            try:
-                score = self._score_pair(query, doc)
-                scores.append(score)
-            except Exception as e:
-                logger.warning(f"Reranking failed for document: {e}")
-                scores.append(0.0)
-
-        return scores
-
-    def _score_pair(self, query: str, document: str) -> float:
-        """Score a single query-document pair using official Qwen3-VL-Reranker method."""
-        # Truncate document if too long
-        max_doc_len = 400
-        if len(document) > max_doc_len:
-            document = document[:max_doc_len] + "..."
-
-        # Format as query-document pair
-        pair_text = f"Query: {query}\n\nDocument: {document}"
-
         try:
-            inputs = self.processor(
-                pair_text,
-                return_tensors="pt",
-                padding=True,
-                truncation=True,
-                max_length=512,
-            )
-
-            # Move to model device
-            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
-
-            with torch.no_grad():
-                outputs = self.model(**inputs)
-
-            # Use LAST token hidden state (not CLS/first token)
-            # Per official implementation: last_hidden_state[:, -1]
-            last_token_hidden = outputs.last_hidden_state[:, -1, :]
-
-            if self.score_linear is not None:
-                # Official scoring: linear(last_token) -> sigmoid
-                raw_score = self.score_linear(last_token_hidden)
-                score = torch.sigmoid(raw_score).squeeze(-1).item()
-            else:
-                # Fallback: use L2 norm with better scaling
-                # This is less accurate but provides reasonable ordering
-                norm = last_token_hidden.norm(dim=-1).item()
-                score = min(1.0, max(0.0, norm / 50.0))  # Heuristic scaling
-
-            return score
+            # Use official process() API - expects dict with query and documents
+            inputs = {
+                "instruction": "Retrieve relevant documents for the query.",
+                "query": {"text": query},
+                "documents": [{"text": doc} for doc in documents],
+            }
+            scores = self.model.process(inputs)
+            return scores
 
         except Exception as e:
-            logger.error(f"Reranker scoring failed: {e}")
-            return 0.0
+            logger.error(f"Reranking failed: {e}")
+            return [0.0] * len(documents)
 
     def rerank_with_indices(
         self, query: str, documents: list[str], top_k: int = None
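
Since embed() now returns L2-normalized vectors from the official
pipeline, a plain dot product of two embeddings is already a cosine
similarity. A hypothetical smoke test of the updated wrappers (assumes
the stack and settings defined in this module):

    stack = RealModelStack().load_all()
    embedder = RealEmbeddingModel(stack.models["embedding"], stack.processors["embedding"])

    a = embedder.embed("a cat sitting on a mat")
    b = embedder.embed("a kitten resting on a rug")
    # Both vectors are unit-norm, so the dot product equals cosine similarity
    cosine = sum(x * y for x, y in zip(a, b))

    reranker = RealRerankerModel(stack.models["reranker"], stack.processors["reranker"])
    scores = reranker.rerank("cat on mat", ["a cat sits on a mat", "quarterly earnings"])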
scripts/qwen3_vl/__init__.py ADDED
@@ -0,0 +1,14 @@
+"""Vendored Qwen3-VL embedding and reranker implementations.
+
+Source: https://github.com/QwenLM/Qwen3-VL-Embedding
+License: Apache 2.0
+
+These are the official loading classes for:
+- Qwen/Qwen3-VL-Embedding-8B
+- Qwen/Qwen3-VL-Reranker-8B
+"""
+
+from scripts.qwen3_vl.qwen3_vl_embedding import Qwen3VLEmbedder, Qwen3VLForEmbedding
+from scripts.qwen3_vl.qwen3_vl_reranker import Qwen3VLReranker
+
+__all__ = ["Qwen3VLEmbedder", "Qwen3VLForEmbedding", "Qwen3VLReranker"]
scripts/qwen3_vl/qwen3_vl_embedding.py ADDED
@@ -0,0 +1,393 @@
+"""Official Qwen3-VL Embedding implementation.
+
+Source: https://github.com/QwenLM/Qwen3-VL-Embedding/blob/main/src/models/qwen3_vl_embedding.py
+License: Apache 2.0
+"""
+
+import torch
+import torch.nn.functional as F
+import unicodedata
+import numpy as np
+import logging
+
+from PIL import Image
+from dataclasses import dataclass
+from typing import Optional, List, Union, Dict, Any
+from transformers.models.qwen3_vl.modeling_qwen3_vl import (
+    Qwen3VLPreTrainedModel,
+    Qwen3VLModel,
+    Qwen3VLConfig,
+)
+from transformers.models.qwen3_vl.processing_qwen3_vl import Qwen3VLProcessor
+from transformers.modeling_outputs import ModelOutput
+from transformers.processing_utils import Unpack
+from transformers.utils import TransformersKwargs
+from transformers.cache_utils import Cache
+from qwen_vl_utils.vision_process import process_vision_info
+
+logger = logging.getLogger(__name__)
+
+# Constants for configuration
+MAX_LENGTH = 8192
+IMAGE_BASE_FACTOR = 16
+IMAGE_FACTOR = IMAGE_BASE_FACTOR * 2
+MIN_PIXELS = 4 * IMAGE_FACTOR * IMAGE_FACTOR
+MAX_PIXELS = 1800 * IMAGE_FACTOR * IMAGE_FACTOR
+FPS = 1
+MAX_FRAMES = 64
+FRAME_MAX_PIXELS = 768 * IMAGE_FACTOR * IMAGE_FACTOR
+MAX_TOTAL_PIXELS = 10 * FRAME_MAX_PIXELS
+PAD_TOKEN = "<|endoftext|>"
+
+
+@dataclass
+class Qwen3VLForEmbeddingOutput(ModelOutput):
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    attention_mask: Optional[torch.Tensor] = None
+
+
+class Qwen3VLForEmbedding(Qwen3VLPreTrainedModel):
+    _checkpoint_conversion_mapping = {}
+    accepts_loss_kwargs = False
+    config: Qwen3VLConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Qwen3VLModel(config)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    def get_video_features(
+        self,
+        pixel_values_videos: torch.FloatTensor,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+    ):
+        return self.model.get_video_features(pixel_values_videos, video_grid_thw)
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+    ):
+        return self.model.get_image_features(pixel_values, image_grid_thw)
+
+    @property
+    def language_model(self):
+        return self.model.language_model
+
+    @property
+    def visual(self):
+        return self.model.visual
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        pixel_values_videos: Optional[torch.FloatTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, Qwen3VLForEmbeddingOutput]:
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            pixel_values_videos=pixel_values_videos,
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        return Qwen3VLForEmbeddingOutput(
+            last_hidden_state=outputs.last_hidden_state,
+            attention_mask=attention_mask,
+        )
+
+
+def sample_frames(
+    frames: List[Union[str, Image.Image]], num_segments: int, max_segments: int
+) -> List[str]:
+    duration = len(frames)
+    frame_id_array = np.linspace(0, duration - 1, num_segments, dtype=int)
+    frame_id_list = frame_id_array.tolist()
+    last_frame_id = frame_id_list[-1]
+
+    sampled_frames = []
+    for frame_idx in frame_id_list:
+        try:
+            sampled_frames.append(frames[frame_idx])
+        except:
+            break
+    while len(sampled_frames) < num_segments:
+        sampled_frames.append(frames[last_frame_id])
+    return sampled_frames[:max_segments]
+
+
+class Qwen3VLEmbedder:
+    """Official Qwen3-VL embedding model wrapper.
+
+    Usage:
+        model = Qwen3VLEmbedder(model_name_or_path="Qwen/Qwen3-VL-Embedding-8B")
+        embeddings = model.process([{"text": "Hello world"}])
+    """
+
+    def __init__(
+        self,
+        model_name_or_path: str,
+        max_length: int = MAX_LENGTH,
+        min_pixels: int = MIN_PIXELS,
+        max_pixels: int = MAX_PIXELS,
+        total_pixels: int = MAX_TOTAL_PIXELS,
+        fps: float = FPS,
+        num_frames: int = MAX_FRAMES,
+        max_frames: int = MAX_FRAMES,
+        default_instruction: str = "Represent the user's input.",
+        **kwargs,
+    ):
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        self.max_length = max_length
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+        self.total_pixels = total_pixels
+        self.fps = fps
+        self.num_frames = num_frames
+        self.max_frames = max_frames
+        self.default_instruction = default_instruction
+
+        self.model = Qwen3VLForEmbedding.from_pretrained(
+            model_name_or_path, trust_remote_code=True, **kwargs
+        ).to(device)
+        self.processor = Qwen3VLProcessor.from_pretrained(
+            model_name_or_path, padding_side="right"
+        )
+        self.model.eval()
+
+    @property
+    def device(self):
+        return self.model.device
+
+    @torch.no_grad()
+    def forward(self, inputs: Dict[str, Any]) -> Dict[str, torch.Tensor]:
+        outputs = self.model(**inputs)
+        return {
+            "last_hidden_state": outputs.last_hidden_state,
+            "attention_mask": inputs.get("attention_mask"),
+        }
+
+    def _truncate_tokens(self, token_ids: List[int], max_length: int) -> List[int]:
+        if len(token_ids) <= max_length:
+            return token_ids
+
+        special_token_ids = set(self.processor.tokenizer.all_special_ids)
+        num_special = sum(1 for token_idx in token_ids if token_idx in special_token_ids)
+        num_non_special_to_keep = max_length - num_special
+
+        final_token_ids = []
+        non_special_kept_count = 0
+        for token_idx in token_ids:
+            if token_idx in special_token_ids:
+                final_token_ids.append(token_idx)
+            elif non_special_kept_count < num_non_special_to_keep:
+                final_token_ids.append(token_idx)
+                non_special_kept_count += 1
+        return final_token_ids
+
+    def format_model_input(
+        self,
+        text: Optional[str] = None,
+        image: Optional[Union[str, Image.Image]] = None,
+        video: Optional[Union[str, List[Union[str, Image.Image]]]] = None,
+        instruction: Optional[str] = None,
+        fps: Optional[float] = None,
+        max_frames: Optional[int] = None,
+    ) -> List[Dict]:
+
+        if instruction:
+            instruction = instruction.strip()
+            if instruction and not unicodedata.category(instruction[-1]).startswith("P"):
+                instruction = instruction + "."
+
+        content = []
+        conversation = [
+            {
+                "role": "system",
+                "content": [
+                    {"type": "text", "text": instruction or self.default_instruction}
+                ],
+            },
+            {"role": "user", "content": content},
+        ]
+
+        if not text and not image and not video:
+            content.append({"type": "text", "text": "NULL"})
+            return conversation
+
+        if video:
+            video_content = None
+            video_kwargs = {"total_pixels": self.total_pixels}
+            if isinstance(video, list):
+                video_content = video
+                if self.num_frames is not None or self.max_frames is not None:
+                    video_content = sample_frames(
+                        video_content, self.num_frames, self.max_frames
+                    )
+                video_content = [
+                    ("file://" + ele if isinstance(ele, str) else ele)
+                    for ele in video_content
+                ]
+            elif isinstance(video, str):
+                video_content = (
+                    video
+                    if video.startswith(("http://", "https://"))
+                    else "file://" + video
+                )
+                video_kwargs = {
+                    "fps": fps or self.fps,
+                    "max_frames": max_frames or self.max_frames,
+                }
+            else:
+                raise TypeError(f"Unrecognized video type: {type(video)}")
+
+            if video_content:
+                content.append({"type": "video", "video": video_content, **video_kwargs})
+
+        if image:
+            image_content = None
+            if isinstance(image, Image.Image):
+                image_content = image
+            elif isinstance(image, str):
+                image_content = (
+                    image if image.startswith(("http", "oss")) else "file://" + image
+                )
+            else:
+                raise TypeError(f"Unrecognized image type: {type(image)}")
+
+            if image_content:
+                content.append(
+                    {
+                        "type": "image",
+                        "image": image_content,
+                        "min_pixels": self.min_pixels,
+                        "max_pixels": self.max_pixels,
+                    }
+                )
+
+        if text:
+            content.append({"type": "text", "text": text})
+
+        return conversation
+
+    def _preprocess_inputs(
+        self, conversations: List[List[Dict]]
+    ) -> Dict[str, torch.Tensor]:
+        text = self.processor.apply_chat_template(
+            conversations, add_generation_prompt=True, tokenize=False
+        )
+
+        try:
+            images, video_inputs, video_kwargs = process_vision_info(
+                conversations,
+                image_patch_size=16,
+                return_video_metadata=True,
+                return_video_kwargs=True,
+            )
+        except Exception as e:
+            logger.error(f"Error in processing vision info: {e}")
+            images = None
+            video_inputs = None
+            video_kwargs = {"do_sample_frames": False}
+            text = self.processor.apply_chat_template(
+                [{"role": "user", "content": [{"type": "text", "text": "NULL"}]}],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+
+        if video_inputs is not None:
+            videos, video_metadata = zip(*video_inputs)
+            videos = list(videos)
+            video_metadata = list(video_metadata)
+        else:
+            videos, video_metadata = None, None
+
+        inputs = self.processor(
+            text=text,
+            images=images,
+            videos=videos,
+            video_metadata=video_metadata,
+            truncation=True,
+            max_length=self.max_length,
+            padding=True,
+            do_resize=False,
+            return_tensors="pt",
+            **video_kwargs,
+        )
+        return inputs
+
+    @staticmethod
+    def _pooling_last(
+        hidden_state: torch.Tensor, attention_mask: torch.Tensor
+    ) -> torch.Tensor:
+        """Extract the last valid token's hidden state based on attention mask."""
+        flipped_tensor = attention_mask.flip(dims=[1])
+        last_one_positions = flipped_tensor.argmax(dim=1)
+        col = attention_mask.shape[1] - last_one_positions - 1
+        row = torch.arange(hidden_state.shape[0], device=hidden_state.device)
+        return hidden_state[row, col]
+
+    def process(
+        self, inputs: List[Dict[str, Any]], normalize: bool = True
+    ) -> torch.Tensor:
+        """Generate embeddings for a list of inputs.
+
+        Args:
+            inputs: List of dicts with 'text', 'image', and/or 'video' keys
+            normalize: Whether to L2 normalize embeddings (default True)
+
+        Returns:
+            Tensor of shape (batch_size, hidden_dim) with embeddings
+        """
+        conversations = [
+            self.format_model_input(
+                text=ele.get("text"),
+                image=ele.get("image"),
+                video=ele.get("video"),
+                instruction=ele.get("instruction"),
+                fps=ele.get("fps"),
+                max_frames=ele.get("max_frames"),
+            )
+            for ele in inputs
+        ]
+
+        processed_inputs = self._preprocess_inputs(conversations)
+        processed_inputs = {k: v.to(self.model.device) for k, v in processed_inputs.items()}
+
+        outputs = self.forward(processed_inputs)
+        embeddings = self._pooling_last(
+            outputs["last_hidden_state"], outputs["attention_mask"]
+        )
+
+        if normalize:
+            embeddings = F.normalize(embeddings, p=2, dim=-1)
+
+        return embeddings
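
Because process() L2-normalizes by default, retrieval scoring against
this embedder reduces to a matrix product. A minimal sketch (the query
and document strings are illustrative; torch_dtype is forwarded to
from_pretrained via **kwargs):

    import torch
    from scripts.qwen3_vl import Qwen3VLEmbedder

    embedder = Qwen3VLEmbedder(
        model_name_or_path="Qwen/Qwen3-VL-Embedding-8B",
        torch_dtype=torch.bfloat16,  # forwarded to from_pretrained
    )
    queries = embedder.process([{"text": "What is the capital of France?"}])
    docs = embedder.process([
        {"text": "Paris is the capital of France."},
        {"text": "The Nile is the longest river."},
    ])
    # Rows are unit-norm, so queries @ docs.T is a cosine-similarity matrix
    sim = queries @ docs.T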
scripts/qwen3_vl/qwen3_vl_reranker.py ADDED
@@ -0,0 +1,371 @@
+"""Official Qwen3-VL Reranker implementation.
+
+Source: https://github.com/QwenLM/Qwen3-VL-Embedding/blob/main/src/models/qwen3_vl_reranker.py
+License: Apache 2.0
+"""
+
+import torch
+import numpy as np
+import logging
+
+from PIL import Image
+from typing import List, Optional, Union, Dict, Any
+from qwen_vl_utils import process_vision_info
+from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
+
+logger = logging.getLogger(__name__)
+
+MAX_LENGTH = 8192
+IMAGE_BASE_FACTOR = 16
+IMAGE_FACTOR = IMAGE_BASE_FACTOR * 2
+MIN_PIXELS = 4 * IMAGE_FACTOR * IMAGE_FACTOR  # 4 tokens
+MAX_PIXELS = 1280 * IMAGE_FACTOR * IMAGE_FACTOR  # 1280 tokens
+MAX_RATIO = 200
+
+FRAME_FACTOR = 2
+FPS = 1
+MIN_FRAMES = 2
+MAX_FRAMES = 64
+MIN_TOTAL_PIXELS = 1 * FRAME_FACTOR * MIN_PIXELS  # 1 frame
+MAX_TOTAL_PIXELS = 4 * FRAME_FACTOR * MAX_PIXELS  # 4 frames
+
+
+def sample_frames(frames, num_segments, max_segments):
+    duration = len(frames)
+    frame_id_array = np.linspace(0, duration - 1, num_segments, dtype=int)
+    frame_id_list = frame_id_array.tolist()
+    last_frame_id = frame_id_list[-1]
+
+    sampled_frames = []
+    for frame_idx in frame_id_list:
+        try:
+            single_frame_path = frames[frame_idx]
+        except:
+            break
+        sampled_frames.append(single_frame_path)
+    # Pad with last frame if total frames less than num_segments
+    while len(sampled_frames) < num_segments:
+        sampled_frames.append(frames[last_frame_id])
+    return sampled_frames[:max_segments]
+
+
+class Qwen3VLReranker:
+    """Official Qwen3-VL reranker model wrapper.
+
+    Usage:
+        model = Qwen3VLReranker(model_name_or_path="Qwen/Qwen3-VL-Reranker-8B")
+        scores = model.process({
+            "instruction": "Retrieve relevant documents.",
+            "query": {"text": "search query"},
+            "documents": [{"text": "doc1"}, {"text": "doc2"}]
+        })
+    """
+
+    def __init__(
+        self,
+        model_name_or_path: str,
+        max_length: int = MAX_LENGTH,
+        min_pixels: int = MIN_PIXELS,
+        max_pixels: int = MAX_PIXELS,
+        total_pixels: int = MAX_TOTAL_PIXELS,
+        fps: float = FPS,
+        num_frames: int = MAX_FRAMES,
+        max_frames: int = MAX_FRAMES,
+        default_instruction: str = "Given a search query, retrieve relevant candidates that answer the query.",
+        **kwargs,
+    ):
+
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        self.max_length = max_length
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+        self.total_pixels = total_pixels
+        self.fps = fps
+        self.num_frames = num_frames
+        self.max_frames = max_frames
+
+        self.default_instruction = default_instruction
+
+        lm = Qwen3VLForConditionalGeneration.from_pretrained(
+            model_name_or_path, trust_remote_code=True, **kwargs
+        ).to(self.device)
+
+        self.model = lm.model
+        self.processor = AutoProcessor.from_pretrained(
+            model_name_or_path, trust_remote_code=True, padding_side="left"
+        )
+        self.model.eval()
+
+        token_true_id = self.processor.tokenizer.get_vocab()["yes"]
+        token_false_id = self.processor.tokenizer.get_vocab()["no"]
+        self.score_linear = self.get_binary_linear(lm, token_true_id, token_false_id)
+        self.score_linear.eval()
+        self.score_linear.to(self.device).to(self.model.dtype)
+
+        logger.info(
+            f"Initialized Qwen3VLReranker with yes/no scoring layer (device={self.device})"
+        )
+
+    def get_binary_linear(self, model, token_yes, token_no):
+        """Extract yes/no token weights from LM head and create scoring layer."""
+        lm_head_weights = model.lm_head.weight.data
+
+        weight_yes = lm_head_weights[token_yes]
+        weight_no = lm_head_weights[token_no]
+
+        D = weight_yes.size()[0]
+        linear_layer = torch.nn.Linear(D, 1, bias=False)
+        with torch.no_grad():
+            linear_layer.weight[0] = weight_yes - weight_no
+        return linear_layer
+
+    @torch.no_grad()
+    def compute_scores(self, inputs):
+        """Compute relevance scores using the binary linear layer."""
+        batch_scores = self.model(**inputs).last_hidden_state[:, -1]
+        scores = self.score_linear(batch_scores)
+        scores = torch.sigmoid(scores).squeeze(-1).cpu().detach().tolist()
+        return scores
+
+    def truncate_tokens_optimized(
+        self, tokens: List[str], max_length: int, special_tokens: List[str]
+    ) -> List[str]:
+        if len(tokens) <= max_length:
+            return tokens
+
+        special_tokens_set = set(special_tokens)
+
+        # Calculate budget: how many non-special tokens we can keep
+        num_special = sum(1 for token in tokens if token in special_tokens_set)
+        num_non_special_to_keep = max_length - num_special
+
+        # Build final list according to budget
+        final_tokens = []
+        non_special_kept_count = 0
+        for token in tokens:
+            if token in special_tokens_set:
+                final_tokens.append(token)
+            elif non_special_kept_count < num_non_special_to_keep:
+                final_tokens.append(token)
+                non_special_kept_count += 1
+
+        return final_tokens
+
+    def tokenize(self, pairs: list, **kwargs):
+        max_length = self.max_length
+        text = self.processor.apply_chat_template(
+            pairs, tokenize=False, add_generation_prompt=True
+        )
+        try:
+            images, videos, video_kwargs = process_vision_info(
+                pairs,
+                image_patch_size=16,
+                return_video_kwargs=True,
+                return_video_metadata=True,
+            )
+        except Exception as e:
+            logger.error(f"Error in processing vision info: {e}")
+            images = None
+            videos = None
+            video_kwargs = {"do_sample_frames": False}
+            text = self.processor.apply_chat_template(
+                [{"role": "user", "content": [{"type": "text", "text": "NULL"}]}],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+
+        if videos is not None:
+            videos, video_metadatas = zip(*videos)
+            videos, video_metadatas = list(videos), list(video_metadatas)
+        else:
+            video_metadatas = None
+        inputs = self.processor(
+            text=text,
+            images=images,
+            videos=videos,
+            video_metadata=video_metadatas,
+            truncation=False,
+            padding=False,
+            do_resize=False,
+            **video_kwargs,
+        )
+        for i, ele in enumerate(inputs["input_ids"]):
+            inputs["input_ids"][i] = (
+                self.truncate_tokens_optimized(
+                    inputs["input_ids"][i][:-5],
+                    max_length,
+                    self.processor.tokenizer.all_special_ids,
+                )
+                + inputs["input_ids"][i][-5:]
+            )
+        temp_inputs = self.processor.tokenizer.pad(
+            {"input_ids": inputs["input_ids"]},
+            padding=True,
+            return_tensors="pt",
+            max_length=self.max_length,
+        )
+        for key in temp_inputs:
+            inputs[key] = temp_inputs[key]
+        return inputs
+
+    def format_mm_content(
+        self,
+        text,
+        image,
+        video,
+        prefix="Query:",
+        fps=None,
+        max_frames=None,
+    ):
+        content = []
+
+        content.append({"type": "text", "text": prefix})
+        if not text and not image and not video:
+            content.append({"type": "text", "text": "NULL"})
+            return content
+
+        if video:
+            video_content = None
+            video_kwargs = {"total_pixels": self.total_pixels}
+            if isinstance(video, list):
+                video_content = video
+                if self.num_frames is not None or self.max_frames is not None:
+                    video_content = sample_frames(
+                        video_content, self.num_frames, self.max_frames
+                    )
+                video_content = [
+                    ("file://" + ele if isinstance(ele, str) else ele)
+                    for ele in video_content
+                ]
+            elif isinstance(video, str):
+                video_content = (
+                    video
+                    if video.startswith(("http://", "https://"))
+                    else "file://" + video
+                )
+                video_kwargs = {
+                    "fps": fps or self.fps,
+                    "max_frames": max_frames or self.max_frames,
+                }
+            else:
+                raise TypeError(f"Unrecognized video type: {type(video)}")
+
+            if video_content:
+                content.append({"type": "video", "video": video_content, **video_kwargs})
+
+        if image:
+            image_content = None
+            if isinstance(image, Image.Image):
+                image_content = image
+            elif isinstance(image, str):
+                image_content = (
+                    image if image.startswith(("http", "oss")) else "file://" + image
+                )
+            else:
+                raise TypeError(f"Unrecognized image type: {type(image)}")
+
+            if image_content:
+                content.append(
+                    {
+                        "type": "image",
+                        "image": image_content,
+                        "min_pixels": self.min_pixels,
+                        "max_pixels": self.max_pixels,
+                    }
+                )
+
+        if text:
+            content.append({"type": "text", "text": text})
+        return content
+
+    def format_mm_instruction(
+        self,
+        query_text,
+        query_image,
+        query_video,
+        doc_text,
+        doc_image,
+        doc_video,
+        instruction=None,
+        fps=None,
+        max_frames=None,
+    ):
+        inputs = []
+        inputs.append(
+            {
+                "role": "system",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": 'Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".',
+                    }
+                ],
+            }
+        )
+        if isinstance(query_text, tuple):
+            instruct, query_text = query_text
+        else:
+            instruct = instruction
+        contents = []
+        contents.append({"type": "text", "text": "<Instruct>: " + instruct})
+        query_content = self.format_mm_content(
+            query_text,
+            query_image,
+            query_video,
+            prefix="<Query>:",
+            fps=fps,
+            max_frames=max_frames,
+        )
+        contents.extend(query_content)
+        doc_content = self.format_mm_content(
+            doc_text,
+            doc_image,
+            doc_video,
+            prefix="\n<Document>:",
+            fps=fps,
+            max_frames=max_frames,
+        )
+        contents.extend(doc_content)
+        inputs.append({"role": "user", "content": contents})
+        return inputs
+
+    def process(self, inputs: Dict[str, Any]) -> List[float]:
+        """Score documents by relevance to query.
+
+        Args:
+            inputs: Dict with 'instruction', 'query', and 'documents' keys.
+                query and documents can have 'text', 'image', 'video' fields.
+
+        Returns:
+            List of relevance scores (0-1) for each document.
+        """
+        instruction = inputs.get("instruction", self.default_instruction)
+
+        query = inputs.get("query", {})
+        documents = inputs.get("documents", [])
+        if not query or not documents:
+            return []
+
+        pairs = [
+            self.format_mm_instruction(
+                query.get("text", None),
+                query.get("image", None),
+                query.get("video", None),
+                document.get("text", None),
+                document.get("image", None),
+                document.get("video", None),
+                instruction=instruction,
+                fps=inputs.get("fps", self.fps),
+                max_frames=inputs.get("max_frames", self.max_frames),
+            )
+            for document in documents
+        ]
+
+        final_scores = []
+        for pair in pairs:
+            tokenized_inputs = self.tokenize([pair])
+            tokenized_inputs = tokenized_inputs.to(self.model.device)
+            scores = self.compute_scores(tokenized_inputs)
+            final_scores.extend(scores)
+        return final_scores
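
And a matching sketch for the reranker (instruction, query, and document
texts are illustrative). Scores come back in document order, so ranking
is just an argsort:

    from scripts.qwen3_vl import Qwen3VLReranker

    reranker = Qwen3VLReranker(model_name_or_path="Qwen/Qwen3-VL-Reranker-8B")
    scores = reranker.process({
        "instruction": "Given a web search query, retrieve relevant passages.",
        "query": {"text": "how do vision-language models pool embeddings?"},
        "documents": [
            {"text": "Last-token pooling takes the final valid hidden state."},
            {"text": "A recipe for sourdough bread."},
        ],
    })
    # Sort document indices by descending relevance score
    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)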