Spaces:

TurkishCodeMan
/

multimodal-recipe-rag

Runtime error

App Files Files Community

TurkishCodeMan committed on Jan 24

Commit

4db9aa3

verified ·

1 Parent(s): bffd49d

Upload folder using huggingface_hub

Browse files

Files changed (9) hide show

models/local_nemotron/__init__.py +0 -0
models/local_nemotron/configuration_llama_nemotron_vl.py +136 -0
models/local_nemotron/modeling_llama_nemotron_vl.py +552 -0
models/local_nemotron/processing_llama_nemotron_vl.py +417 -0
models/local_nemotron_rerank/__init__.py +0 -0
models/local_nemotron_rerank/configuration_llama_nemotron_vl.py +164 -0
models/local_nemotron_rerank/modeling_llama_nemotron_vl.py +678 -0
models/local_nemotron_rerank/processing_llama_nemotron_vl.py +360 -0
models/model_loader.py +26 -8

models/local_nemotron/__init__.py ADDED Viewed

File without changes

models/local_nemotron/configuration_llama_nemotron_vl.py ADDED Viewed

	@@ -0,0 +1,136 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0.
+from typing import Optional
+from transformers.configuration_utils import PretrainedConfig
+from transformers.models.llama.configuration_llama import LlamaConfig
+from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+# ============================================================================
+# Bidirectional LLaMA Configuration
+# ============================================================================
class LlamaBidirectionalConfig(LlamaConfig):
    """LLaMA configuration for the bidirectional (non-causal) variant.

    Extends ``LlamaConfig`` with two extra settings that are stored on the
    instance:

    Args:
        pooling: Name of the pooling strategy (default ``"avg"``).
        temperature: Scalar stored alongside the pooling setting
            (default ``1.0``).
        **kwargs: Forwarded unchanged to ``LlamaConfig.__init__``.
    """

    model_type = "llama_bidirec"

    def __init__(self, pooling="avg", temperature=1.0, **kwargs):
        # The extra fields are assigned before the parent initializer runs,
        # preserving the original assignment order.
        self.pooling, self.temperature = pooling, temperature
        super().__init__(**kwargs)
+# ============================================================================
+# LlamaNemotronVL Configuration Classes
+# ============================================================================
class LlamaNemotronVLConfig(PretrainedConfig):
    """Composite configuration pairing a vision encoder with a language model.

    The configuration owns two nested configs, ``vision_config`` (SigLIP) and
    ``llm_config`` (bidirectional LLaMA), plus a set of scalar options that
    are stored verbatim on the instance.

    Args:
        vision_config: ``None`` (use SigLIP defaults), a ``SiglipVisionConfig``
            instance, or a dict whose ``"model_type"`` is
            ``"siglip_vision_model"``.
        llm_config: ``None`` (use bidirectional-LLaMA defaults), a
            ``LlamaBidirectionalConfig`` instance, or a dict whose first
            ``"architectures"`` entry is one of the supported bidirectional
            LLaMA classes.
        vocab_size: Accepted for backward compatibility; the stored
            ``vocab_size`` is always taken from ``llm_config``.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    All remaining keyword arguments are copied onto attributes of the same
    name.

    Raises:
        ValueError: If a dict-style sub-config names an unsupported vision
            model type or LLM architecture.
    """

    model_type = "llama_nemotron_vl"
    is_composition = True
    # is_composition was renamed to has_no_defaults_at_init in transformers 4.52.1
    # In PR https://github.com/huggingface/transformers/pull/36263
    has_no_defaults_at_init = True

    def __init__(
        self,
        vision_config=None,
        llm_config=None,
        use_backbone_lora=0,
        use_llm_lora=0,
        select_layer=-1,
        force_image_size=None,
        downsample_ratio=0.5,
        template=None,
        dynamic_image_size=False,
        use_thumbnail=False,
        min_dynamic_patch=1,
        max_dynamic_patch=6,
        mlp_checkpoint=True,
        pre_feature_reduction=False,
        keep_aspect_ratio=False,
        vocab_size=-1,
        q_max_length: Optional[int] = 512,
        p_max_length: Optional[int] = 10240,
        query_prefix: str = "query:",
        passage_prefix: str = "passage:",
        pooling: str = "last",
        bidirectional_attention: bool = False,
        max_input_tiles: int = 2,
        img_context_token_id: int = 128258,  # tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
        **kwargs,
    ):
        # --- vision sub-config -------------------------------------------
        if vision_config is None:
            logger.info(
                "vision_config is None. Initializing Vision Encoders with default values."
            )
            # Actually create the default config; previously nothing was
            # assigned and later attribute access raised AttributeError.
            self.vision_config = SiglipVisionConfig()
        elif isinstance(vision_config, SiglipVisionConfig):
            self.vision_config = vision_config
        elif vision_config["model_type"] == "siglip_vision_model":
            self.vision_config = SiglipVisionConfig(**vision_config)
        else:
            raise ValueError(
                "Unsupported model_type: {}".format(vision_config["model_type"])
            )

        # --- language-model sub-config -----------------------------------
        supported_architectures = {
            "LlamaBidirectionalModel",
            "LlamaBidirectionalForSequenceClassification",
        }
        if llm_config is None:
            logger.info(
                "llm_config is None. Initializing the LLM config with default values"
            )
            # Record the architecture so that a to_dict()/from_dict() round
            # trip of the default config passes the dict validation below.
            self.llm_config = LlamaBidirectionalConfig(
                architectures=["LlamaBidirectionalModel"]
            )
        elif isinstance(llm_config, LlamaBidirectionalConfig):
            self.llm_config = llm_config
        else:
            architecture = llm_config["architectures"][0]
            if architecture not in supported_architectures:
                raise ValueError(
                    "Unsupported architecture: {}".format(architecture)
                )
            self.llm_config = LlamaBidirectionalConfig(**llm_config)
        # The language model is the single source of truth for vocab size.
        self.vocab_size = self.llm_config.vocab_size

        # --- plain options, stored verbatim ------------------------------
        self.use_backbone_lora = use_backbone_lora
        self.use_llm_lora = use_llm_lora
        self.select_layer = select_layer
        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail = use_thumbnail
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.mlp_checkpoint = mlp_checkpoint
        self.pre_feature_reduction = pre_feature_reduction
        self.keep_aspect_ratio = keep_aspect_ratio
        self.q_max_length = q_max_length
        self.p_max_length = p_max_length
        self.query_prefix = query_prefix
        self.passage_prefix = passage_prefix
        self.pooling = pooling
        self.bidirectional_attention = bidirectional_attention
        self.img_context_token_id = img_context_token_id
        self.max_input_tiles = max_input_tiles
        super().__init__(**kwargs)

models/local_nemotron/modeling_llama_nemotron_vl.py ADDED Viewed

	@@ -0,0 +1,552 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0.
+import math
+from typing import List, Optional, Tuple, Union, Any, Dict
+import torch
+import torch.nn as nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers import AutoProcessor, PreTrainedModel, AutoConfig
+from transformers.cache_utils import Cache
+from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
+from transformers.modeling_outputs import (
+    CausalLMOutputWithPast,
+    SequenceClassifierOutputWithPast,
+)
+from transformers.models.llama.modeling_llama import (
+    LlamaForSequenceClassification,
+    LlamaModel,
+)
+from transformers.models.siglip.modeling_siglip import SiglipVisionModel
+from transformers.utils import logging
+from .configuration_llama_nemotron_vl import (
+    LlamaBidirectionalConfig,
+    LlamaNemotronVLConfig,
+)
+from .processing_llama_nemotron_vl import LlamaNemotronVLProcessor
+logger = logging.get_logger(__name__)
def split_model(model_path, device):
    """Build a ``device_map`` that spreads the LLM layers over the visible GPUs.

    Every GPU except ``device`` receives ``num_layers // (world_size - 1)``
    decoder layers; ``device`` receives the remainder and additionally hosts
    the vision model, the projector (``mlp1``), the embeddings, the final
    norm, the output heads, the rotary embedding and the last decoder layer.

    Args:
        model_path: Path or hub id passed to ``AutoConfig.from_pretrained``.
        device: Integer GPU index that hosts the non-layer modules. It is
            also used as a list index, so it must be ``< world_size`` when
            more than one GPU is visible.

    Returns:
        dict mapping module names to device indices.
    """
    world_size = torch.cuda.device_count()
    # NOTE: trust_remote_code=True executes code shipped with the checkpoint.
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    num_layers = config.llm_config.num_hidden_layers
    logger.info("world_size %s", world_size)

    device_map = {}
    if world_size <= 1:
        # Nothing to split across: the original formula divides by
        # (world_size - 1) and crashed here. Put every layer on `device`.
        for layer_idx in range(num_layers):
            device_map[f"language_model.model.layers.{layer_idx}"] = device
    else:
        base_share = num_layers // (world_size - 1)
        layers_per_gpu = [base_share] * world_size
        # `device` also carries the vision tower, so it only gets the leftover.
        layers_per_gpu[device] = num_layers - base_share * (world_size - 1)
        logger.info("layers per GPU: %s", layers_per_gpu)
        layer_idx = 0
        for gpu_idx, share in enumerate(layers_per_gpu):
            for _ in range(share):
                device_map[f"language_model.model.layers.{layer_idx}"] = gpu_idx
                layer_idx += 1

    # Modules that must live together with the inputs/outputs.
    for module_name in (
        "vision_model",
        "mlp1",
        "language_model.model.tok_embeddings",
        "language_model.model.embed_tokens",
        "language_model.output",
        "language_model.model.norm",
        "language_model.lm_head",
        "language_model.model.rotary_emb",
        f"language_model.model.layers.{num_layers - 1}",
    ):
        device_map[module_name] = device
    return device_map
def pool(
    last_hidden_states: torch.Tensor, attention_mask: torch.Tensor, pool_type: str
) -> torch.Tensor:
    """Reduce per-token hidden states to an embedding.

    Masked positions (``attention_mask == 0``) are zeroed before pooling.

    Args:
        last_hidden_states: Tensor indexed as ``(batch, seq, hidden)``.
        attention_mask: ``(batch, seq)`` mask; non-zero marks real tokens.
            Integer, boolean and floating-point masks are accepted.
        pool_type: One of

            * ``"avg"`` - mean over unmasked tokens (fully masked rows yield
              zeros instead of NaN);
            * ``"weighted_avg"`` - plain sum over unmasked tokens (no
              normalisation is applied here);
            * ``"cls"`` / ``"cls_last"`` - hidden state at position 0;
            * ``"last"`` - hidden state of the last real token, handling both
              left- and right-padded batches;
            * ``"colbert"`` - the masked per-token states, unpooled.

    Returns:
        The pooled tensor: ``(batch, hidden)`` for every mode except
        ``"colbert"``, which keeps ``(batch, seq, hidden)``.

    Raises:
        ValueError: If ``pool_type`` is not one of the supported names.
    """
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    if pool_type == "avg":
        # clamp avoids 0/0 -> NaN when a row has no unmasked token.
        token_counts = attention_mask.sum(dim=1)[..., None].clamp(min=1)
        emb = last_hidden.sum(dim=1) / token_counts
    elif pool_type == "weighted_avg":
        emb = last_hidden.sum(dim=1)
    elif pool_type in ("cls", "cls_last"):
        emb = last_hidden[:, 0]
    elif pool_type == "last":
        # If every row has a real token in the final slot, the batch is
        # left-padded (or unpadded) and the last position is the answer.
        left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
        if left_padding:
            emb = last_hidden[:, -1]
        else:
            # Cast to long: index tensors must be integral, and the mask may
            # arrive as float.
            sequence_lengths = attention_mask.sum(dim=1).long() - 1
            batch_size = last_hidden.shape[0]
            emb = last_hidden[
                torch.arange(batch_size, device=last_hidden.device), sequence_lengths
            ]
    elif pool_type == "colbert":
        emb = last_hidden
    else:
        raise ValueError(f"pool_type {pool_type} not supported")
    return emb
+# ============================================================================
+# Bidirectional LLaMA Model
+# ============================================================================
class LlamaBidirectionalModel(LlamaModel):
    """LLaMA model with bidirectional (non-causal) attention.

    The constructor forces the eager attention implementation and clears the
    ``is_causal`` flag on every decoder layer's self-attention module.
    ``_update_causal_mask`` is overridden to build a padding-only mask instead
    of a causal one.
    """

    config_class = LlamaBidirectionalConfig

    def __init__(self, config: LlamaBidirectionalConfig):
        # Force eager attention before super().__init__ triggers the
        # flash-attention availability checks.
        config._attn_implementation = "eager"
        if hasattr(config, "llm_config"):
            config.llm_config._attn_implementation = "eager"
        super().__init__(config)
        for layer in self.layers:
            layer.self_attn.is_causal = False

    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        """Return the attention mask in the form the attention backend expects.

        For ``flash_attention_2`` the 2-D mask is returned only when it
        contains padding, otherwise ``None``. For ``eager`` / ``sdpa`` the 2-D
        padding mask is expanded to 4-D without any causal component.

        Raises:
            ValueError: If the configured attention implementation is not one
                of ``flash_attention_2``, ``eager`` or ``sdpa``.
        """
        attn_implementation = self.config._attn_implementation
        # Explicit raise rather than assert: asserts vanish under ``-O``.
        if attn_implementation not in ("flash_attention_2", "eager", "sdpa"):
            raise ValueError(
                f"Unsupported attention implementation: {attn_implementation}, "
                "only support flash_attention_2, eager or sdpa"
            )
        if attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if attention_mask is None:
            # No mask supplied: attend to every position. Building an all-ones
            # mask keeps the 4-D expansion below from failing on None.
            attention_mask = torch.ones(
                input_tensor.shape[:2],
                dtype=torch.long,
                device=input_tensor.device,
            )
        return _prepare_4d_attention_mask(
            attention_mask,
            dtype=input_tensor.dtype,
        )
class LlamaBidirectionalForSequenceClassification(LlamaForSequenceClassification):
    """LLaMA sequence classification model with bidirectional attention.

    The backbone created by the parent class is replaced with a
    ``LlamaBidirectionalModel``; logits are computed from a pooled hidden
    state (``config.pooling``) and divided by ``config.temperature``.
    """

    config_class = LlamaBidirectionalConfig

    def __init__(self, config):
        super().__init__(config)
        # Releasing the parameters of LlamaModel created by parent
        del self.model
        self.model = LlamaBidirectionalModel(config)
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns a tuple when `return_dict` is false, otherwise a
        `SequenceClassifierOutputWithPast` whose `logits` are the pooled,
        temperature-scaled scores.
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        if attention_mask is None:
            # attention_mask is optional in the signature, but pooling needs
            # one; treat every position as a real token.
            attention_mask = torch.ones(
                hidden_states.shape[:2],
                dtype=torch.long,
                device=hidden_states.device,
            )
        pooled_hidden_states = pool(
            last_hidden_states=hidden_states,
            attention_mask=attention_mask,
            pool_type=self.config.pooling,
        )
        pooled_logits = self.score(pooled_hidden_states)
        pooled_logits = pooled_logits / self.config.temperature

        loss = None
        if labels is not None:
            labels = labels.to(pooled_logits.device)
            # Infer the problem type once and cache it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (
                    labels.dtype == torch.long or labels.dtype == torch.int
                ):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"
            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(
                    pooled_logits.view(-1, self.num_labels), labels.view(-1)
                )
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output
        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
+# ============================================================================
+# LlamaNemotronVL Model Classes
+# ============================================================================
class LlamaNemotronVLModel(PreTrainedModel):
    """Vision-language embedding model built from SigLIP and bidirectional LLaMA.

    Image tiles are encoded by the vision model, down-sampled with a pixel
    shuffle, projected into the language-model hidden size by ``mlp1`` and
    written into the text embedding sequence at the positions of the image
    context token. ``encode_queries`` / ``encode_documents`` return pooled
    hidden states of the language model.
    """

    config_class = LlamaNemotronVLConfig
    main_input_name = "pixel_values"
    _no_split_modules = ["LlamaDecoderLayer"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def __init__(
        self,
        config: LlamaNemotronVLConfig,
        vision_model: Optional[PreTrainedModel] = None,
        language_model: Optional[PreTrainedModel] = None,
    ):
        """Build (or adopt) the vision tower, language model and projector.

        Args:
            config: Composite configuration.
            vision_model: Optional pre-built vision encoder; when omitted a
                ``SiglipVisionModel`` is created from ``config.vision_config``.
            language_model: Optional pre-built language model; when omitted a
                ``LlamaBidirectionalModel`` is created from
                ``config.llm_config``.

        Raises:
            NotImplementedError: If a sub-model must be created but its
                configured type/architecture is not supported.
        """
        # Force eager attention before the parent constructor inspects it.
        config._attn_implementation = "eager"
        super().__init__(config)

        # Number of language-model tokens each image tile expands to.
        image_size = config.force_image_size or config.vision_config.image_size
        if hasattr(config.vision_config, "grid_size"):
            grid_size = config.vision_config.grid_size
            self.patch_size = 14
            self.num_image_token = int((grid_size * config.downsample_ratio) ** 2)
        else:
            patch_size = config.vision_config.patch_size
            self.patch_size = patch_size
            self.num_image_token = int(
                (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
            )
        self.select_layer = config.select_layer
        self.template = config.template
        self.downsample_ratio = config.downsample_ratio
        logger.info(f"num_image_token: {self.num_image_token}")

        if vision_model is not None:
            self.vision_model = vision_model
        else:
            if config.vision_config.model_type == "siglip_vision_model":
                config.vision_config._attn_implementation = config._attn_implementation
                self.vision_model = SiglipVisionModel(config.vision_config)
            else:
                raise NotImplementedError(
                    f"Unsupported vision model type: {config.vision_config.model_type}"
                )

        if language_model is not None:
            self.language_model = language_model
        else:
            if config.llm_config.architectures[0] == "LlamaBidirectionalModel":
                config.llm_config._attn_implementation = config._attn_implementation
                self.language_model = LlamaBidirectionalModel(config.llm_config)
            else:
                raise NotImplementedError(
                    f"{config.llm_config.architectures[0]} is not implemented."
                )

        # Vision-to-language projection. The pixel shuffle multiplies the
        # channel count by (1 / downsample_ratio) ** 2.
        vit_hidden_size = config.vision_config.hidden_size
        llm_hidden_size = config.llm_config.hidden_size
        projector_in = vit_hidden_size * int(1 / self.downsample_ratio) ** 2
        self.mlp1 = nn.Sequential(
            nn.LayerNorm(projector_in),
            nn.Linear(projector_in, llm_hidden_size),
            nn.GELU(),
            nn.Linear(llm_hidden_size, llm_hidden_size),
        )
        self.img_context_token_id = None
        # Initialize processor (loads from config.name_or_path).
        self.processor = AutoProcessor.from_pretrained(
            config.name_or_path, trust_remote_code=True
        )

    def _embed_batch(self, inputs: Dict[str, Any], pool_type: Optional[str] = None):
        """
        Encodes the inputs into a tensor of embeddings.
        Args:
            inputs: A dictionary of inputs to the model. You can prepare the inputs using the processor.process_queries and processor.process_documents methods.
            pool_type: The type of pooling to use. If None, the pooling type is set to the pooling type configured in the model.
        Returns:
            A tensor of embeddings.
        """
        inputs = {
            k: v.to(self.device) if isinstance(v, torch.Tensor) else v
            for k, v in inputs.items()
        }
        # Call the module itself (not .forward) so registered hooks run.
        outputs = self(**inputs, output_hidden_states=True, return_dict=True)
        if not pool_type:
            pool_type = self.config.pooling
        embeddings = pool(
            last_hidden_states=outputs.hidden_states[-1],
            attention_mask=inputs["attention_mask"],
            pool_type=pool_type,
        )
        return embeddings

    def encode_queries(self, queries: List[str], **kwargs):
        """
        Encodes the input queries into a tensor of embeddings.
        Args:
            queries: A list of queries.
            **kwargs: Forwarded to ``_embed_batch`` (e.g. ``pool_type``).
        Returns:
            A tensor of embeddings.
        """
        queries_dict = self.processor.process_queries(queries)
        queries_embeddings = self._embed_batch(inputs=queries_dict, **kwargs)
        return queries_embeddings

    def encode_documents(self, images: Optional[List[Any]] = None, texts: Optional[List[str]] = None, **kwargs):
        """
        Encodes the input document images and texts into a tensor of embeddings.
        Args:
            images: A list of PIL.Image of document pages images.
            texts: A list of document page texts.
            **kwargs: Forwarded to ``_embed_batch`` (e.g. ``pool_type``).
        Returns:
            A tensor of embeddings.
        Raises:
            ValueError: If neither ``images`` nor ``texts`` is provided, or if
                both are provided with different lengths.
        """
        if images and texts:
            # zip() would silently drop the unmatched tail; fail loudly instead.
            if len(images) != len(texts):
                raise ValueError(
                    f"images and texts must have the same length, got {len(images)} and {len(texts)}"
                )
            examples = [
                {"image": image, "text": doc_text}
                for image, doc_text in zip(images, texts)
            ]
        elif images:
            examples = [{"image": image, "text": ""} for image in images]
        elif texts:
            examples = [{"image": "", "text": doc_text} for doc_text in texts]
        else:
            raise ValueError("At least one of images or texts needs to be provided")
        docs_dict = self.processor.process_documents(examples)
        docs_embeddings = self._embed_batch(inputs=docs_dict, **kwargs)
        return docs_embeddings

    def forward(
        self,
        pixel_values: torch.FloatTensor = None,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        image_flags: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        num_patches_list: Optional[List[torch.Tensor]] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model over text embeddings with image features merged in.

        When ``pixel_values`` is given, the projected vision features replace
        the embeddings at every position whose id equals
        ``config.img_context_token_id``. ``image_flags`` (one entry per tile,
        ``1`` = keep) selects which tiles contribute; all tiles are kept when
        it is omitted. ``num_patches_list`` is accepted but not used here.

        Returns:
            A tuple when ``return_dict`` is false, otherwise a
            ``CausalLMOutputWithPast``. ``logits``/``loss`` are only populated
            when the wrapped language model produces logits.
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        # Get text embeddings
        input_embeds = self.language_model.get_input_embeddings()(input_ids)

        # Process and inject vision embeddings if present
        if pixel_values is not None:
            vit_embeds = self.extract_feature(pixel_values).to(
                device=input_embeds.device
            )
            if image_flags is None:
                image_flags = torch.ones(
                    pixel_values.shape[0],
                    dtype=torch.long,
                    device=vit_embeds.device,
                )
            # Flatten once to one flag per tile (handles (B,) and (B, 1)).
            keep_tile = image_flags.reshape(-1).to(vit_embeds.device) == 1
            vit_embeds = vit_embeds[keep_tile]

            # Inject vision tokens into text embeddings
            B, N, C = input_embeds.shape
            input_embeds = input_embeds.reshape(B * N, C)
            input_ids = input_ids.reshape(B * N)
            selected = (input_ids == self.config.img_context_token_id).to(
                input_embeds.device
            )
            vit_embeds = vit_embeds.reshape(-1, C)
            n_token = int(selected.sum())
            if vit_embeds.shape[0] != n_token:
                # Best effort, as before: keep only as many vision rows as
                # there are placeholder tokens (e.g. after text truncation).
                logger.warning(
                    "image token count mismatch: %d placeholder tokens, %d vision embeddings",
                    n_token,
                    vit_embeds.shape[0],
                )
                vit_embeds = vit_embeds[:n_token]
            # `* 0.0 +` keeps the original embedding rows in the autograd
            # graph while replacing their values.
            input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds
            input_embeds = input_embeds.reshape(B, N, C)

        # Forward through language model
        outputs = self.language_model(
            inputs_embeds=input_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        logits = None
        loss = None
        if hasattr(outputs, "logits"):
            logits = outputs.logits
            if labels is not None:
                # Shift so that tokens < n predict n
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = labels[..., 1:].contiguous()
                # Flatten the tokens
                loss_fct = CrossEntropyLoss()
                shift_logits = shift_logits.view(
                    -1, self.language_model.config.vocab_size
                )
                shift_labels = shift_labels.view(-1)
                # Enable model parallelism
                shift_labels = shift_labels.to(shift_logits.device)
                loss = loss_fct(shift_logits, shift_labels)
        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output
        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def pixel_shuffle(self, x, scale_factor=0.5):
        """Trade spatial resolution for channels on an ``(N, W, H, C)`` tensor.

        With ``scale_factor=0.5`` both spatial dimensions are halved and the
        channel dimension grows by a factor of four.
        """
        n, w, h, c = x.shape
        # N, W, H, C --> N, W, H * scale, C // scale
        x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
        # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
        x = x.permute(0, 2, 1, 3).contiguous()
        # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
        x = x.view(
            n,
            int(h * scale_factor),
            int(w * scale_factor),
            int(c / (scale_factor * scale_factor)),
        )
        x = x.permute(0, 2, 1, 3).contiguous()
        return x

    def extract_feature(self, pixel_values):
        """Extract and project vision features to language model space."""
        # Extract features from vision encoder
        if self.select_layer == -1:
            vit_embeds = self.vision_model(
                pixel_values=pixel_values, output_hidden_states=False, return_dict=True
            )
            if hasattr(vit_embeds, "last_hidden_state"):
                vit_embeds = vit_embeds.last_hidden_state
        else:
            vit_embeds = self.vision_model(
                pixel_values=pixel_values, output_hidden_states=True, return_dict=True
            ).hidden_states[self.select_layer]
        # Remove CLS token if not using SigLIP
        if not isinstance(self.vision_model, SiglipVisionModel):
            vit_embeds = vit_embeds[:, 1:, :]
        # Apply pixel shuffle and MLP projection; the token sequence is
        # treated as a square grid.
        _, n, c = vit_embeds.shape
        h = w = int(n**0.5)
        vit_embeds = vit_embeds.reshape(-1, h, w, c)  # (B, H, W, C)
        vit_embeds = self.pixel_shuffle(
            vit_embeds, scale_factor=self.downsample_ratio
        )  # (B, H/s, W/s, C*s*s)
        _, h_s, w_s, c_s = vit_embeds.shape
        vit_embeds = vit_embeds.reshape(-1, h_s * w_s, c_s)  # (B, (H/s)*(W/s), C*s*s)
        vit_embeds = self.mlp1(vit_embeds)
        return vit_embeds

    def get_input_embeddings(self):
        """Return the language model's input embedding module."""
        return self.language_model.get_input_embeddings()

    def get_output_embeddings(self):
        """Return the language model's output embedding module."""
        return self.language_model.get_output_embeddings()

    def build_collator(self, processor=None, **kwargs):
        """Return ``processor`` if given, otherwise the model's own processor."""
        return processor or self.processor

    def post_loss(self, loss, inputs):
        """Attach a zero-valued vision-encoder term to ``loss`` for text-only batches."""
        # Add Dummy Gradients for Vision Encoder to ensure multi-GPU synchronization when there are batches with only text samples
        # and other batches with images.
        if "pixel_values" in inputs and inputs["pixel_values"] is None:
            # NOTE(review): the dummy tile is hard-coded to 3x512x512 - confirm
            # this matches the vision encoder's expected input size.
            dummy_pixels = torch.zeros(
                1, 3, 512, 512, device=loss.device, dtype=self.vision_model.dtype
            )
            dummy_output = self.extract_feature(dummy_pixels)
            loss = loss + dummy_output.sum() * 0.0
        return loss

models/local_nemotron/processing_llama_nemotron_vl.py ADDED Viewed

	@@ -0,0 +1,417 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0.
+import base64
+import os
+from io import BytesIO
+from typing import Any, Dict, List, Optional, Union, Tuple
+import dataclasses
+from dataclasses import field
+import requests
+import torch
+import torchvision.transforms as T
+from PIL import Image
+from torchvision.transforms.functional import InterpolationMode
+from transformers import ProcessorMixin
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+SIGLIP_MEAN = (0.5, 0.5, 0.5)
+SIGLIP_STD = (0.5, 0.5, 0.5)
@dataclasses.dataclass
class Conversation:
    """Prompt builder holding a system message and a list of dialogue turns."""

    # Text placed at the very start of the prompt.
    system_message: str = ""
    # Pair of role markers for the two dialogue participants.
    roles: Tuple[str, str] = ("", "")
    # Dialogue turns, each stored as [role, content].
    messages: List[List[str]] = field(default_factory=list)
    # String appended after the system message and after each non-empty turn.
    sep: str = ""
    # Token ids associated with stopping generation (None when unset).
    stop_token_ids: List[int] = None

    def get_prompt(self) -> str:
        """Render the system message followed by every recorded turn.

        A turn with falsy content contributes only its role marker (no
        separator), which leaves the prompt open for that role.
        """
        pieces = [self.system_message + self.sep]
        for role, message in self.messages:
            pieces.append(role + message + self.sep if message else role)
        return "".join(pieces)

    def append_message(self, role: str, message: str):
        """Record one more turn at the end of the dialogue."""
        turn = [role, message]
        self.messages.append(turn)
def get_conv_template(name: str) -> Conversation:
    """Create a fresh ``Conversation`` with the default stop token ids.

    ``name`` is accepted for interface compatibility; it is not consulted.
    """
    default_stop_ids = [128259, 128001]
    return Conversation(stop_token_ids=default_stop_ids)
def load_image(image):
    """Resolve several image descriptions to a ``PIL.Image.Image``.

    Args:
        image: One of
            * a ``PIL.Image.Image`` (returned as is);
            * a string path to an existing file;
            * a dict with exactly one of the keys ``"disk_path"``,
              ``"base64"``, ``"url"`` or ``"bytes"`` (checked in that order).

    Returns:
        The opened image.

    Raises:
        ValueError: If ``image`` matches none of the supported forms.
        requests.HTTPError: If fetching a ``"url"`` returns an error status.
    """
    if isinstance(image, Image.Image):
        return image
    if isinstance(image, str) and os.path.exists(image):
        return Image.open(image)
    if isinstance(image, dict):
        if "disk_path" in image:
            return Image.open(image["disk_path"])
        if "base64" in image:
            return Image.open(BytesIO(base64.b64decode(image["base64"])))
        if "url" in image:
            # Bound the wait and surface HTTP errors directly, instead of
            # handing an error page to the image decoder.
            response = requests.get(image["url"], timeout=30)
            response.raise_for_status()
            return Image.open(BytesIO(response.content))
        if "bytes" in image:
            return Image.open(BytesIO(image["bytes"]))
    raise ValueError(f"Invalid image: {image}")
def build_transform(input_size, norm_type="imagenet"):
    """Create the tile preprocessing pipeline.

    The pipeline converts to RGB, resizes to ``input_size x input_size`` with
    bicubic interpolation, converts to a tensor and normalises.

    Args:
        input_size: Target side length in pixels.
        norm_type: ``"imagenet"`` or ``"siglip"``; selects the mean/std pair.

    Returns:
        A ``torchvision.transforms.Compose`` instance.

    Raises:
        ValueError: If ``norm_type`` is not a supported name (previously this
            surfaced as an ``UnboundLocalError``).
    """
    if norm_type == "imagenet":
        MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    elif norm_type == "siglip":
        MEAN, STD = SIGLIP_MEAN, SIGLIP_STD
    else:
        raise ValueError(
            f"Unsupported norm_type: {norm_type!r}; expected 'imagenet' or 'siglip'"
        )
    transform = T.Compose(
        [
            T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=MEAN, std=STD),
        ]
    )
    return transform
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the tiling grid that best fits an image.

    Each candidate ``(cols, rows)`` is scored by the product of two terms:
    how close its aspect ratio is to ``aspect_ratio``, and how much of the
    original image area the tiled grid covers, capped at 0.6 (covering more
    than 60% of the original area earns no extra credit).

    Returns the first candidate with the highest score, or ``(1, 1)`` when
    ``target_ratios`` is empty.
    """
    source_area = width * height
    winner = (1, 1)
    top_score = float("-inf")
    for candidate in target_ratios:
        cols, rows = candidate[0], candidate[1]
        candidate_aspect = cols / rows
        coverage = (cols * rows * image_size * image_size) / source_area
        capped_coverage = min(coverage, 0.6)
        closeness = min(
            candidate_aspect / aspect_ratio, aspect_ratio / candidate_aspect
        )
        score = capped_coverage * closeness
        # Strict comparison: on ties the earlier candidate is kept.
        if score > top_score:
            top_score, winner = score, candidate
    return winner
+def dynamic_preprocess(
+    image, min_num=1, max_num=6, image_size=448, use_thumbnail=False
+):
+    orig_width, orig_height = image.size
+    aspect_ratio = orig_width / orig_height
+    # calculate the existing image aspect ratio
+    target_ratios = set(
+        (i, j)
+        for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if i * j <= max_num and i * j >= min_num
+    )
+    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+    # find the closest aspect ratio to the target
+    target_aspect_ratio = find_closest_aspect_ratio(
+        aspect_ratio, target_ratios, orig_width, orig_height, image_size
+    )
+    # calculate the target width and height
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+    # resize the image
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size,
+        )
+        # split the image
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+    assert len(processed_images) == blocks
+    if use_thumbnail and len(processed_images) != 1:
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+    return processed_images
+class LlamaNemotronVLProcessor(ProcessorMixin):
+    attributes = ["tokenizer"]
+    tokenizer_class = "AutoTokenizer"
+    def __init__(
+        self,
+        tokenizer: Any,
+        q_max_length: Optional[int] = None,
+        p_max_length: Optional[int] = None,
+        pad_to_multiple_of: Optional[int] = None,
+        query_prefix: str = "query:",
+        passage_prefix: str = "passage:",
+        max_input_tiles: int = 6,
+        num_image_token: int = 128258,
+        dynamic_image_size: bool = True,
+        image_size: int = 512,
+        use_thumbnail: bool = True,
+        template: str = "bidirectional-llama-retriever",
+        num_channels: int = 3,
+        norm_type: str = "siglip",
+        system_message: str = "",
+        padding: Union[bool, str] = True,
+        **kwargs,
+    ):
+        tokens_to_keep = ["<box>", "</box>", "<ref>", "</ref>"]
+        tokenizer.additional_special_tokens = [
+            item
+            for item in tokenizer.additional_special_tokens
+            if item not in tokens_to_keep
+        ]
+        tokenizer.padding_side = "left"
+        tokenizer.model_input_names = tokenizer.model_input_names + ["pixel_values"]
+        self.tokenizer = tokenizer
+        self.q_max_length = q_max_length
+        self.p_max_length = p_max_length
+        self.pad_to_multiple_of = pad_to_multiple_of
+        self.query_prefix = query_prefix
+        self.passage_prefix = passage_prefix
+        self.max_input_tiles = max_input_tiles
+        self.num_image_token = num_image_token
+        self.dynamic_image_size = dynamic_image_size
+        self.image_size = image_size
+        self.use_thumbnail = use_thumbnail
+        self.template = template
+        self.num_channels = num_channels
+        self.norm_type = norm_type
+        self.system_message = system_message
+        self.padding = padding
+        super().__init__(self.tokenizer)
+    def process_documents(self, documents: Union[Dict, List[Dict]], **kwargs):
+        if isinstance(documents, dict):
+            images = documents["images"]
+            texts = documents["texts"]
+            assert len(texts) == len(images)
+        elif isinstance(documents, list):
+            images = [pair["image"] for pair in documents]
+            texts = [pair["text"] for pair in documents]
+        else:
+            raise ValueError("The documents need to be a dict or list of dicts")
+        contents, pil_images, max_input_tile_list, llm_onlys = [], [], [], []
+        for image, text in zip(images, texts):
+            prefix = ""
+            llm_only = True
+            if image is not None and image != "":
+                pil_images.append(load_image(image))
+                prefix = "<image>"
+                max_input_tile_list.append(self.max_input_tiles)
+                llm_only = False
+            else:
+                pil_images.append(None)
+                max_input_tile_list.append(self.max_input_tiles)
+            llm_onlys.append(llm_only)
+            # ToDo: Order is hardcoded and different than before. No \n after <image>
+            content = text
+            if prefix != "":
+                content = prefix + " " + content
+            if self.passage_prefix:
+                content = self.passage_prefix + " " + content
+            contents.append(content)
+        try:
+            assert len(max_input_tile_list) == len(pil_images), (
+                "The number of max_input_tile_list and pil_images should be the same."
+            )
+            assert len(max_input_tile_list) == len(contents), (
+                "The number of max_input_tile_list and pil_images should be the same."
+            )
+        except Exception as e:
+            print(f"Error: {e}")
+            print(
+                f"max_input_tile_list: {max_input_tile_list}, pil_images: {pil_images}"
+            )
+            raise e
+        transform = build_transform(
+            input_size=self.image_size, norm_type=self.norm_type
+        )
+        template = get_conv_template(self.template)
+        template.system_message = self.system_message
+        content_prompts = []
+        pixel_values_list = []
+        for content, pil_image, max_input_tiles, llm_only in zip(
+            contents, pil_images, max_input_tile_list, llm_onlys
+        ):
+            if pil_image is not None:
+                if self.dynamic_image_size:
+                    image_tiles = dynamic_preprocess(
+                        pil_image,
+                        image_size=self.image_size,
+                        max_num=max_input_tiles,
+                        use_thumbnail=self.use_thumbnail,
+                    )
+                else:
+                    image_tiles = [pil_image]
+                pixel_values = [transform(item) for item in image_tiles]
+                pixel_values = torch.stack(pixel_values).to(dtype=torch.bfloat16)
+                pixel_values_list.append(pixel_values)
+            else:
+                pixel_values = None
+            IMG_START_TOKEN = "<img>"
+            IMG_END_TOKEN = "</img>"
+            IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
+            if pixel_values is not None and "<image>" not in content and not llm_only:
+                content = "<image> " + content
+            # Reseting conversation messages
+            template.messages.clear()
+            # TODO: do we need this template?
+            template.append_message(template.roles[0], content)  # user
+            template.append_message(template.roles[1], None)  # assistant
+            content_prompt = template.get_prompt()
+            if pixel_values is not None:
+                num_patches = pixel_values.shape[0]
+                image_tokens = (
+                    IMG_START_TOKEN
+                    + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches
+                    + IMG_END_TOKEN
+                )
+                content_prompt = content_prompt.replace("<image>", image_tokens, 1)
+            content_prompts.append(content_prompt)
+        model_inputs = self.tokenizer(
+            content_prompts,
+            truncation=True,
+            max_length=self.p_max_length,
+            padding=self.padding,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            return_tensors="pt",
+        )
+        if len(pixel_values_list) > 1:
+            pixel_values_squeezed = torch.concat(pixel_values_list, axis=0)
+        elif len(pixel_values_list) == 1:
+            pixel_values_squeezed = pixel_values_list[0]
+        else:
+            pixel_values_squeezed = None
+        batch_docs = {
+            "input_ids": model_inputs["input_ids"],
+            "attention_mask": model_inputs["attention_mask"],
+            "pixel_values": None,
+        }
+        if pixel_values_squeezed is not None:
+            batch_docs["pixel_values"] = pixel_values_squeezed
+        return batch_docs
+    def process_queries(self, queries: List[str], **kwargs):
+        template = get_conv_template(self.template)
+        template.system_message = self.system_message
+        query_prompts = []
+        for query in queries:
+            if self.query_prefix:
+                query = f"{self.query_prefix} {query}"
+            # Reseting conversation messages
+            template.messages.clear()
+            template.append_message(template.roles[0], query)  # user
+            template.append_message(template.roles[1], None)  # assistant
+            query_prompt = template.get_prompt()
+            query_prompts.append(query_prompt)
+        batch_query = self.tokenizer(
+            query_prompts,
+            truncation=True,
+            max_length=self.q_max_length,
+            padding=self.padding,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            return_tensors="pt",
+        )
+        return batch_query
+    def process_queries_documents_biencoder(self, features: Dict, **kwargs):
+        """
+        (Pdb) features
+        [{'image': [<PIL.Image.Image image mode=RGB size=1275x1650 at 0x155059A5C3A0>, <PIL.Image.Image image mode=RGB size=1275x1650 at 0x155059A5C580>, <PIL.Image.Image image mode=RGB size=1275x1650 at 0x155059A5C940>], 'text': ['passage: ', 'passage: ', 'passage: '], 'question': "query: What change did Carl Rey suggest for the Strategic Plan's website objective deadline?"}, {'image': [<PIL.Image.Image image mode=RGB size=1275x1650 at 0x155059A5C0D0>, <PIL.Image.Image image mode=RGB size=1275x1650 at 0x155059A5DC00>, <PIL.Image.Image image mode=RGB size=1275x1650 at 0x155059A5EBF0>], 'text': ['passage: ', 'passage: ', 'passage: '], 'question': 'query: What are the name and TIN requirements for individuals with real estate transactions?'}, {'image': [<PIL.Image.Image image mode=RGB size=1275x1650 at 0x155059A5D390>, <PIL.Image.Image image mode=RGB size=1275x1650 at 0x155059A5C850>, <PIL.Image.Image image mode=RGB size=1275x1650 at 0x155059A5C070>], 'text': ['passage: ', 'passage: ', 'passage: '], 'question': 'query: How does Richard Hooker view human inclinations?'}]
+        """
+        queries = []
+        pos_neg_text_batch = []
+        pos_neg_image_batch = []
+        for feature in features:
+            queries.append(feature["question"])
+            pos_neg_text_batch.extend(feature["doc_text"])
+            pos_neg_image_batch.extend(feature["doc_image"])
+        query_batch_dict = self.process_queries(queries, **kwargs)
+        doc_batch_dict = self.process_documents(
+            {"images": pos_neg_image_batch, "texts": pos_neg_text_batch}, **kwargs
+        )
+        merged_batch_dict = self.merge_batch_dict(query_batch_dict, doc_batch_dict)
+        merged_batch_dict = self.add_dummy_labels(queries, merged_batch_dict)
+        return merged_batch_dict
+    def merge_batch_dict(self, query_batch_dict, doc_batch_dict):
+        q_prefix, d_prefix = "q_", "d_"
+        # merge into a single BatchEncoding by adding prefix
+        merged_batch_dict = {}
+        for k in list(query_batch_dict.keys()):
+            merged_batch_dict[q_prefix + k] = query_batch_dict[k]
+            del query_batch_dict[k]
+        for k in list(doc_batch_dict.keys()):
+            merged_batch_dict[d_prefix + k] = doc_batch_dict[k]
+            del doc_batch_dict[k]
+        return merged_batch_dict
+    def add_dummy_labels(self, questions, merged_batch_dict):
+        # dummy placeholder for field "labels", won't use it to compute loss
+        labels = torch.zeros(len(questions), dtype=torch.long)
+        merged_batch_dict["labels"] = labels
+        return merged_batch_dict

models/local_nemotron_rerank/__init__.py ADDED Viewed

File without changes

models/local_nemotron_rerank/configuration_llama_nemotron_vl.py ADDED Viewed

	@@ -0,0 +1,164 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0.
+from typing import Optional
+from transformers.configuration_utils import PretrainedConfig
+from transformers.models.llama.configuration_llama import LlamaConfig
+from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+# ============================================================================
+# Bidirectional LLaMA Configuration
+# ============================================================================
+class LlamaBidirectionalConfig(LlamaConfig):
+    """Configuration for bidirectional (non-causal) LLaMA model."""
+    model_type = "llama_bidirec"
+    def __init__(
+        self,
+        pooling="avg",
+        temperature=1.0,
+        **kwargs,
+    ):
+        self.pooling = pooling
+        self.temperature = temperature
+        super().__init__(**kwargs)
+# ============================================================================
+# LlamaNemotronVL Configuration Classes
+# ============================================================================
+class LlamaNemotronVLConfig(PretrainedConfig):
+    """
+    Base configuration for vision-language models combining vision and language components.
+    This serves as the foundation for LlamaNemotronVL configurations.
+    """
+    model_type = "llama_nemotron_vl"
+    is_composition = True
+    # is_composition was renamed to has_no_defaults_at_init in transformers 4.52.1
+    # In PR https://github.com/huggingface/transformers/pull/36263
+    has_no_defaults_at_init = True
+    def __init__(
+        self,
+        # Vision-language parameters
+        vision_config=None,
+        llm_config=None,
+        use_backbone_lora=0,
+        use_llm_lora=0,
+        select_layer=-1,
+        force_image_size=None,
+        downsample_ratio=0.5,
+        template=None,
+        dynamic_image_size=False,
+        use_thumbnail=False,
+        min_dynamic_patch=1,
+        max_dynamic_patch=6,
+        mlp_checkpoint=True,
+        pre_feature_reduction=False,
+        keep_aspect_ratio=False,
+        vocab_size=-1,
+        q_max_length: Optional[int] = 512,
+        p_max_length: Optional[int] = 10240,
+        query_prefix: str = "query:",
+        passage_prefix: str = "passage:",
+        pooling: str = "last",
+        bidirectional_attention: bool = False,
+        max_input_tiles: int = 2,
+        img_context_token_id: int = 128258,  # tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
+        **kwargs,
+    ):
+        # Initialize vision config
+        if vision_config is None:
+            vision_config = {}
+            logger.info(
+                "vision_config is None. Initializing Vision Encoders with default values."
+            )
+        else:
+            if vision_config["model_type"] == "siglip_vision_model":
+                self.vision_config = SiglipVisionConfig(**vision_config)
+            else:
+                raise ValueError(
+                    "Unsupported model_type: {}".format(vision_config["model_type"])
+                )
+        # Initialize LLM config
+        if llm_config is None:
+            llm_config = {}
+            logger.info(
+                "llm_config is None. Initializing the LLM config with default values"
+            )
+        else:
+            if llm_config["architectures"][0] in {
+                "LlamaBidirectionalModel",
+                "LlamaBidirectionalForSequenceClassification",
+            }:
+                self.llm_config = LlamaBidirectionalConfig(**llm_config)
+            else:
+                raise ValueError(
+                    "Unsupported architecture: {}".format(
+                        llm_config["architectures"][0]
+                    )
+                )
+            self.vocab_size = self.llm_config.vocab_size
+        # Vision-language parameters
+        self.use_backbone_lora = use_backbone_lora
+        self.use_llm_lora = use_llm_lora
+        self.select_layer = select_layer
+        self.force_image_size = force_image_size
+        self.downsample_ratio = downsample_ratio
+        self.template = template
+        self.dynamic_image_size = dynamic_image_size
+        self.use_thumbnail = use_thumbnail
+        self.min_dynamic_patch = min_dynamic_patch
+        self.max_dynamic_patch = max_dynamic_patch
+        self.mlp_checkpoint = mlp_checkpoint
+        self.pre_feature_reduction = pre_feature_reduction
+        self.keep_aspect_ratio = keep_aspect_ratio
+        # Reranking-specific parameters
+        self.q_max_length = q_max_length
+        self.p_max_length = p_max_length
+        self.query_prefix = query_prefix
+        self.passage_prefix = passage_prefix
+        self.pooling = pooling
+        self.bidirectional_attention = bidirectional_attention
+        self.img_context_token_id = img_context_token_id
+        self.max_input_tiles = max_input_tiles
+        super().__init__(**kwargs)
+class LlamaNemotronVLForSequenceClassificationConfig(LlamaNemotronVLConfig):
+    """
+    Configuration class for LlamaNemotron VL sequence classification model.
+    This configuration extends LlamaNemotronVLConfig with parameters specific to
+    sequence classification tasks (reranking).
+    """
+    model_type = "llama_nemotron_vl_rerank"
+    def __init__(
+        self,
+        rerank_max_length: Optional[int] = 512,
+        temperature: float = 1.0,
+        prompt_template: str = None,
+        **kwargs,
+    ):
+        self.rerank_max_length = rerank_max_length
+        self.temperature = temperature
+        self.prompt_template = prompt_template
+        super().__init__(**kwargs)

models/local_nemotron_rerank/modeling_llama_nemotron_vl.py ADDED Viewed

	@@ -0,0 +1,678 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0.
+import math
+from typing import List, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers import AutoProcessor, PreTrainedModel
+from transformers.cache_utils import Cache
+from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
+from transformers.modeling_outputs import (
+    CausalLMOutputWithPast,
+    SequenceClassifierOutputWithPast,
+)
+from transformers.models.llama.modeling_llama import (
+    LlamaForSequenceClassification,
+    LlamaModel,
+)
+from transformers.models.siglip.modeling_siglip import SiglipVisionModel
+from transformers.utils import logging
+from .configuration_llama_nemotron_vl import (
+    LlamaBidirectionalConfig,
+    LlamaNemotronVLConfig,
+    LlamaNemotronVLForSequenceClassificationConfig,
+)
+from .processing_llama_nemotron_vl import LlamaNemotronVLRerankProcessor
+logger = logging.get_logger(__name__)
+def pool(
+    last_hidden_states: torch.Tensor, attention_mask: torch.Tensor, pool_type: str
+) -> torch.Tensor:
+    """
+    Pool hidden states according to the specified pooling strategy.
+    Args:
+        last_hidden_states: Tensor of shape (batch_size, seq_len, hidden_size)
+        attention_mask: Tensor of shape (batch_size, seq_len)
+        pool_type: Pooling strategy ('avg', 'weighted_avg', 'cls', 'last', 'cls_last', 'colbert')
+    Returns:
+        Pooled embeddings
+    """
+    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
+    if pool_type == "avg":
+        emb = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+    elif pool_type == "weighted_avg":
+        emb = last_hidden.sum(dim=1)
+    elif pool_type == "cls":
+        emb = last_hidden[:, 0]
+    elif pool_type == "last":
+        left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
+        if left_padding:
+            emb = last_hidden[:, -1]
+        else:
+            sequence_lengths = attention_mask.sum(dim=1) - 1
+            batch_size = last_hidden.shape[0]
+            emb = last_hidden[
+                torch.arange(batch_size, device=last_hidden.device), sequence_lengths
+            ]
+    elif pool_type == "cls_last":
+        emb = last_hidden[:, 0]
+    elif pool_type == "colbert":
+        emb = last_hidden
+    else:
+        raise ValueError(f"pool_type {pool_type} not supported")
+    return emb
+# ============================================================================
+# Bidirectional LLaMA Model
+# ============================================================================
+class LlamaBidirectionalModel(LlamaModel):
+    """LLaMA model with bidirectional (non-causal) attention."""
+    config_class = LlamaBidirectionalConfig
+    def __init__(self, config: LlamaBidirectionalConfig):
+        # ✅ FIX: Force eager attention before super().__init__ triggers FA2 checks
+        config._attn_implementation = "eager"
+        if hasattr(config, 'llm_config'):
+             config.llm_config._attn_implementation = "eager"
+        super().__init__(config)
+        # Set non-causal attention for all layers
+        for layer in self.layers:
+            layer.self_attn.is_causal = False
+    def _update_causal_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool,
+    ):
+        """
+        Update causal mask for bidirectional attention.
+        Supports flash_attention_2, sdpa, and eager implementations.
+        """
+        if self.config._attn_implementation == "flash_attention_2":
+            # Flash Attention 2: only pass mask if there are actual masks
+            if attention_mask is not None and (attention_mask == 0.0).any():
+                return attention_mask
+            return None
+        elif self.config._attn_implementation == "sdpa":
+            # SDPA: prepare 4D attention mask for bidirectional attention
+            if attention_mask is not None:
+                # Convert 2D mask to 4D: (batch_size, 1, seq_len, seq_len)
+                causal_mask = _prepare_4d_attention_mask(
+                    attention_mask,
+                    dtype=input_tensor.dtype,
+                    tgt_len=input_tensor.shape[1],
+                )
+                return causal_mask
+            return None
+        elif self.config._attn_implementation == "eager":
+            # Eager: standard 4D attention mask
+            causal_mask = _prepare_4d_attention_mask(
+                attention_mask,
+                dtype=input_tensor.dtype,
+            )
+            return causal_mask
+        else:
+            raise ValueError(
+                f"Unsupported attention implementation: {self.config._attn_implementation}. "
+                "Supported values: ['flash_attention_2', 'sdpa', 'eager']"
+            )
+class LlamaBidirectionalForSequenceClassification(LlamaForSequenceClassification):
+    """LLaMA sequence classification model with bidirectional attention."""
+    config_class = LlamaBidirectionalConfig
+    def __init__(self, config):
+        super().__init__(config)
+        # Release the parameters of LlamaModel created by parent
+        del self.model
+        self.model = LlamaBidirectionalModel(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss.
+        """
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        pooled_hidden_states = pool(
+            last_hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            pool_type=self.config.pooling,
+        )
+        pooled_logits = self.score(pooled_hidden_states)
+        pooled_logits = pooled_logits / self.config.temperature
+        loss = None
+        if labels is not None:
+            labels = labels.to(pooled_logits.device)
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (
+                    labels.dtype == torch.long or labels.dtype == torch.int
+                ):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(
+                    pooled_logits.view(-1, self.num_labels), labels.view(-1)
+                )
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+# ============================================================================
+# LlamaNemotronVL Model Classes
+# ============================================================================
+class LlamaNemotronVLModel(PreTrainedModel):
+    """
+    LlamaNemotron VL model for vision-language reranking.
+    Combines a vision encoder (SigLIP) with a bidirectional language model (LLaMA)
+    for cross-modal reranking tasks.
+    Supports flash_attention_2, sdpa, and eager attention implementations.
+    """
+    config_class = LlamaNemotronVLConfig
+    main_input_name = "pixel_values"
+    _no_split_modules = ["LlamaDecoderLayer"]
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    def __init__(self, config: LlamaNemotronVLConfig, *model_args, **model_kwargs):
+        """Build the vision encoder, the bidirectional language model and the projector.
+        Sets ``config._attn_implementation`` to "eager", derives the number of
+        image tokens per tile, creates the SigLIP vision model, the
+        ``LlamaBidirectionalModel`` and the ``mlp1`` projection, and finally loads
+        a processor from ``config.name_or_path``.
+        Raises:
+            NotImplementedError: if the vision model type or the first LLM
+                architecture entry is not supported.
+        """
+        # Force eager attention on the top-level config before the base initializer runs.
+        config._attn_implementation = "eager"
+        super().__init__(config, *model_args, **model_kwargs)
+        # Calculate image token count
+        image_size = config.force_image_size or config.vision_config.image_size
+        if hasattr(config.vision_config, "grid_size"):
+            grid_size = config.vision_config.grid_size
+            # Patch size is hard-coded to 14 when the vision config exposes grid_size.
+            self.patch_size = 14
+            self.num_image_token = int((grid_size * config.downsample_ratio) ** 2)
+        else:
+            patch_size = config.vision_config.patch_size
+            self.patch_size = patch_size
+            # (patches per side) squared, scaled by the squared downsample ratio.
+            self.num_image_token = int(
+                (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
+            )
+        self.select_layer = config.select_layer
+        self.template = config.template
+        self.downsample_ratio = config.downsample_ratio
+        logger.info(f"num_image_token: {self.num_image_token}")
+        # Initialize vision encoder
+        if config.vision_config.model_type == "siglip_vision_model":
+            self.vision_model = SiglipVisionModel(config.vision_config)
+        else:
+            raise NotImplementedError(
+                f"Unsupported vision model type: {config.vision_config.model_type}"
+            )
+        # Pick an attention implementation for the LLM config only when none is set:
+        # "sdpa" if CUDA and scaled_dot_product_attention are available, else "eager".
+        # LlamaBidirectionalModel.__init__ sets its config to "eager" regardless, so
+        # the value chosen here is overwritten when the language model is built.
+        # NOTE(review): presumably the config always exposes _attn_implementation,
+        # which would make the first branch unreachable -- confirm.
+        if not hasattr(config.llm_config, '_attn_implementation'):
+            if torch.cuda.is_available() and hasattr(torch.nn.functional, 'scaled_dot_product_attention'):
+                config.llm_config._attn_implementation = "sdpa"
+                logger.info("Using SDPA attention implementation")
+            else:
+                config.llm_config._attn_implementation = "eager"
+                logger.info("Using eager attention implementation")
+        else:
+            logger.info(f"Using {config.llm_config._attn_implementation} attention implementation")
+        # Initialize language model (bidirectional for reranking)
+        if config.llm_config.architectures[0] in [
+            "LlamaBidirectionalModel",
+            "LlamaBidirectionalForSequenceClassification",
+        ]:
+            self.language_model = LlamaBidirectionalModel(config.llm_config)
+        else:
+            raise NotImplementedError(
+                f"{config.llm_config.architectures[0]} is not implemented for reranking."
+            )
+        # Vision-to-language projection
+        vit_hidden_size = config.vision_config.hidden_size
+        llm_hidden_size = config.llm_config.hidden_size
+        # The projector input width is the ViT width times int(1 / downsample_ratio) ** 2.
+        self.mlp1 = nn.Sequential(
+            nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
+            nn.Linear(
+                vit_hidden_size * int(1 / self.downsample_ratio) ** 2,
+                llm_hidden_size,
+            ),
+            nn.GELU(),
+            nn.Linear(llm_hidden_size, llm_hidden_size),
+        )
+        # Left unset here; the visible part of forward reads
+        # self.config.img_context_token_id instead of this attribute.
+        self.img_context_token_id = None
+        # Initialize processor
+        # NOTE(review): this loads the processor at construction time with
+        # trust_remote_code=True -- confirm that config.name_or_path is always trusted.
+        self.processor = AutoProcessor.from_pretrained(
+            config.name_or_path, trust_remote_code=True
+        )
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor = None,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        image_flags: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        num_patches_list: Optional[List[torch.Tensor]] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        """
+        Embed the input tokens, overwrite the image-context positions with
+        projected vision features, and run the language model.
+        Args:
+            pixel_values: Image tensor passed to ``extract_feature``; when None
+                the vision branch is skipped entirely
+            input_ids: Token IDs; positions equal to
+                ``self.config.img_context_token_id`` receive vision embeddings
+            attention_mask: Forwarded unchanged to the language model
+            position_ids: Forwarded unchanged to the language model
+            image_flags: Per-image selector; only rows equal to 1 are kept.
+                Defaults to all ones (one per row of ``pixel_values``)
+            past_key_values: Forwarded unchanged to the language model
+            labels: Used for a shifted cross-entropy loss, but only when the
+                language-model output exposes a ``logits`` attribute
+            use_cache: Forwarded unchanged to the language model
+            output_attentions: Forwarded unchanged to the language model
+            output_hidden_states: Forwarded unchanged to the language model
+            return_dict: Falls back to ``self.config.use_return_dict`` when None
+            num_patches_list: Accepted but not referenced in this method
+        Returns:
+            ``CausalLMOutputWithPast`` when ``return_dict`` is truthy, otherwise
+            a tuple (prefixed with the loss when one was computed)
+        """
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+        # Get text embeddings
+        input_embeds = self.language_model.get_input_embeddings()(input_ids)
+        # Process and inject vision embeddings if present
+        if pixel_values is not None:
+            if image_flags is None:
+                # No flags supplied: treat every image in the batch as real
+                image_flags = torch.ones(pixel_values.shape[0])
+            image_flags = image_flags.squeeze(-1)
+            vit_embeds = self.extract_feature(pixel_values).to(
+                device=input_embeds.device
+            )
+            if not isinstance(image_flags, list):
+                # NOTE(review): image_flags was already squeezed above; this
+                # second squeeze looks redundant — confirm before removing
+                image_flags = image_flags.squeeze(-1)
+                # Keep only the images flagged with 1
+                vit_embeds = vit_embeds[image_flags == 1]
+            # Inject vision tokens into text embeddings
+            B, N, C = input_embeds.shape
+            # Flatten batch and sequence so one boolean mask addresses every token
+            input_embeds = input_embeds.reshape(B * N, C)
+            input_ids = input_ids.reshape(B * N)
+            selected = input_ids == self.config.img_context_token_id
+            try:
+                # The `* 0.0 +` form replaces the value while keeping the
+                # original embedding in the expression (presumably to keep it
+                # in the autograd graph — TODO confirm)
+                input_embeds[selected] = input_embeds[
+                    selected
+                ] * 0.0 + vit_embeds.reshape(-1, C)
+            except Exception as e:
+                # Fallback: log the mismatch and use only the first n_token
+                # vision rows, where n_token is the number of selected positions
+                vit_embeds = vit_embeds.reshape(-1, C)
+                logger.warning(
+                    f"Shape mismatch in vision embedding injection: {e}, "
+                    f"input_embeds[selected].shape={input_embeds[selected].shape}, "
+                    f"vit_embeds.shape={vit_embeds.shape}"
+                )
+                n_token = selected.sum()
+                input_embeds[selected] = (
+                    input_embeds[selected] * 0.0 + vit_embeds[:n_token]
+                )
+            # Restore the (batch, sequence, hidden) layout
+            input_embeds = input_embeds.reshape(B, N, C)
+        # Forward through language model
+        outputs = self.language_model(
+            inputs_embeds=input_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+        logits = None
+        loss = None
+        # Logits (and therefore the loss) exist only if the wrapped language
+        # model returns them; otherwise both stay None
+        if hasattr(outputs, "logits"):
+            logits = outputs.logits
+            if labels is not None:
+                # Shift so that tokens < n predict n
+                shift_logits = logits[..., :-1, :].contiguous()
+                shift_labels = labels[..., 1:].contiguous()
+                # Flatten the tokens
+                loss_fct = CrossEntropyLoss()
+                shift_logits = shift_logits.view(
+                    -1, self.language_model.config.vocab_size
+                )
+                shift_labels = shift_labels.view(-1)
+                # Enable model parallelism
+                shift_labels = shift_labels.to(shift_logits.device)
+                loss = loss_fct(shift_logits, shift_labels)
+        if not return_dict:
+            # Tuple form: logits first, then the language-model outputs after
+            # their first element
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def pixel_shuffle(self, x, scale_factor=0.5):
+        """
+        Rearrange pixels for downsampling/upsampling.
+        Args:
+            x: Input tensor of shape (N, W, H, C)
+            scale_factor: Scaling factor for shuffle operation
+        Returns:
+            Shuffled tensor
+        """
+        n, w, h, c = x.shape
+        # N, W, H, C --> N, W, H * scale, C // scale
+        x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
+        # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
+        x = x.permute(0, 2, 1, 3).contiguous()
+        # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
+        x = x.view(
+            n,
+            int(h * scale_factor),
+            int(w * scale_factor),
+            int(c / (scale_factor * scale_factor)),
+        )
+        x = x.permute(0, 2, 1, 3).contiguous()
+        return x
+    def extract_feature(self, pixel_values):
+        """
+        Extract and project vision features to language model space.
+        Args:
+            pixel_values: Image tensor
+        Returns:
+            Projected vision embeddings
+        """
+        # Extract features from vision encoder
+        if self.select_layer == -1:
+            vit_embeds = self.vision_model(
+                pixel_values=pixel_values, output_hidden_states=False, return_dict=True
+            )
+            if hasattr(vit_embeds, "last_hidden_state"):
+                vit_embeds = vit_embeds.last_hidden_state
+        else:
+            vit_embeds = self.vision_model(
+                pixel_values=pixel_values, output_hidden_states=True, return_dict=True
+            ).hidden_states[self.select_layer]
+        # Remove CLS token if not using SigLIP
+        if not isinstance(self.vision_model, SiglipVisionModel):
+            vit_embeds = vit_embeds[:, 1:, :]
+        # Apply pixel shuffle and MLP projection
+        _, n, c = vit_embeds.shape
+        h = w = int(n**0.5)
+        vit_embeds = vit_embeds.reshape(-1, h, w, c)
+        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
+        _, h_s, w_s, c_s = vit_embeds.shape
+        vit_embeds = vit_embeds.reshape(-1, h_s * w_s, c_s)
+        vit_embeds = self.mlp1(vit_embeds)
+        return vit_embeds
+    def build_collator(self, tokenizer, **kwargs):
+        return self.processor
+    def post_loss(self, loss, inputs):
+        """
+        Add dummy gradients for vision encoder to ensure multi-GPU synchronization.
+        Args:
+            loss: Computed loss
+            inputs: Input dictionary
+        Returns:
+            Modified loss with dummy gradients
+        """
+        if "pixel_values" in inputs and inputs["pixel_values"] is None:
+            dummy_pixels = torch.zeros(
+                1, 3, 512, 512, device=loss.device, dtype=self.vision_model.dtype
+            )
+            dummy_output = self.extract_feature(dummy_pixels)
+            loss = loss + dummy_output.sum() * 0.0
+        return loss
+class CrossEncoderHead(nn.Linear):
+    """Classification head for cross-encoder reranking."""
+    pass
+class LlamaNemotronVLForSequenceClassification(PreTrainedModel):
+    """
+    LlamaNemotron VL model for sequence classification (reranking).
+    Supports flash_attention_2, sdpa, and eager attention implementations.
+    """
+    config_class = LlamaNemotronVLForSequenceClassificationConfig
+    base_model_prefix = "model"
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _no_split_modules = ["LlamaNemotronVLModel"]
+    def __init__(self, config, **kwargs):
+        super().__init__(config, **kwargs)
+        self.num_labels = config.num_labels
+        self.add_module("model", LlamaNemotronVLModel(config))
+        score = CrossEncoderHead(
+            config.llm_config.hidden_size,
+            self.num_labels,
+            bias=False,
+            dtype=torch.float32,
+        )
+        self.add_module("score", score)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def _init_weights(self, module):
+        """Initialize weights for the model."""
+        super()._init_weights(module)
+        if isinstance(module, CrossEncoderHead):
+            # Initialize cross-encoder head to avoid NaN/Inf loss
+            torch.nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor = None,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        image_flags: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        num_patches_list: Optional[List[torch.Tensor]] = None,
+    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        Forward pass for sequence classification.
+        Args:
+            pixel_values: Image pixel values
+            input_ids: Input token IDs
+            attention_mask: Attention mask
+            position_ids: Position IDs
+            image_flags: Flags indicating image presence
+            past_key_values: Cached key-value pairs
+            inputs_embeds: Input embeddings (alternative to input_ids)
+            labels: Labels for classification
+            use_cache: Whether to use KV cache
+            output_attentions: Whether to output attention weights
+            output_hidden_states: Whether to output hidden states
+            return_dict: Whether to return ModelOutput
+            num_patches_list: List of number of patches per image
+        Returns:
+            SequenceClassifierOutputWithPast or tuple
+        """
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+        transformer_outputs = self.model(
+            pixel_values=pixel_values,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            image_flags=image_flags,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=True,
+            return_dict=return_dict,
+            num_patches_list=num_patches_list,
+        )
+        hidden_states = transformer_outputs.hidden_states[-1]
+        pooled_hidden_states = pool(
+            last_hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            pool_type=self.config.pooling,
+        )
+        pooled_logits = self.score(pooled_hidden_states.to(self.score.weight.dtype))
+        pooled_logits = pooled_logits / self.config.temperature
+        if torch.isnan(pooled_logits).any():
+            raise ValueError("NaN detected in pooled_logits!")
+        loss = None
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+    def build_collator(self, tokenizer, **kwargs):
+        """Build data collator for reranking."""
+        rerank_max_length = kwargs.pop(
+            "rerank_max_length", self.config.rerank_max_length
+        )
+        max_input_tiles = kwargs.pop("max_input_tiles", self.config.max_input_tiles)
+        prompt_template = kwargs.pop("prompt_template", self.config.prompt_template)
+        return LlamaNemotronVLRerankProcessor(
+            tokenizer=tokenizer,
+            rerank_max_length=rerank_max_length,
+            max_input_tiles=max_input_tiles,
+            num_image_token=self.model.num_image_token,
+            prompt_template=prompt_template,
+            **kwargs,
+        )
+    def post_loss(self, loss, inputs):
+        """
+        Add dummy gradients for vision encoder to ensure multi-GPU synchronization.
+        Args:
+            loss: Computed loss
+            inputs: Input dictionary
+        Returns:
+            Modified loss with dummy gradients
+        """
+        if "pixel_values" in inputs and inputs["pixel_values"] is None:
+            dummy_pixels = torch.zeros(
+                1, 3, 512, 512, device=loss.device, dtype=self.model.vision_model.dtype
+            )
+            dummy_output = self.model.extract_feature(dummy_pixels)
+            loss = loss + dummy_output.sum() * 0.0
+        return loss

models/local_nemotron_rerank/processing_llama_nemotron_vl.py ADDED Viewed

	@@ -0,0 +1,360 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0.
+import base64
+import os
+from io import BytesIO
+from typing import Any, Dict, List, Optional, Union, Tuple
+import dataclasses
+from dataclasses import field
+import requests
+import torch
+import torchvision.transforms as T
+from PIL import Image
+from torchvision.transforms.functional import InterpolationMode
+from transformers import ProcessorMixin
+# Per-channel (mean, std) pairs handed to T.Normalize by build_transform:
+# the IMAGENET_* pair for norm_type="imagenet", the SIGLIP_* pair for "siglip".
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+SIGLIP_MEAN = (0.5, 0.5, 0.5)
+SIGLIP_STD = (0.5, 0.5, 0.5)
+@dataclasses.dataclass
+class Conversation:
+    """Manages prompt construction with system messages and multi-turn dialogues."""
+    # System instruction prepended to prompts
+    system_message: str = ""
+    # Role identifiers for dialogue turns
+    roles: Tuple[str, str] = ("", "")
+    # Message history as (role, content) pairs
+    messages: List[List[str]] = field(default_factory=list)
+    # Separator token between messages
+    sep: str = ""
+    # Token IDs that trigger generation stopping
+    stop_token_ids: List[int] = None
+    def get_prompt(self) -> str:
+        """Construct the formatted prompt string from system message and dialogue history."""
+        ret = self.system_message + self.sep
+        for role, message in self.messages:
+            if message:
+                ret += role + message + self.sep
+            else:
+                ret += role
+        return ret
+    def append_message(self, role: str, message: str):
+        """Add a message turn to the dialogue history."""
+        self.messages.append([role, message])
+def get_conv_template(name: str) -> Conversation:
+    """Initialize a conversation instance with default configuration."""
+    return Conversation(
+        stop_token_ids=[128259, 128001],
+    )
+def load_image(image):
+    if isinstance(image, Image.Image):
+        return image
+    elif isinstance(image, str) and os.path.exists(image):
+        return Image.open(image)
+    elif isinstance(image, dict):
+        if "disk_path" in image:
+            return Image.open(image["disk_path"])
+        elif "base64" in image:
+            return Image.open(BytesIO(base64.b64decode(image["base64"])))
+        elif "url" in image:
+            response = requests.get(image["url"])
+            return Image.open(BytesIO(response.content))
+        elif "bytes" in image:
+            return Image.open(BytesIO(image["bytes"]))
+        else:
+            raise ValueError(f"Invalid image: {image}")
+    else:
+        raise ValueError(f"Invalid image: {image}")
+def build_transform(input_size, norm_type="imagenet"):
+    if norm_type == "imagenet":
+        MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
+    elif norm_type == "siglip":
+        MEAN, STD = SIGLIP_MEAN, SIGLIP_STD
+    transform = T.Compose(
+        [
+            T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
+            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+            T.ToTensor(),
+            T.Normalize(mean=MEAN, std=STD),
+        ]
+    )
+    return transform
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    """
+    previous version mainly foucs on ratio.
+    We also consider area ratio here.
+    """
+    best_factor = float("-inf")
+    best_ratio = (1, 1)
+    area = width * height
+    for ratio in target_ratios:
+        target_aspect_ratio = ratio[0] / ratio[1]
+        area_ratio = (ratio[0] * ratio[1] * image_size * image_size) / area
+        # new area > 60% of original image area is enough.
+        factor_based_on_area_n_ratio = min(area_ratio, 0.6) * min(
+            target_aspect_ratio / aspect_ratio, aspect_ratio / target_aspect_ratio
+        )
+        if factor_based_on_area_n_ratio > best_factor:
+            best_factor = factor_based_on_area_n_ratio
+            best_ratio = ratio
+    return best_ratio
+def dynamic_preprocess(
+    image, min_num=1, max_num=6, image_size=448, use_thumbnail=False
+):
+    orig_width, orig_height = image.size
+    aspect_ratio = orig_width / orig_height
+    # calculate the existing image aspect ratio
+    target_ratios = set(
+        (i, j)
+        for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if i * j <= max_num and i * j >= min_num
+    )
+    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+    # find the closest aspect ratio to the target
+    target_aspect_ratio = find_closest_aspect_ratio(
+        aspect_ratio, target_ratios, orig_width, orig_height, image_size
+    )
+    # calculate the target width and height
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+    # resize the image
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size,
+        )
+        # split the image
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+    assert len(processed_images) == blocks
+    if use_thumbnail and len(processed_images) != 1:
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+    return processed_images
+class LlamaNemotronVLRerankProcessor(ProcessorMixin):
+    attributes = ["tokenizer"]
+    tokenizer_class = "AutoTokenizer"
+    def __init__(
+        self,
+        tokenizer: Any,
+        padding: Union[bool, str] = True,
+        rerank_max_length: Optional[int] = 512,
+        pad_to_multiple_of: Optional[int] = None,
+        max_input_tiles: int = 2,
+        num_image_token: int = None,
+        prompt_template: str = None,
+        force_image_size: int = 512,
+        template: str = "bidirectional-llama-retriever",
+        dynamic_image_size: bool = True,
+        use_thumbnail: bool = True,
+        **kwargs,
+    ):
+        self.padding = padding
+        self.rerank_max_length = rerank_max_length
+        self.pad_to_multiple_of = pad_to_multiple_of
+        tokens_to_keep = ["<box>", "</box>", "<ref>", "</ref>"]
+        tokenizer.additional_special_tokens = [
+            item
+            for item in tokenizer.additional_special_tokens
+            if item not in tokens_to_keep
+        ]
+        tokenizer.padding_side = "left"
+        self.tokenizer = tokenizer
+        self.norm_type = "siglip"
+        self.image_size = force_image_size
+        self.max_input_tiles = max_input_tiles
+        self.num_image_token = num_image_token
+        self.system_message = ""
+        self.prompt_template = prompt_template
+        self.template = template
+        self.dynamic_image_size = dynamic_image_size
+        self.use_thumbnail = use_thumbnail
+        super().__init__(self.tokenizer)
+    def process_query_documents(self, documents: Union[Dict, List[Dict]], **kwargs):
+        if isinstance(documents, dict):
+            images = documents["images"]
+            texts = documents["texts"]
+            assert len(texts) == len(images)
+        elif isinstance(documents, list):
+            images = [pair["image"] for pair in documents]
+            texts = [pair["text"] for pair in documents]
+        else:
+            raise ValueError("The documents need to be a dict or list of dicts")
+        contents, pil_images, max_input_tile_list, llm_onlys = [], [], [], []
+        for image, text in zip(images, texts):
+            prefix = ""
+            llm_only = True
+            if image is not None and image != "":
+                pil_images.append(load_image(image))
+                prefix = "<image>"
+                max_input_tile_list.append(self.max_input_tiles)
+                llm_only = False
+            else:
+                pil_images.append(None)
+                max_input_tile_list.append(self.max_input_tiles)
+            llm_onlys.append(llm_only)
+            # ToDo: Order is hardcoded and different than before. No \n after <image>
+            content = text
+            if prefix != "":
+                content = prefix + " " + content
+            contents.append(content)
+        assert len(max_input_tile_list) == len(pil_images), (
+            "The number of max_input_tile_list and pil_images should be the same."
+        )
+        assert len(max_input_tile_list) == len(contents), (
+            "The number of max_input_tile_list and contents should be the same."
+        )
+        transform = build_transform(
+            input_size=self.image_size, norm_type=self.norm_type
+        )
+        template = get_conv_template(self.template)
+        template.system_message = self.system_message
+        content_prompts = []
+        pixel_values_list = []
+        for content, pil_image, max_input_tiles, llm_only in zip(
+            contents, pil_images, max_input_tile_list, llm_onlys
+        ):
+            if pil_image is not None:
+                if self.dynamic_image_size:
+                    image_tiles = dynamic_preprocess(
+                        pil_image,
+                        image_size=self.image_size,
+                        max_num=max_input_tiles,
+                        use_thumbnail=self.use_thumbnail,
+                    )
+                else:
+                    image_tiles = [pil_image]
+                pixel_values = [transform(item) for item in image_tiles]
+                pixel_values = torch.stack(pixel_values).to(dtype=torch.bfloat16)
+                # print(f'Split images to {pixel_values[0].shape}')
+                pixel_values_list.append(pixel_values)
+            else:
+                pixel_values = None
+            IMG_START_TOKEN = "<img>"
+            IMG_END_TOKEN = "</img>"
+            IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
+            if pixel_values is not None and "<image>" not in content and not llm_only:
+                content = "<image> " + content
+            # Reseting conversation messages
+            template.messages.clear()
+            # TODO: do we need this template?
+            template.append_message(template.roles[0], content)  # user
+            template.append_message(template.roles[1], None)  # assistant
+            content_prompt = template.get_prompt()
+            if "<image>" in content:
+                num_patches = pixel_values.shape[0]
+                image_tokens = (
+                    IMG_START_TOKEN
+                    + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches
+                    + IMG_END_TOKEN
+                )
+                content_prompt = content_prompt.replace("<image>", image_tokens, 1)
+            content_prompts.append(content_prompt)
+        model_inputs = self.tokenizer(
+            content_prompts,
+            truncation=True,
+            max_length=self.rerank_max_length,
+            padding=self.padding,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            return_tensors="pt",
+        )
+        if len(pixel_values_list) > 1:
+            pixel_values_squeezed = torch.concat(pixel_values_list, axis=0)
+        elif len(pixel_values_list) == 1:
+            pixel_values_squeezed = pixel_values_list[0]
+        else:
+            pixel_values_squeezed = None
+        batch_docs = {
+            "input_ids": model_inputs["input_ids"],
+            "attention_mask": model_inputs["attention_mask"],
+            "pixel_values": None,
+        }
+        if pixel_values_squeezed is not None:
+            batch_docs["pixel_values"] = pixel_values_squeezed
+        return batch_docs
+    def prompt_template_question_passage(self, question, text):
+        return f"question:{question} \n \n passage:{text}"
+    def process_queries_documents_crossencoder(self, features: List[Dict], **kwargs):
+        images = [feature["doc_image"] for feature in features]
+        if self.prompt_template == "v1":
+            questions_texts = [
+                self.prompt_template_question_passage(
+                    feature["question"], feature["doc_text"]
+                )
+                for feature in features
+            ]
+        else:
+            questions_texts = [
+                f"{feature['question']} \n {feature['doc_text']}"
+                for feature in features
+            ]
+        batch_dict = self.process_query_documents(
+            {"images": images, "texts": questions_texts}, **kwargs
+        )
+        if "num_labels" in features[0]:
+            batch_dict["labels"] = torch.zeros(
+                features[0]["num_labels"], dtype=torch.long
+            )
+        return batch_dict

models/model_loader.py CHANGED Viewed

@@ -9,17 +9,23 @@ def load_embed_model(model_path: str = "nvidia/llama-nemotron-embed-vl-1b-v2"):
     print(f"🔄 Loading embedding model on {device}...")
     config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-    # ✅ FIX: Removed SDPA config override which causes issues in HF Spaces
-    # ✅ FIX: Use manual device instead of device_map="auto"
-    model = AutoModel.from_pretrained(
         model_path,
         config=config,
         torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
-        trust_remote_code=True,
-        low_cpu_mem_usage=True,  # ✅ CPU optimization
-        attn_implementation="eager", # ✅ FIX: Force eager execution
     ).to(device).eval()
     print(f"✅ Embedding model loaded on {device}")
@@ -34,10 +40,22 @@ def load_rerank_model(model_path: str = "nvidia/llama-nemotron-rerank-vl-1b-v2")
     print(f"🔄 Loading reranking model on {device}...")
     # ✅ FIX: Use manual device instead of device_map="auto"
-    model = AutoModelForSequenceClassification.from_pretrained(
         model_path,
         torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
-        trust_remote_code=True,
         attn_implementation="eager",
     ).to(device).eval()

     print(f"🔄 Loading embedding model on {device}...")
+    # ✅ FIX: Load CONFIG from hub but CODE from local patched file
     config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    # Import local patched model class
+    import sys
+    import os
+    sys.path.append(os.path.join(os.path.dirname(__file__), "local_nemotron"))
+    from local_nemotron.modeling_llama_nemotron_vl import LlamaNemotronVLModel
+    # Initialize model using local class
+    model = LlamaNemotronVLModel.from_pretrained(
         model_path,
         config=config,
         torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+        trust_remote_code=False, # We are using local code now
+        low_cpu_mem_usage=True,
+        # attn_implementation="eager", # Explicitly set in __init__ patch now
     ).to(device).eval()
     print(f"✅ Embedding model loaded on {device}")
     print(f"🔄 Loading reranking model on {device}...")
     # ✅ FIX: Use manual device instead of device_map="auto"
+    # ✅ FIX: Load CONFIG from hub but CODE from local patched file
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    # Import local patched model class
+    import sys
+    import os
+    sys.path.append(os.path.join(os.path.dirname(__file__), "local_nemotron_rerank"))
+    # Rerank model usually uses ForSequenceClassification variant, checking imports
+    from local_nemotron_rerank.modeling_llama_nemotron_vl import LlamaNemotronVLForSequenceClassification
+    # Initialize model using local class
+    model = LlamaNemotronVLForSequenceClassification.from_pretrained(
         model_path,
+        config=config,
         torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+        trust_remote_code=False,
         attn_implementation="eager",
     ).to(device).eval()