working

Browse files

Files changed (4) hide show

builders.py +1 -3
config.json +2 -1
modeling_blip3o_qwen.py +19 -3
vision_tower.py +92 -130

builders.py CHANGED Viewed

@@ -7,7 +7,7 @@ from typing import Any
 import torch.nn as nn
 from .diffusion_auto import AutoDiffusionModel, DiffusionConfig
-from .vision_tower import AutoEvaClipVisionTower, default_device, resolve_eva_repo
 logger = logging.getLogger(__name__)
 _PKG_ROOT = Path(__file__).resolve().parent
@@ -79,9 +79,7 @@ def build_down_projector(config, delay_load: bool = False, **kwargs):
 def build_gen_vision_tower(config, delay_load: bool = False, **kwargs):
     """Instantiate the EVA-CLIP tower purely from HF Hub assets."""
-    repo_id = resolve_eva_repo(config)
     tower = AutoEvaClipVisionTower(
-        repo_id,
         config=config,
         torch_dtype=kwargs.get("torch_dtype"),
         device=default_device(kwargs.get("device")),

 import torch.nn as nn
 from .diffusion_auto import AutoDiffusionModel, DiffusionConfig
+from .vision_tower import AutoEvaClipVisionTower, default_device
 logger = logging.getLogger(__name__)
 _PKG_ROOT = Path(__file__).resolve().parent
 def build_gen_vision_tower(config, delay_load: bool = False, **kwargs):
     """Instantiate the EVA-CLIP tower purely from HF Hub assets."""
     tower = AutoEvaClipVisionTower(
         config=config,
         torch_dtype=kwargs.get("torch_dtype"),
         device=default_device(kwargs.get("device")),

config.json CHANGED Viewed

@@ -78,8 +78,9 @@
   },
   "vision_end_token_id": 151653,
   "vision_start_token_id": 151652,
   "vision_token_id": 151654,
-  "vision_tower_pretrained": null,
   "vocab_size": 151668,
   "auto_map": {
     "AutoConfig": "modeling_blip3o_qwen.blip3oQwenConfig",

   },
   "vision_end_token_id": 151653,
   "vision_start_token_id": 151652,
+  "eva_image_size": 448,
   "vision_token_id": 151654,
+  "vision_tower_pretrained": "model_zoo/EVA-CLIP-E14-Plus",
   "vocab_size": 151668,
   "auto_map": {
     "AutoConfig": "modeling_blip3o_qwen.blip3oQwenConfig",

modeling_blip3o_qwen.py CHANGED Viewed

@@ -22,6 +22,9 @@ from transformers import (
     Qwen2_5_VLModel,
 )
 from transformers.generation.utils import GenerateOutput
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from .builders import build_dit, build_gen_vision_tower, build_down_projector
@@ -37,6 +40,12 @@ IGNORE_INDEX = -100
 IMAGE_TOKEN_IDX = 151667
 class blip3oMetaModel:
     def __init__(self, config):
         super(blip3oMetaModel, self).__init__(config)
@@ -439,12 +448,13 @@ class blip3oQwenForCausalLM(Qwen2_5_VLForConditionalGeneration, blip3oMetaForCau
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-        gen_image: Optional[torch.FloatTensor] = None,
         pixel_values: Optional[torch.Tensor] = None,
         image_grid_thw: Optional[torch.Tensor] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -491,6 +501,8 @@ class blip3oQwenForCausalLM(Qwen2_5_VLForConditionalGeneration, blip3oMetaForCau
         logits = logits.float()
         total_loss = None
         if labels is not None:
             # Shift so that tokens < n predict n
             shift_logits = logits[..., :-1, :].contiguous()
@@ -564,14 +576,18 @@ class blip3oQwenForCausalLM(Qwen2_5_VLForConditionalGeneration, blip3oMetaForCau
             text_weight = getattr(self.config, "text_loss_weight", 1.0)
             img_weight = getattr(self.config, "img_loss_weight", 1.0)
             total_loss = text_weight * text_loss + img_weight * img_loss
-            print(f"text loss {text_loss} | img loss {img_loss}")
-        return CausalLMOutputWithPast(
             loss=total_loss,
             logits=logits,
             past_key_values=outputs.past_key_values,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )

     Qwen2_5_VLModel,
 )
 from transformers.generation.utils import GenerateOutput
+from dataclasses import dataclass
+from typing import Optional
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from .builders import build_dit, build_gen_vision_tower, build_down_projector
 IMAGE_TOKEN_IDX = 151667
+@dataclass
+class Blip3oCausalLMOutput(CausalLMOutputWithPast):
+    """Causal-LM output that additionally carries the two component losses."""
+    # Text loss term; stays None unless the caller supplies it.
+    text_loss: Optional[torch.FloatTensor] = None
+    # Image loss term; stays None unless the caller supplies it.
+    img_loss: Optional[torch.FloatTensor] = None
 class blip3oMetaModel:
     def __init__(self, config):
         super(blip3oMetaModel, self).__init__(config)
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        gen_images: Optional[torch.FloatTensor] = None,
         pixel_values: Optional[torch.Tensor] = None,
         image_grid_thw: Optional[torch.Tensor] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None
     ) -> Union[Tuple, CausalLMOutputWithPast]:
+        gen_image=gen_images
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
         logits = logits.float()
         total_loss = None
+        text_loss = None
+        img_loss = None
         if labels is not None:
             # Shift so that tokens < n predict n
             shift_logits = logits[..., :-1, :].contiguous()
             text_weight = getattr(self.config, "text_loss_weight", 1.0)
             img_weight = getattr(self.config, "img_loss_weight", 1.0)
             total_loss = text_weight * text_loss + img_weight * img_loss
+            # cache latest component losses for logging
+            self._last_text_loss = float(text_loss.detach().mean().cpu())
+            self._last_img_loss = float(img_loss.detach().mean().cpu())
+        return Blip3oCausalLMOutput(
             loss=total_loss,
             logits=logits,
             past_key_values=outputs.past_key_values,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
+            text_loss=text_loss,
+            img_loss=img_loss,
         )

vision_tower.py CHANGED Viewed

@@ -1,103 +1,10 @@
 from __future__ import annotations
-import importlib
-import os
-import sys
-from pathlib import Path
-from types import SimpleNamespace
 from typing import Optional, Union
 import torch
 import torch.nn as nn
-from huggingface_hub import snapshot_download
-_DEFAULT_EVA_REPO = os.environ.get("BLIP3O_EVA_ID", "orrzohar/EVA-CLIP-E14-Plus")
-_EVA_ALIASES = {
-    "eva-clip-e-14-plus": "orrzohar/EVA-CLIP-E14-Plus",
-    "eva_clip_e_14_plus": "orrzohar/EVA-CLIP-E14-Plus",
-    "orrzohar/eva-clip-e14-plus": "orrzohar/EVA-CLIP-E14-Plus",
-}
-_PKG_ROOT = Path(__file__).resolve().parent
-_EVA_ROOT = (_PKG_ROOT.parent / "EVA-CLIP-E14-Plus").resolve()
-_LEGACY_TOWER_CLASS = None
-def _normalize_candidate(candidate: Optional[str]) -> Optional[str]:
-    if candidate is None:
-        return None
-    candidate = candidate.strip()
-    return candidate or None
-def _ensure_code_on_path(path_hint: str) -> Path:
-    """Make sure the legacy_eva_clip package is importable."""
-    path = Path(path_hint)
-    search_roots: list[Path] = []
-    if path.exists():
-        search_roots.append(path if path.is_dir() else path.parent)
-    if _EVA_ROOT.exists():
-        search_roots.append(_EVA_ROOT)
-    for root in search_roots:
-        pkg_dir = root / "legacy_eva_clip"
-        if pkg_dir.exists():
-            if str(root) not in sys.path:
-                sys.path.insert(0, str(root))
-            return root
-    repo_id, _, revision = _DEFAULT_EVA_REPO.partition("@")
-    download_root = Path(snapshot_download(repo_id=repo_id, revision=revision or None))
-    if str(download_root) not in sys.path:
-        sys.path.insert(0, str(download_root))
-    return download_root
-def _get_legacy_tower_class(code_root: Path):
-    """Import the legacy EVA tower implementation, caching the class."""
-    global _LEGACY_TOWER_CLASS
-    if _LEGACY_TOWER_CLASS is not None:
-        return _LEGACY_TOWER_CLASS
-    if str(code_root) not in sys.path:
-        sys.path.insert(0, str(code_root))
-    module = importlib.import_module("legacy_eva_clip.eva_clip_encoder")
-    _LEGACY_TOWER_CLASS = module.EvaClipVisionTower
-    return _LEGACY_TOWER_CLASS
-def resolve_eva_repo(config=None, fallback: Optional[str] = None) -> str:
-    """Return a concrete path (file or directory) containing EVA weights."""
-    candidate = _normalize_candidate(
-        fallback
-        or os.environ.get("BLIP3O_EVA_ID")
-        or (getattr(config, "vision_tower_pretrained", None) if config is not None else None)
-        or (getattr(config, "gen_vision_tower", None) if config is not None else None)
-        or _DEFAULT_EVA_REPO
-    )
-    if candidate is None:
-        raise ValueError("Unable to determine EVA checkpoint location.")
-    candidate_path = Path(candidate)
-    potential_paths = [
-        candidate_path,
-        (_PKG_ROOT / candidate) if not candidate_path.is_absolute() else None,
-        _EVA_ROOT if _EVA_ROOT.exists() else None,
-    ]
-    for path in potential_paths:
-        if path and path.exists():
-            return str(path.resolve())
-    alias = _EVA_ALIASES.get(candidate.lower())
-    repo_spec = alias or candidate
-    repo_id, sep, revision = repo_spec.partition("@")
-    download_path = snapshot_download(repo_id=repo_id, revision=revision or None)
-    return download_path
 def default_device(spec: Optional[Union[str, torch.device]] = None) -> torch.device:
@@ -109,56 +16,111 @@ def default_device(spec: Optional[Union[str, torch.device]] = None) -> torch.dev
 class AutoEvaClipVisionTower(nn.Module):
-    """Wrapper that dynamically loads the EVA tower code + weights from HF."""
     def __init__(
         self,
-        repo_id: Optional[str] = None,
-        *,
-        config=None,
         torch_dtype: torch.dtype | None = None,
         device: Optional[Union[str, torch.device]] = None,
         delay_load: bool = False,
     ):
         super().__init__()
-        pretrained_path = resolve_eva_repo(config, repo_id)
-        code_root = _ensure_code_on_path(pretrained_path)
-        legacy_cls = _get_legacy_tower_class(code_root)
-        tower_name = (
-            getattr(config, "gen_vision_tower", None)
-            or getattr(config, "vision_tower_pretrained", None)
-            or "eva-clip-E-14-plus"
-        )
-        self.repo_id = pretrained_path
         self.torch_dtype = torch_dtype or torch.bfloat16
         self._device = default_device(device)
-        args = SimpleNamespace(
-            vision_tower_pretrained=pretrained_path,
-            gen_vision_tower=self.repo_id,
-            mm_vision_tower=self.repo_id,
-            unfreeze_mm_vision_tower=False,
-            mm_tunable_parts=[],
         )
-        self.legacy_tower = legacy_cls(tower_name, args=args, delay_load=delay_load)
-    def load_model(self, device_map=None):
-        result = self.legacy_tower.load_model(device_map=device_map)
-        if hasattr(self.legacy_tower, "vision_tower"):
-            self.legacy_tower.vision_tower.to(device=self._device, dtype=self.torch_dtype)
-        return result
-    def forward(self, *args, **kwargs):
-        return self.legacy_tower(*args, **kwargs)
-    def __getattr__(self, item):
-        if "legacy_tower" in self.__dict__ and hasattr(self.legacy_tower, item):
-            return getattr(self.legacy_tower, item)
-        return super().__getattr__(item)
-__all__ = ["AutoEvaClipVisionTower", "resolve_eva_repo", "default_device"]

 from __future__ import annotations
 from typing import Optional, Union
 import torch
 import torch.nn as nn
+from transformers import AutoConfig, AutoImageProcessor, AutoModel
 def default_device(spec: Optional[Union[str, torch.device]] = None) -> torch.device:
 class AutoEvaClipVisionTower(nn.Module):
+    """Plain Hugging Face vision tower wrapper (AutoModel + AutoImageProcessor)."""
     def __init__(
         self,
+        config,
         torch_dtype: torch.dtype | None = None,
         device: Optional[Union[str, torch.device]] = None,
         delay_load: bool = False,
     ):
         super().__init__()
+        if getattr(config, "vision_tower_pretrained", None) is None:
+            raise ValueError("vision_tower_pretrained must be defined in the config.")
+        self.repo_id = config.vision_tower_pretrained
         self.torch_dtype = torch_dtype or torch.bfloat16
         self._device = default_device(device)
+        self.is_loaded = False
+        self.image_processor = None
+        self.vision_model = None
+        self._hf_config = AutoConfig.from_pretrained(self.repo_id, trust_remote_code=True)
+        self._vision_cfg = dict(getattr(self._hf_config, "vision_cfg", {}))
+        self._hidden_size = (
+            self._vision_cfg.get("width")
+            or getattr(self._hf_config, "embed_dim", None)
+            or getattr(self._hf_config, "hidden_size", None)
         )
+        if not delay_load:
+            self.load_model(torch_dtype=self.torch_dtype, device=self._device)
+    def load_model(
+        self,
+        *,
+        torch_dtype: torch.dtype | None = None,
+        device: Optional[Union[str, torch.device]] = None,
+    ):
+        if self.is_loaded:
+            return self
+        dtype = torch_dtype or self.torch_dtype
+        target_device = default_device(device or self._device)
+        self.image_processor = AutoImageProcessor.from_pretrained(
+            self.repo_id,
+            trust_remote_code=True,
+        )
+        self.vision_model = AutoModel.from_pretrained(
+            self.repo_id,
+            trust_remote_code=True,
+            torch_dtype=dtype,
+        )
+        self.vision_model.to(target_device)
+        self.vision_model.eval()
+        self.vision_model.requires_grad_(False)
+        self.torch_dtype = dtype
+        self._device = target_device
+        self.is_loaded = True
+        return self
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        if not self.is_loaded:
+            raise RuntimeError("Vision tower used before load_model()")
+        inputs = pixel_values.to(self.device, dtype=self.torch_dtype)
+        outputs = self.vision_model(pixel_values=inputs)
+        hidden_states = getattr(outputs, "last_hidden_state", None)
+        if hidden_states is None:
+            raise ValueError("EVA model did not return last_hidden_state")
+        return hidden_states.to(pixel_values.dtype)
+    @property
+    def dtype(self) -> torch.dtype:
+        if self.is_loaded:
+            return next(self.vision_model.parameters()).dtype
+        return self.torch_dtype
+    @property
+    def device(self) -> torch.device:
+        if self.is_loaded:
+            return next(self.vision_model.parameters()).device
+        return self._device
+    @property
+    def hidden_size(self) -> int:
+        if self._hidden_size is not None:
+            return int(self._hidden_size)
+        if self.vision_model is not None:
+            return int(getattr(self.vision_model.config, "hidden_size"))
+        return 1024
+    @property
+    def num_patches(self) -> int:
+        return self.num_patches_per_side**2
+    @property
+    def num_patches_per_side(self) -> int:
+        size = self.image_size
+        patch = self._vision_cfg.get("patch_size", 14)
+        return size // patch
+    @property
+    def image_size(self) -> int:
+        return int(self._vision_cfg.get("image_size", 448))
+__all__ = ["AutoEvaClipVisionTower", "default_device"]