Commit 159500c
Initial release: TransNormal with Zero GPU support

Files changed:
- README.md +20 -0
- app.py +197 -0
- requirements.txt +22 -0
- transnormal/__init__.py +59 -0
- transnormal/dino_encoder.py +352 -0
- transnormal/pipeline.py +394 -0
- transnormal/utils.py +240 -0
README.md
ADDED
@@ -0,0 +1,20 @@
---
title: TransNormal
emoji: 🔮
colorFrom: blue
colorTo: purple
sdk: gradio
sdk_version: 5.9.1
app_file: app.py
pinned: false
license: cc-by-nc-4.0
suggested_hardware: zero-a10g
---

# TransNormal

Surface Normal Estimation for Transparent Objects using Dense Visual Semantics.

**Paper:** [TransNormal: Dense Visual Semantics for Diffusion-based Transparent Object Normal Estimation](https://longxiang-ai.github.io/TransNormal/)

**Authors:** Mingwei Li, Hehe Fan, Yi Yang (Zhejiang University)
app.py
ADDED
@@ -0,0 +1,197 @@
#!/usr/bin/env python
"""
TransNormal - Hugging Face Spaces Zero GPU Version

Surface Normal Estimation for Transparent Objects
"""

import os
import spaces
import torch
import gradio as gr
from PIL import Image
from huggingface_hub import snapshot_download

from transnormal import TransNormalPipeline, create_dino_encoder

# ============== Model Paths ==============
TRANSNORMAL_REPO = "Longxiang-ai/TransNormal"
DINO_REPO = "facebook/dinov3-vith16plus-pretrain-lvd1689m"
# =========================================

# Global pipeline
pipe = None
weights_downloaded = False


def download_weights():
    """Download model weights from HuggingFace Hub."""
    global weights_downloaded

    if weights_downloaded:
        return "./weights/transnormal", "./weights/dinov3_vith16plus"

    print("[TransNormal] Downloading TransNormal weights...")
    transnormal_path = snapshot_download(
        TRANSNORMAL_REPO,
        local_dir="./weights/transnormal"
    )

    print("[TransNormal] Downloading DINOv3 weights...")
    dino_path = snapshot_download(
        DINO_REPO,
        local_dir="./weights/dinov3_vith16plus"
    )

    weights_downloaded = True
    print("[TransNormal] Weights downloaded successfully!")
    return transnormal_path, dino_path


def load_pipeline():
    """Load the TransNormal pipeline."""
    global pipe

    if pipe is not None:
        return pipe

    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.bfloat16 if device == "cuda" else torch.float32

    print(f"[TransNormal] Loading model on {device} with {dtype}...")

    # Download weights
    transnormal_path, dino_path = download_weights()
    projector_path = os.path.join(transnormal_path, "cross_attention_projector.pt")

    # Load DINO encoder
    dino_encoder = create_dino_encoder(
        model_name="dinov3_vith16plus",
        cross_attention_dim=1024,
        weights_path=dino_path,
        projector_path=projector_path,
        device=device,
        dtype=dtype,
        freeze_encoder=True,
    )

    # Load pipeline
    pipe = TransNormalPipeline.from_pretrained(
        transnormal_path,
        dino_encoder=dino_encoder,
        torch_dtype=dtype,
    )
    pipe = pipe.to(device)

    print("[TransNormal] Model loaded successfully!")
    return pipe


@spaces.GPU(duration=120)
def predict_normal(image: Image.Image, processing_res: int = 768) -> Image.Image:
    """
    Predict surface normal from input image using Zero GPU.

    Args:
        image: Input RGB image
        processing_res: Processing resolution

    Returns:
        Normal map as PIL Image
    """
    if image is None:
        return None

    # Load pipeline (will use GPU allocated by @spaces.GPU)
    pipeline = load_pipeline()

    # Run inference
    with torch.no_grad():
        normal_map = pipeline(
            image=image,
            processing_res=processing_res,
            output_type="pil",
        )

    return normal_map


# ============== Gradio Interface ==============

custom_css = """
.gradio-container {
    font-family: 'Segoe UI', 'Helvetica Neue', Arial, sans-serif !important;
}
h1 {
    font-weight: 600 !important;
}
"""

with gr.Blocks(
    title="TransNormal",
    theme=gr.themes.Soft(),
    css=custom_css,
) as demo:

    gr.Markdown(
        """
        # 🔮 TransNormal
        ### Surface Normal Estimation for Transparent Objects

        Upload an image to estimate surface normals. Particularly effective for **transparent objects** like glass and plastic.

        **Normal Convention:** Red=X (Left) | Green=Y (Up) | Blue=Z (Out)

        > ⏱️ First inference may take ~1-2 minutes to load model weights.
        """
    )

    with gr.Row():
        with gr.Column():
            input_image = gr.Image(
                label="Input Image",
                type="pil",
                height=400,
            )

            processing_res = gr.Slider(
                minimum=256,
                maximum=1024,
                value=768,
                step=64,
                label="Processing Resolution",
                info="Higher resolution = better quality but slower"
            )

            submit_btn = gr.Button("🚀 Estimate Normal", variant="primary", size="lg")

        with gr.Column():
            output_image = gr.Image(
                label="Normal Map",
                type="pil",
                height=400,
            )

    # Event handlers
    submit_btn.click(
        fn=predict_normal,
        inputs=[input_image, processing_res],
        outputs=output_image,
    )

    # Footer
    gr.Markdown(
        """
        ---

        **Paper:** [TransNormal: Dense Visual Semantics for Diffusion-based Transparent Object Normal Estimation](https://longxiang-ai.github.io/TransNormal/)

        **Authors:** Mingwei Li, Hehe Fan, Yi Yang (Zhejiang University)

        **Code:** [GitHub](https://github.com/longxiang-ai/TransNormal)
        """
    )

# Launch
if __name__ == "__main__":
    demo.launch()
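Because predict_normal lazily downloads the weights and builds the pipeline on first call, the Space logic can be exercised without the Gradio UI. A minimal local smoke-test sketch (not part of the commit); it assumes the spaces GPU decorator degrades to a no-op outside a Space, and "example.jpg" is a hypothetical input path:

from PIL import Image

from app import predict_normal  # importing app builds (but does not launch) the UI

rgb = Image.open("example.jpg").convert("RGB")      # hypothetical test image
normal = predict_normal(rgb, processing_res=512)    # first call downloads weights
normal.save("example_normal.png")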
requirements.txt
ADDED
@@ -0,0 +1,22 @@
# TransNormal HuggingFace Space Dependencies

# PyTorch
torch>=2.0.0
torchvision>=0.15.0

# Diffusers and Transformers
diffusers>=0.28.0
transformers>=4.56.0
accelerate>=0.24.0
safetensors>=0.4.0

# Image processing
Pillow>=9.0.0
numpy>=1.23.0

# HuggingFace
huggingface_hub

# Gradio and Spaces
gradio>=5.0.0
spaces
transnormal/__init__.py
ADDED
@@ -0,0 +1,59 @@
"""
TransNormal: Surface Normal Estimation for Transparent Objects

This package provides a diffusion-based pipeline for estimating surface normals
from RGB images, with particular effectiveness on transparent objects.

Example usage:
    from transnormal import TransNormalPipeline, create_dino_encoder
    import torch

    # Create DINO encoder
    dino_encoder = create_dino_encoder(
        model_name="dinov3_vith16plus",
        weights_path="path/to/dinov3_weights",
        projector_path="path/to/projector.pt",
        device="cuda",
    )

    # Load pipeline
    pipe = TransNormalPipeline.from_pretrained(
        "path/to/transnormal_model",
        dino_encoder=dino_encoder,
        torch_dtype=torch.float16,
    )
    pipe = pipe.to("cuda")

    # Run inference
    normal_map = pipe("path/to/image.jpg", output_type="np")
"""

__version__ = "1.0.0"
__author__ = "TransNormal Team"

from .pipeline import TransNormalPipeline
from .dino_encoder import DINOv3Encoder, create_dino_encoder
from .utils import (
    resize_max_res,
    resize_back,
    get_tv_resample_method,
    get_pil_resample_method,
    normal_to_rgb,
    save_normal_map,
    load_image,
    concatenate_images,
)

__all__ = [
    "TransNormalPipeline",
    "DINOv3Encoder",
    "create_dino_encoder",
    "resize_max_res",
    "resize_back",
    "get_tv_resample_method",
    "get_pil_resample_method",
    "normal_to_rgb",
    "save_normal_map",
    "load_image",
    "concatenate_images",
]
transnormal/dino_encoder.py
ADDED
@@ -0,0 +1,352 @@
"""
DINOv3 Encoder for Semantic-Guided Surface Normal Estimation

This module provides a simplified DINOv3 encoder that extracts semantic features
from RGB images for cross-attention in the TransNormal pipeline.

The encoder is particularly effective for transparent objects, as DINOv3's
strong semantic features can "see through" refraction artifacts.
"""

import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Dict


# DINOv3 model configurations
DINOV3_CONFIGS = {
    "dinov3_vits16": {
        "embed_dim": 384,
        "patch_size": 16,
        "n_storage_tokens": 4,
    },
    "dinov3_vitb16": {
        "embed_dim": 768,
        "patch_size": 16,
        "n_storage_tokens": 4,
    },
    "dinov3_vitl16": {
        "embed_dim": 1024,
        "patch_size": 16,
        "n_storage_tokens": 4,
    },
    "dinov3_vith16plus": {
        "embed_dim": 1280,
        "patch_size": 16,
        "n_storage_tokens": 4,
    },
}


class DINOv3Encoder(nn.Module):
    """
    DINOv3 Encoder for extracting semantic features from RGB images.

    This encoder provides projected patch tokens for cross-attention in the UNet,
    replacing CLIP text embeddings with visual semantic features.

    Args:
        model_name: DINOv3 model name (e.g., "dinov3_vith16plus")
        cross_attention_dim: Target dimension for cross-attention (1024 for SD 2.x)
        weights_path: Path to DINOv3 pretrained weights (HuggingFace format)
        freeze_encoder: Whether to freeze the DINOv3 backbone
    """

    def __init__(
        self,
        model_name: str = "dinov3_vith16plus",
        cross_attention_dim: int = 1024,
        weights_path: Optional[str] = None,
        freeze_encoder: bool = True,
    ):
        super().__init__()

        self.model_name = model_name
        self.cross_attention_dim = cross_attention_dim
        self.weights_path = weights_path
        self.freeze_encoder = freeze_encoder

        # Get model configuration
        if model_name not in DINOV3_CONFIGS:
            raise ValueError(f"Unknown DINOv3 model: {model_name}. Available: {list(DINOV3_CONFIGS.keys())}")

        self.config = DINOV3_CONFIGS[model_name]
        self.dino_hidden_dim = self.config["embed_dim"]
        self.patch_size = self.config["patch_size"]
        self.n_storage_tokens = self.config["n_storage_tokens"]

        # DINOv3 backbone (loaded later)
        self.dino_backbone = None
        self._use_hf_interface = False
        self._is_loaded = False

        # Cross-attention projector: DINO hidden_dim -> SD cross_attention_dim
        self.cross_attention_projector = nn.Linear(self.dino_hidden_dim, cross_attention_dim)
        self._init_projector()

        # ImageNet normalization for DINOv3
        self.register_buffer(
            "imagenet_mean",
            torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1),
            persistent=False
        )
        self.register_buffer(
            "imagenet_std",
            torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1),
            persistent=False
        )

    @property
    def dtype(self) -> torch.dtype:
        """Return the dtype of the encoder (for diffusers compatibility)."""
        return self.cross_attention_projector.weight.dtype

    @property
    def device(self) -> torch.device:
        """Return the device of the encoder."""
        return self.cross_attention_projector.weight.device

    def _init_projector(self):
        """Initialize the cross-attention projector with Xavier initialization."""
        nn.init.xavier_uniform_(self.cross_attention_projector.weight)
        nn.init.zeros_(self.cross_attention_projector.bias)

    def _preprocess_image(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """
        Preprocess image from [-1, 1] to ImageNet normalized format.

        Args:
            pixel_values: Input images, shape (B, 3, H, W), normalized to [-1, 1]

        Returns:
            Preprocessed images with ImageNet normalization
        """
        # Convert from [-1, 1] to [0, 1]
        pixel_values = (pixel_values + 1.0) / 2.0

        # Ensure mean/std are on the same device and dtype
        mean = self.imagenet_mean.to(device=pixel_values.device, dtype=pixel_values.dtype)
        std = self.imagenet_std.to(device=pixel_values.device, dtype=pixel_values.dtype)

        # Apply ImageNet normalization
        pixel_values = (pixel_values - mean) / std

        return pixel_values

    def load_dino_model(self, device: torch.device = None, dtype: torch.dtype = None):
        """
        Load the DINOv3 model from HuggingFace format.

        Args:
            device: Device to load the model on
            dtype: Data type for the model weights
        """
        if self._is_loaded:
            return

        if self.weights_path is None:
            raise ValueError("weights_path must be provided to load DINOv3 model")

        try:
            from transformers import AutoModel

            print(f"[DINOv3] Loading from: {self.weights_path}")
            self.dino_backbone = AutoModel.from_pretrained(
                self.weights_path,
                trust_remote_code=True,
            )

            # Update config from loaded model
            hf_config = getattr(self.dino_backbone, "config", None)
            if hf_config is not None:
                self.dino_hidden_dim = getattr(hf_config, "hidden_size", self.dino_hidden_dim)
                self.patch_size = getattr(hf_config, "patch_size", self.patch_size)
                self.n_storage_tokens = getattr(hf_config, "num_register_tokens", self.n_storage_tokens)

            # Reinitialize projector if hidden dim changed
            if self.cross_attention_projector.in_features != self.dino_hidden_dim:
                self.cross_attention_projector = nn.Linear(
                    self.dino_hidden_dim, self.cross_attention_dim
                )
                self._init_projector()

            self._use_hf_interface = True

            # Move to device/dtype
            if device is not None:
                self.dino_backbone = self.dino_backbone.to(device)
                self.cross_attention_projector = self.cross_attention_projector.to(device)

            if dtype is not None:
                self.dino_backbone = self.dino_backbone.to(dtype)
                self.cross_attention_projector = self.cross_attention_projector.to(dtype)

            # Freeze backbone
            if self.freeze_encoder:
                self.dino_backbone.requires_grad_(False)
                self.dino_backbone.eval()

            self._is_loaded = True
            print(f"[DINOv3] Successfully loaded {self.model_name}")
            print(f"  - Hidden dim: {self.dino_hidden_dim}")
            print(f"  - Patch size: {self.patch_size}")
            print(f"  - Cross-attention dim: {self.cross_attention_dim}")

        except Exception as e:
            raise RuntimeError(
                f"Failed to load DINOv3 model from {self.weights_path}.\n"
                f"Error: {e}"
            )

    def _ensure_loaded(self):
        """Ensure the model is loaded before forward pass."""
        if not self._is_loaded:
            raise RuntimeError(
                "DINOv3 model not loaded. Call load_dino_model() first."
            )

    def extract_patch_tokens(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """
        Extract patch tokens from DINOv3.

        Args:
            pixel_values: Input images, shape (B, 3, H, W), normalized to [-1, 1]

        Returns:
            patch_tokens: Shape (B, N, D) where N is number of patches, D is hidden_dim
        """
        self._ensure_loaded()

        # Preprocess image
        preprocessed = self._preprocess_image(pixel_values)

        # Ensure dimensions are multiples of patch_size
        _, _, H, W = preprocessed.shape
        new_H = (H // self.patch_size) * self.patch_size
        new_W = (W // self.patch_size) * self.patch_size
        if new_H != H or new_W != W:
            preprocessed = F.interpolate(
                preprocessed,
                size=(new_H, new_W),
                mode='bilinear',
                align_corners=False
            )

        # Forward through DINOv3
        with torch.no_grad() if self.freeze_encoder else torch.enable_grad():
            if self._use_hf_interface:
                outputs = self.dino_backbone(
                    pixel_values=preprocessed,
                    output_hidden_states=True
                )
                last_hidden = outputs.last_hidden_state
                # Remove CLS and register tokens
                n_special = 1 + self.n_storage_tokens
                patch_tokens = last_hidden[:, n_special:, :]
            else:
                outputs = self.dino_backbone.forward_features(preprocessed, masks=None)
                patch_tokens = outputs['x_norm_patchtokens']

        return patch_tokens

    def forward(self, pixel_values: torch.Tensor) -> Dict[str, torch.Tensor]:
        """
        Forward pass to extract features for cross-attention.

        Args:
            pixel_values: Input images, shape (B, 3, H, W), normalized to [-1, 1]

        Returns:
            dict with 'cross_attention_features': Projected features, shape (B, N, cross_attention_dim)
        """
        self._ensure_loaded()

        # Extract patch tokens
        patch_tokens = self.extract_patch_tokens(pixel_values)

        # Project to cross-attention dimension
        projector_dtype = next(self.cross_attention_projector.parameters()).dtype
        if patch_tokens.dtype != projector_dtype:
            patch_tokens = patch_tokens.to(dtype=projector_dtype)

        cross_attention_features = self.cross_attention_projector(patch_tokens)

        return {'cross_attention_features': cross_attention_features}

    def get_cross_attention_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """
        Convenience method to get only cross-attention features.

        Args:
            pixel_values: Input images, shape (B, 3, H, W), normalized to [-1, 1]

        Returns:
            cross_attention_features: Shape (B, N, cross_attention_dim)
        """
        return self.forward(pixel_values)['cross_attention_features']

    def load_projector(self, projector_path: str, device: torch.device = None):
        """
        Load pretrained projector weights.

        Args:
            projector_path: Path to projector weights file (.pt)
            device: Device to load weights on
        """
        if not os.path.exists(projector_path):
            raise FileNotFoundError(f"Projector weights not found: {projector_path}")

        state_dict = torch.load(projector_path, map_location=device or "cpu")
        self.cross_attention_projector.load_state_dict(state_dict)
        print(f"[DINOv3] Loaded projector weights from {projector_path}")


def create_dino_encoder(
    model_name: str = "dinov3_vith16plus",
    cross_attention_dim: int = 1024,
    weights_path: Optional[str] = None,
    projector_path: Optional[str] = None,
    device: torch.device = None,
    dtype: torch.dtype = None,
    freeze_encoder: bool = True,
) -> DINOv3Encoder:
    """
    Factory function to create and initialize a DINOv3 encoder.

    Args:
        model_name: DINOv3 model name
        cross_attention_dim: Target dimension for cross-attention
        weights_path: Path to DINOv3 pretrained weights
        projector_path: Path to projector weights (optional)
        device: Device to load the model on
        dtype: Data type for the model
        freeze_encoder: Whether to freeze the backbone

    Returns:
        Initialized DINOv3Encoder
    """
    encoder = DINOv3Encoder(
        model_name=model_name,
        cross_attention_dim=cross_attention_dim,
        weights_path=weights_path,
        freeze_encoder=freeze_encoder,
    )

    # Load DINO backbone
    if weights_path is not None:
        encoder.load_dino_model(device=device, dtype=dtype)

    # Load projector weights if provided
    if projector_path is not None:
        encoder.load_projector(projector_path, device=device)

    # Move to device
    if device is not None:
        encoder = encoder.to(device)

    if dtype is not None:
        encoder = encoder.to(dtype)

    return encoder
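The docstrings above pin down the encoder's tensor contract: inputs in [-1, 1], ViT-H/16+ patch tokens of hidden size 1280, projected to the SD 2.x cross-attention width of 1024. A shape sanity-check sketch (not part of the commit), assuming a local DINOv3 snapshot at the path app.py uses:

import torch

from transnormal import create_dino_encoder

# Assumed local snapshot of facebook/dinov3-vith16plus-pretrain-lvd1689m
encoder = create_dino_encoder(
    model_name="dinov3_vith16plus",
    cross_attention_dim=1024,
    weights_path="./weights/dinov3_vith16plus",
    device="cpu",
)

# A 768x768 input with patch size 16 yields (768/16)**2 = 2304 patch tokens
# after the 1 CLS + 4 register tokens are stripped; each token is projected
# from 1280 (ViT-H/16+ hidden dim) to 1024 (SD 2.x cross-attention dim).
x = torch.rand(1, 3, 768, 768) * 2 - 1  # [-1, 1], as the encoder expects
feats = encoder.get_cross_attention_features(x)
print(feats.shape)  # torch.Size([1, 2304, 1024])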
transnormal/pipeline.py
ADDED
@@ -0,0 +1,394 @@
"""
TransNormal Pipeline for Surface Normal Estimation

This pipeline is designed for transparent object surface normal estimation,
using DINOv3 encoder for semantic-guided geometry estimation.

Based on the Lotus-D deterministic pipeline architecture.
"""

import inspect
from typing import Any, Callable, Dict, List, Optional, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
import numpy as np

from diffusers import DiffusionPipeline, StableDiffusionMixin
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.image_processor import VaeImageProcessor
from diffusers.utils import logging
from transformers import CLIPTextModel, CLIPTokenizer

from .utils import resize_max_res, resize_back, get_tv_resample_method
from torchvision.transforms import InterpolationMode

logger = logging.get_logger(__name__)


def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    **kwargs,
):
    """
    Get timesteps from scheduler.

    Args:
        scheduler: The scheduler to get timesteps from
        num_inference_steps: Number of diffusion steps
        device: Device to move timesteps to
        timesteps: Custom timesteps (optional)

    Returns:
        Tuple of (timesteps, num_inference_steps)
    """
    if timesteps is not None:
        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__} does not support custom "
                f"timestep schedules."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps


class TransNormalPipeline(DiffusionPipeline, StableDiffusionMixin):
    """
    TransNormal Pipeline for Surface Normal Estimation

    This pipeline uses DINOv3 encoder for semantic-guided geometry estimation,
    particularly effective for transparent objects where traditional methods fail.

    Args:
        vae: Variational Autoencoder for encoding/decoding images
        text_encoder: CLIP text encoder (kept for compatibility)
        tokenizer: CLIP tokenizer (kept for compatibility)
        unet: UNet2DConditionModel for denoising
        scheduler: Noise scheduler
        dino_encoder: Optional DINOv3 encoder for semantic features
    """

    model_cpu_offload_seq = "text_encoder->unet->vae"
    _optional_components = ["text_encoder", "tokenizer", "dino_encoder"]

    # Default processing resolution
    default_processing_resolution = 768

    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        dino_encoder: Optional[nn.Module] = None,
    ):
        super().__init__()

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
            dino_encoder=dino_encoder,
        )

        # VAE scale factor (typically 8 for SD)
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

        # DINOv3 encoder usage flag
        self._use_dino_for_cross_attention = dino_encoder is not None

    def set_dino_encoder(self, dino_encoder: Optional[nn.Module], device: torch.device = None):
        """
        Set or remove the DINOv3 encoder.

        Args:
            dino_encoder: DINOv3 encoder module, or None to disable
            device: Target device for the encoder
        """
        if dino_encoder is not None and device is not None:
            dino_encoder = dino_encoder.to(device)
            if hasattr(dino_encoder, 'dino_backbone') and dino_encoder.dino_backbone is not None:
                dino_encoder.dino_backbone = dino_encoder.dino_backbone.to(device)

        # Update registered module
        self.register_modules(dino_encoder=dino_encoder)
        self._use_dino_for_cross_attention = dino_encoder is not None

    def encode_prompt(
        self,
        prompt: str,
        device: torch.device,
        num_images_per_prompt: int = 1,
    ) -> torch.Tensor:
        """
        Encode text prompt using CLIP text encoder.

        Args:
            prompt: Text prompt
            device: Target device
            num_images_per_prompt: Number of images per prompt

        Returns:
            Text embeddings tensor
        """
        text_inputs = self.tokenizer(
            prompt,
            padding="do_not_pad",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids

        prompt_embeds = self.text_encoder(text_input_ids.to(device))[0]

        bs_embed, seq_len, _ = prompt_embeds.shape
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

        return prompt_embeds

    def _get_encoder_hidden_states(
        self,
        rgb_in: torch.Tensor,
        prompt: str,
        device: torch.device,
    ) -> torch.Tensor:
        """
        Get encoder hidden states for cross-attention.

        Uses DINOv3 features if encoder is available, otherwise uses CLIP text embeddings.

        Args:
            rgb_in: Input RGB image tensor, shape (B, 3, H, W), range [-1, 1]
            prompt: Text prompt (used only if DINO encoder is not available)
            device: Target device

        Returns:
            Encoder hidden states for cross-attention
        """
        if self._use_dino_for_cross_attention and self.dino_encoder is not None:
            # Use DINOv3 to extract semantic features
            encoder_hidden_states = self.dino_encoder.get_cross_attention_features(rgb_in)

            # Ensure dtype matches UNet
            if self.unet is not None:
                encoder_hidden_states = encoder_hidden_states.to(dtype=self.unet.dtype)
            return encoder_hidden_states
        else:
            # Fallback to CLIP text encoder
            return self.encode_prompt(prompt, device)

    def preprocess_image(
        self,
        image: Union[torch.Tensor, Image.Image, np.ndarray, str],
        device: torch.device,
        dtype: torch.dtype,
    ) -> torch.Tensor:
        """
        Preprocess input image to tensor format.

        Args:
            image: Input image (PIL, numpy, tensor, or path)
            device: Target device
            dtype: Target dtype

        Returns:
            Preprocessed image tensor, shape (1, 3, H, W), range [-1, 1]
        """
        # Load image if path is provided
        if isinstance(image, str):
            image = Image.open(image).convert("RGB")

        # Convert PIL to numpy
        if isinstance(image, Image.Image):
            image = np.array(image)

        # Convert numpy to tensor
        if isinstance(image, np.ndarray):
            # Ensure HWC format
            if image.ndim == 2:
                image = np.stack([image] * 3, axis=-1)
            elif image.shape[0] == 3:  # CHW format
                image = np.transpose(image, (1, 2, 0))

            # Normalize to [0, 1]
            if image.dtype == np.uint8:
                image = image.astype(np.float32) / 255.0

            # Convert to tensor (B, C, H, W)
            image = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(0)

        # Ensure batch dimension
        if image.dim() == 3:
            image = image.unsqueeze(0)

        # Normalize to [-1, 1]
        if image.min() >= 0 and image.max() <= 1:
            image = image * 2.0 - 1.0

        return image.to(device=device, dtype=dtype)

    @torch.no_grad()
    def __call__(
        self,
        image: Union[torch.Tensor, Image.Image, np.ndarray, str],
        prompt: str = "",
        timestep: int = 1,
        processing_res: Optional[int] = None,
        match_input_res: bool = True,
        resample_method: str = "bilinear",
        output_type: str = "np",
        return_dict: bool = False,
        **kwargs,
    ):
        """
        Run surface normal estimation on input image.

        Args:
            image: Input RGB image (PIL, numpy, tensor, or file path)
            prompt: Text prompt (optional, used only if DINO encoder is not available)
            timestep: Diffusion timestep for deterministic prediction (default: 1)
            processing_res: Processing resolution (default: 768)
            match_input_res: Whether to resize output to match input resolution
            resample_method: Resampling method for resizing
            output_type: Output format - "np" (numpy), "pt" (tensor), or "pil" (PIL Image)
            return_dict: Whether to return a dict with additional info

        Returns:
            Normal map in specified format. Normal vectors are in camera coordinates:
            - X: right (positive = right)
            - Y: down (positive = down)
            - Z: forward (positive = into screen)

            Output range is [0, 1] where 0.5 represents zero in each axis.
        """
        # Set default processing resolution
        if processing_res is None:
            processing_res = self.default_processing_resolution

        device = self._execution_device
        dtype = self.unet.dtype if self.unet is not None else torch.float32

        # Preprocess input image
        rgb_in = self.preprocess_image(image, device, dtype)
        input_size = rgb_in.shape[-2:]

        # Resize to processing resolution
        resample_method_tv = get_tv_resample_method(resample_method)
        if processing_res > 0:
            rgb_in = resize_max_res(
                rgb_in,
                max_edge_resolution=processing_res,
                resample_method=resample_method_tv,
            )

        # Get encoder hidden states (DINO or CLIP)
        encoder_hidden_states = self._get_encoder_hidden_states(
            rgb_in=rgb_in,
            prompt=prompt,
            device=device,
        )

        # Prepare timestep
        timesteps = torch.tensor([timestep], device=device).long()

        # Encode RGB to latent space
        rgb_latents = self.vae.encode(rgb_in).latent_dist.sample()
        rgb_latents = rgb_latents * self.vae.config.scaling_factor

        # Task embedding for normal estimation
        task_emb = torch.tensor([1, 0], dtype=dtype, device=device).unsqueeze(0)
        task_emb = torch.cat([torch.sin(task_emb), torch.cos(task_emb)], dim=-1)

        # Single-step deterministic prediction
        t = timesteps[0]
        pred = self.unet(
            rgb_latents,
            t,
            encoder_hidden_states=encoder_hidden_states,
            return_dict=False,
            class_labels=task_emb,
        )[0]

        # Decode prediction
        normal_latent = pred / self.vae.config.scaling_factor
        normal_image = self.vae.decode(normal_latent, return_dict=False)[0]

        # Post-process to [0, 1] range
        normal_image = (normal_image / 2 + 0.5).clamp(0, 1)

        # Resize back to input resolution if requested
        if match_input_res and processing_res > 0:
            normal_image = F.interpolate(
                normal_image,
                size=input_size,
                mode='bilinear',
                align_corners=False,
            )

        # Convert to output format
        if output_type == "pt":
            output = normal_image  # (B, 3, H, W), range [0, 1]
        elif output_type == "np":
            # Convert to float32 first (bfloat16 not supported by numpy)
            output = normal_image.float().cpu().permute(0, 2, 3, 1).numpy()  # (B, H, W, 3)
            if output.shape[0] == 1:
                output = output[0]  # (H, W, 3)
        elif output_type == "pil":
            # Convert to float32 first (bfloat16 not supported by numpy)
            output = normal_image.float().cpu().permute(0, 2, 3, 1).numpy()
            output = (output * 255).astype(np.uint8)
            if output.shape[0] == 1:
                output = Image.fromarray(output[0])
            else:
                output = [Image.fromarray(img) for img in output]
        else:
            raise ValueError(f"Unknown output_type: {output_type}")

        if return_dict:
            return {"normal": output, "resolution": normal_image.shape[-2:]}
        return output

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        dino_encoder: Optional[nn.Module] = None,
        **kwargs,
    ):
        """
        Load TransNormalPipeline from pretrained weights.

        Args:
            pretrained_model_name_or_path: Path to pretrained model or HuggingFace model ID
            dino_encoder: Optional pre-loaded DINO encoder
            **kwargs: Additional arguments passed to DiffusionPipeline.from_pretrained

        Returns:
            TransNormalPipeline instance
        """
        # Load base pipeline components
        pipeline = super().from_pretrained(pretrained_model_name_or_path, **kwargs)

        # Set DINO encoder if provided
        if dino_encoder is not None:
            pipeline.set_dino_encoder(dino_encoder)

        return pipeline
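As the __call__ docstring notes, the pipeline re-encodes normals into [0, 1] with 0.5 meaning zero along each axis, so downstream geometric use needs them mapped back to unit vectors. A minimal post-processing sketch (not part of the commit), assuming `pipe` is a loaded TransNormalPipeline as in the package docstring example and "input.jpg" is a hypothetical path:

import numpy as np

out = pipe("input.jpg", output_type="np")  # (H, W, 3), values in [0, 1]
n = out * 2.0 - 1.0                        # back to [-1, 1] per-axis components
# Renormalize to unit length; the clip guards against division by ~zero vectors.
n /= np.clip(np.linalg.norm(n, axis=-1, keepdims=True), 1e-8, None)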
transnormal/utils.py
ADDED
@@ -0,0 +1,240 @@
"""
Utility functions for TransNormal pipeline.

Includes image processing utilities for preprocessing and postprocessing.
"""

from typing import List, Union
from PIL import Image
import numpy as np
import torch
from torchvision.transforms import InterpolationMode
from torchvision.transforms.functional import resize


def resize_max_res(
    img: torch.Tensor,
    max_edge_resolution: int,
    resample_method: InterpolationMode = InterpolationMode.BILINEAR,
) -> torch.Tensor:
    """
    Resize image to limit maximum edge length while keeping aspect ratio.

    Args:
        img: Image tensor to be resized. Expected shape: [B, C, H, W]
        max_edge_resolution: Maximum edge length (pixels)
        resample_method: Resampling method used to resize images

    Returns:
        Resized image tensor
    """
    assert img.dim() == 4, f"Invalid input shape {img.shape}, expected [B, C, H, W]"

    original_height, original_width = img.shape[-2:]
    downscale_factor = min(
        max_edge_resolution / original_width,
        max_edge_resolution / original_height
    )

    new_width = int(original_width * downscale_factor)
    new_height = int(original_height * downscale_factor)

    resized_img = resize(img, (new_height, new_width), resample_method, antialias=True)
    return resized_img


def resize_back(
    img: Union[torch.Tensor, np.ndarray, Image.Image, List[Image.Image]],
    target_size: Union[int, tuple],
    resample_method: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
) -> Union[torch.Tensor, np.ndarray, Image.Image, List[Image.Image]]:
    """
    Resize image back to target size.

    Args:
        img: Image to be resized (tensor, numpy, PIL, or list of PIL)
        target_size: Target size (H, W) or single int for square
        resample_method: Resampling method for resizing

    Returns:
        Resized image in the same format as input
    """
    if isinstance(target_size, int):
        target_size = (target_size, target_size)

    if isinstance(img, torch.Tensor):
        resized_img = resize(img, target_size, resample_method, antialias=True)
    elif isinstance(img, np.ndarray):
        # Convert to tensor
        if img.ndim == 3:  # HWC
            img_tensor = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0)
        else:  # BHWC
            img_tensor = torch.from_numpy(img).permute(0, 3, 1, 2)

        resized_tensor = resize(img_tensor, target_size, resample_method, antialias=True)

        # Convert back
        if img.ndim == 3:
            resized_img = resized_tensor.squeeze(0).permute(1, 2, 0).numpy()
        else:
            resized_img = resized_tensor.permute(0, 2, 3, 1).numpy()
    elif isinstance(img, Image.Image):
        # PIL uses (width, height)
        pil_size = (target_size[1], target_size[0])
        resized_img = img.resize(pil_size, resample_method)
    elif isinstance(img, list) and all(isinstance(i, Image.Image) for i in img):
        pil_size = (target_size[1], target_size[0])
        resized_img = [i.resize(pil_size, resample_method) for i in img]
    else:
        raise TypeError(f"Unsupported image type: {type(img)}")

    return resized_img


def get_tv_resample_method(method_str: str) -> InterpolationMode:
    """
    Get torchvision interpolation mode from string.

    Args:
        method_str: Resampling method name ("bilinear", "bicubic", "nearest")

    Returns:
        Corresponding InterpolationMode
    """
    resample_method_dict = {
        "bilinear": InterpolationMode.BILINEAR,
        "bicubic": InterpolationMode.BICUBIC,
        "nearest": InterpolationMode.NEAREST_EXACT,
        "nearest-exact": InterpolationMode.NEAREST_EXACT,
    }
    resample_method = resample_method_dict.get(method_str.lower())
    if resample_method is None:
        raise ValueError(f"Unknown resampling method: {method_str}")
    return resample_method


def get_pil_resample_method(method_str: str) -> int:
    """
    Get PIL resampling method from string.

    Args:
        method_str: Resampling method name ("bilinear", "bicubic", "nearest")

    Returns:
        Corresponding PIL resampling constant
    """
    resample_method_dict = {
        "bilinear": Image.BILINEAR,
        "bicubic": Image.BICUBIC,
        "nearest": Image.NEAREST,
    }
    resample_method = resample_method_dict.get(method_str.lower())
    if resample_method is None:
        raise ValueError(f"Unknown resampling method: {method_str}")
    return resample_method


def normal_to_rgb(normal: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
    """
    Convert normal map to RGB visualization.

    Normal vectors are assumed to be in range [-1, 1] or [0, 1].
    Output is RGB image in range [0, 255].

    Args:
        normal: Normal map tensor/array, shape (H, W, 3) or (B, H, W, 3) or (B, 3, H, W)

    Returns:
        RGB visualization as uint8 numpy array
    """
    if isinstance(normal, torch.Tensor):
        normal = normal.cpu().numpy()

    # Handle different formats
    if normal.ndim == 4:
        if normal.shape[1] == 3:  # BCHW
            normal = np.transpose(normal, (0, 2, 3, 1))  # BHWC
        normal = normal[0]  # Take first batch

    # Convert from [-1, 1] to [0, 1] if needed
    if normal.min() < 0:
        normal = (normal + 1.0) / 2.0

    # Clamp and convert to uint8
    normal = np.clip(normal, 0, 1)
    rgb = (normal * 255).astype(np.uint8)

    return rgb


def save_normal_map(
    normal: Union[torch.Tensor, np.ndarray],
    output_path: str,
    as_rgb: bool = True,
):
    """
    Save normal map to file.

    Args:
        normal: Normal map tensor/array
        output_path: Output file path
        as_rgb: If True, save as RGB visualization; if False, save raw values as NPZ
    """
    if as_rgb:
        rgb = normal_to_rgb(normal)
        Image.fromarray(rgb).save(output_path)
    else:
        if isinstance(normal, torch.Tensor):
            normal = normal.cpu().numpy()
        np.savez_compressed(output_path, normal=normal)


def load_image(image_path: str) -> Image.Image:
    """
    Load image from file path.

    Args:
        image_path: Path to image file

    Returns:
        PIL Image in RGB mode
    """
    return Image.open(image_path).convert("RGB")


def concatenate_images(*image_lists) -> Image.Image:
    """
    Concatenate multiple rows of images into a single image.

    Args:
        *image_lists: Variable number of image lists, each list is a row

    Returns:
        Concatenated PIL Image
    """
    if not image_lists or not image_lists[0]:
        raise ValueError("At least one non-empty image list must be provided")

    max_width = 0
    total_height = 0
    row_heights = []

    for image_list in image_lists:
        if image_list:
            width = sum(img.width for img in image_list)
            height = image_list[0].height
            max_width = max(max_width, width)
            total_height += height
            row_heights.append(height)
        else:
            # Keep row_heights aligned with image_lists so the paste loop
            # below indexes correctly even when a row is empty.
            row_heights.append(0)

    new_image = Image.new('RGB', (max_width, total_height))

    y_offset = 0
    for i, image_list in enumerate(image_lists):
        x_offset = 0
        for img in image_list:
            new_image.paste(img, (x_offset, y_offset))
            x_offset += img.width
        y_offset += row_heights[i]

    return new_image