| """Phase 2: Multi-View Generation Module. |
| |
| Generates consistent multi-view images for: |
| - Per-object reconstruction (6 orthographic views) |
| - Room shell reconstruction (panoramic-style views) |
| - Depth-conditioned view synthesis |
| """ |

import math
import os
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from diffusers import DiffusionPipeline, StableDiffusionPipeline, DDIMScheduler
from transformers import CLIPVisionModel, CLIPImageProcessor


class MultiViewGenerationModule(nn.Module):
    """Generate consistent multi-view images from single image + depth."""

    def __init__(
        self,
        model_size: str = "L",
        device: str = "cuda",
        dtype: torch.dtype = torch.float16,
        cache_dir: Optional[str] = None,
    ):
        super().__init__()
        self.model_size = model_size
        self.device = device
        self.dtype = dtype
        self.cache_dir = cache_dir
        # Heavy components, created lazily on first use.
        self._zero123_pipeline = None
        self._depth_guided_pipe = None
        self._image_encoder = None

    @property
    def zero123_pipeline(self):
        """Lazily load the Zero123++ pipeline; returns None if loading fails."""
        if self._zero123_pipeline is None:
            # Zero123++ v1.1: single-image-to-multi-view diffusion model.
            # Its sampling logic ships as a custom diffusers pipeline.
            model_id = "sudo-ai/zero123plus-v1.1"
            try:
                self._zero123_pipeline = DiffusionPipeline.from_pretrained(
                    model_id,
                    custom_pipeline="sudo-ai/zero123plus-pipeline",
                    torch_dtype=self.dtype,
                    cache_dir=self.cache_dir,
                ).to(self.device)
            except Exception as e:
                print(f"Warning: Could not load {model_id}, using placeholder: {e}")
                self._zero123_pipeline = None
        return self._zero123_pipeline

    def generate_object_views(
        self,
        image: Image.Image,
        mask: Optional[np.ndarray] = None,
        depth_map: Optional[np.ndarray] = None,
        num_views: int = 6,
        elevation: float = 15.0,
    ) -> List[Image.Image]:
| """ |
| Generate multi-view images of a single object. |
| |
| Args: |
| image: Object crop image |
| mask: Object mask (optional) |
| depth_map: Depth map for the object (optional) |
| num_views: Number of views (default 6: 0, 60, 120, 180, 240, 300 deg) |
| elevation: Camera elevation angle |
| |
| Returns: |
| List of PIL Images, one per view |
| """ |
        # Prefer the Zero123++ diffusion pipeline when it is available.
        pipe = self.zero123_pipeline
        if pipe is not None:
            try:
                result = pipe(
                    image,
                    num_inference_steps=30,
                    guidance_scale=3.0,
                ).images
                if isinstance(result[0], list):
                    # Some pipelines return a nested list of per-view images.
                    return result[0]
                if len(result) == 1 and num_views > 1:
                    # Zero123++ tiles all views into a single grid image.
                    return self._split_view_grid(result[0])
                return list(result)
            except Exception as e:
                print(f"Zero123++ generation failed: {e}, falling back to heuristic")

        # Fallback: approximate novel views with simple 2D transforms.
        return self._generate_heuristic_views(image, num_views, elevation)
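
    # Helper for the Zero123++ path above. This is a hedged sketch: the
    # 3-row x 2-column tile layout assumed here matches Zero123++ v1.1's
    # tiled output, but other multi-view models may need different values.
    @staticmethod
    def _split_view_grid(
        grid: Image.Image,
        rows: int = 3,
        cols: int = 2,
    ) -> List[Image.Image]:
        """Split a single tiled multi-view image into individual view images."""
        w, h = grid.size
        tile_w, tile_h = w // cols, h // rows
        views = []
        for r in range(rows):
            for c in range(cols):
                # Crop one tile per (row, col) cell, scanning left-to-right,
                # top-to-bottom, so the view order matches the tiling order.
                box = (c * tile_w, r * tile_h, (c + 1) * tile_w, (r + 1) * tile_h)
                views.append(grid.crop(box))
        return views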

    def _generate_heuristic_views(
        self,
        image: Image.Image,
        num_views: int,
        elevation: float,
    ) -> List[Image.Image]:
        """Generate simple multi-views by rotating and transforming the image."""
        views = []

        for i in range(num_views):
            angle = i * (360 / num_views)

            # Approximate a change in azimuth with an in-plane rotation.
            rotated = image.rotate(angle, expand=True, fillcolor=(255, 255, 255))

            # Shrink the view to mimic foreshortening at the given elevation.
            scale = math.cos(math.radians(elevation))
            w, h = rotated.size
            new_w, new_h = int(w * scale), int(h * scale)

            if new_w > 0 and new_h > 0:
                scaled = rotated.resize((new_w, new_h), Image.LANCZOS)

                # Re-center the scaled view on a white canvas of the rotated size.
                padded = Image.new("RGB", (w, h), (255, 255, 255))
                paste_x = (w - new_w) // 2
                paste_y = (h - new_h) // 2
                padded.paste(scaled, (paste_x, paste_y))
                views.append(padded)
            else:
                views.append(rotated)

        return views

    def generate_room_shell_views(
        self,
        image: Image.Image,
        depth_map: np.ndarray,
        room_layout: Dict,
    ) -> Dict[str, Image.Image]:
| """ |
| Generate extended views for room shell (walls, floor, ceiling). |
| |
| Uses depth-guided inpainting to hallucinate occluded regions. |
| |
| Returns: |
| Dict with keys: 'ceiling', 'floor', 'left_wall', 'right_wall', |
| 'back_wall', 'front_wall' |
| """ |
        img_np = np.array(image)
        H, W = img_np.shape[:2]

        views = {}

        # Top band of the image approximates the ceiling.
        ceiling_region = img_np[:int(H * 0.15), :]
        views["ceiling"] = Image.fromarray(ceiling_region)

        # Bottom band approximates the floor.
        floor_region = img_np[int(H * 0.65):, :]
        views["floor"] = Image.fromarray(floor_region)

        # The input view itself serves as the back wall.
        views["back_wall"] = image

        # Left and right strips approximate the side walls.
        left_wall = img_np[:, :int(W * 0.25)]
        views["left_wall"] = Image.fromarray(left_wall)

        right_wall = img_np[:, -int(W * 0.25):]
        views["right_wall"] = Image.fromarray(right_wall)

        # Mirror the image as a crude stand-in for the unseen front wall.
        front_wall = np.fliplr(img_np)
        views["front_wall"] = Image.fromarray(front_wall)

        return views

    def depth_conditioned_inpainting(
        self,
        image: Image.Image,
        depth: np.ndarray,
        mask: np.ndarray,
        prompt: str = "interior room, photorealistic",
    ) -> Image.Image:
| """ |
| Inpaint occluded regions guided by depth map. |
| |
| Uses depth-aware diffusion to fill in missing regions |
| while maintaining geometric consistency. |
| """ |
| |
| |
| |
| return image |
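
    # The placeholder above could be backed by a depth-conditioned inpainting
    # model. The method below is a minimal sketch of one way to do that with
    # diffusers' ControlNet inpainting pipeline; the checkpoint names are
    # assumptions, not part of the original pipeline, and it assumes `depth`
    # and `mask` share the image's spatial size.
    def _depth_controlnet_inpaint(
        self,
        image: Image.Image,
        depth: np.ndarray,
        mask: np.ndarray,
        prompt: str,
        num_inference_steps: int = 30,
    ) -> Image.Image:
        """Sketch: inpaint masked pixels with an SD inpainting model + depth ControlNet."""
        from diffusers import ControlNetModel, StableDiffusionControlNetInpaintPipeline

        if self._depth_guided_pipe is None:
            controlnet = ControlNetModel.from_pretrained(
                "lllyasviel/control_v11f1p_sd15_depth",  # assumed depth ControlNet
                torch_dtype=self.dtype,
                cache_dir=self.cache_dir,
            )
            self._depth_guided_pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
                "runwayml/stable-diffusion-inpainting",  # assumed base checkpoint
                controlnet=controlnet,
                torch_dtype=self.dtype,
                cache_dir=self.cache_dir,
            ).to(self.device)

        # Normalize depth to an 8-bit, 3-channel control image.
        d = depth.astype(np.float32)
        d = (d - d.min()) / (d.max() - d.min() + 1e-8)
        depth_img = Image.fromarray((np.stack([d] * 3, axis=-1) * 255).astype(np.uint8))

        # Binary mask image: white pixels are regenerated.
        mask_img = Image.fromarray((mask > 0).astype(np.uint8) * 255)

        result = self._depth_guided_pipe(
            prompt=prompt,
            image=image,
            mask_image=mask_img,
            control_image=depth_img,
            num_inference_steps=num_inference_steps,
        )
        return result.images[0]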
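

# Minimal usage sketch (illustrative only, not part of the original module):
# exercises the object-view and room-shell paths with synthetic stand-in inputs.
if __name__ == "__main__":
    module = MultiViewGenerationModule(
        device="cuda" if torch.cuda.is_available() else "cpu",
        dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )

    # Synthetic stand-ins for a real object crop and its depth map.
    example_image = Image.new("RGB", (512, 512), (200, 200, 200))
    example_depth = np.ones((512, 512), dtype=np.float32)

    object_views = module.generate_object_views(example_image, num_views=6)
    print(f"Generated {len(object_views)} object views")

    shell_views = module.generate_room_shell_views(
        example_image, example_depth, room_layout={}
    )
    print("Room shell views:", sorted(shell_views.keys()))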