# File size: 16,136 Bytes

# ebc1c85

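"""Phase 1: Scene Understanding Module.

Extracts scene structure from a single interior image by combining:
- Metric depth estimation (Depth Anything V2)
- Room layout estimation (depth-based plane heuristics; SpatialLM intended
  for production)
- Semantic segmentation (position-based placeholder; a Mask2Former-style
  model is intended for production)
- Object detection & isolation (depth-discontinuity placeholder; SAM
  intended for production)
"""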

import os
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from transformers import (
    pipeline,
    AutoImageProcessor,
    AutoModelForDepthEstimation,
    AutoModelForSemanticSegmentation,
    CLIPVisionModel,
    CLIPImageProcessor,
)


class SceneUnderstandingModule(nn.Module):
    """Extract scene structure from a single interior image."""
    
    def __init__(
        self,
        model_size: str = "L",
        device: str = "cuda",
        dtype: torch.dtype = torch.float16,
        cache_dir: Optional[str] = None,
    ):
        super().__init__()
        self.model_size = model_size
        self.device = device
        self.dtype = dtype
        self.cache_dir = cache_dir
        
        # Load sub-models (lazy)
        self._depth_model = None
        self._depth_processor = None
        self._segmentation_model = None
        self._sam_model = None
        self._room_classifier = None
        
    @property
    def depth_model(self):
        if self._depth_model is None:
            model_id = "depth-anything/Depth-Anything-V2-Metric-Indoor-Large-hf"
            self._depth_processor = AutoImageProcessor.from_pretrained(
                model_id, cache_dir=self.cache_dir
            )
            self._depth_model = AutoModelForDepthEstimation.from_pretrained(
                model_id,
                torch_dtype=self.dtype,
                cache_dir=self.cache_dir,
            ).to(self.device)
            self._depth_model.eval()
        return self._depth_model, self._depth_processor
    
    @property
    def segmentation_model(self):
        if self._segmentation_model is None:
            # Using a generic indoor segmentation model
            # In production, use fine-tuned Mask2Former or OneFormer
            model_id = "facebook/mask2former-swin-large-coco-instance"
            self._segmentation_model = AutoModelForSemanticSegmentation.from_pretrained(
                model_id,
                torch_dtype=self.dtype,
                cache_dir=self.cache_dir,
            ).to(self.device)
            self._segmentation_model.eval()
        return self._segmentation_model
    
    @property
    def room_classifier(self):
        if self._room_classifier is None:
            self._room_classifier = CLIPVisionModel.from_pretrained(
                "openai/clip-vit-large-patch14",
                torch_dtype=self.dtype,
                cache_dir=self.cache_dir,
            ).to(self.device)
            self._room_classifier.eval()
        return self._room_classifier
    
    def forward(self, image: Image.Image) -> Dict:
        """
        Process a single interior image and extract all scene understanding.

        Args:
            image: PIL image of the interior. An image whose mode is not
                "RGB" is converted to RGB first, because the colour
                heuristics index three channels of the pixel array.

        Returns:
            Dictionary with keys:
            - depth: metric depth map [H, W]
            - normal: surface normal map [H, W, 3]
            - room_layout: room layout structure
            - semantic_segmentation: pixel-wise class labels [H, W]
            - detected_objects: dict of per-object crops and masks
            - room_type: str (e.g. "living_room")
            - style: str (e.g. "modern")
        """
        # Normalise the colour mode before any expensive work, so grayscale
        # or palette input does not fail late in the colour heuristics.
        if image.mode != "RGB":
            image = image.convert("RGB")

        # Pixel array, used by the layout step
        img_np = np.array(image)

        # === Metric Depth Estimation ===
        depth_map = self.estimate_depth(image)

        # === Surface Normal Estimation ===
        normal_map = self.estimate_normals(depth_map)

        # === Room Layout Estimation ===
        room_layout = self.estimate_room_layout(depth_map, img_np)

        # === Semantic Segmentation ===
        semantic_seg = self.segment_image(image)

        # === Object Detection & Isolation ===
        detected_objects = self.detect_and_isolate_objects(
            image, depth_map, semantic_seg
        )

        # === Room Type Classification ===
        room_type = self.classify_room_type(image)

        # === Style Classification ===
        style = self.classify_style(image)

        return {
            "depth": depth_map,
            "normal": normal_map,
            "room_layout": room_layout,
            "semantic_segmentation": semantic_seg,
            "detected_objects": detected_objects,
            "room_type": room_type,
            "style": style,
        }
    
    def estimate_depth(self, image: Image.Image) -> np.ndarray:
        """Estimate metric depth using Depth Anything V2."""
        model, processor = self.depth_model
        
        inputs = processor(images=image, return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
            predicted_depth = outputs.predicted_depth
        
        # Interpolate to original size
        prediction = F.interpolate(
            predicted_depth.unsqueeze(1),
            size=image.size[::-1],
            mode="bicubic",
            align_corners=False,
        )
        
        depth = prediction.squeeze().cpu().numpy()
        return depth
    
    def estimate_normals(self, depth: np.ndarray) -> np.ndarray:
        """Compute surface normals from depth map."""
        # Compute gradients
        dz_dx = np.gradient(depth, axis=1)
        dz_dy = np.gradient(depth, axis=0)
        
        # Normal vector: [-dz/dx, -dz/dy, 1]
        normals = np.stack([
            -dz_dx,
            -dz_dy,
            np.ones_like(depth)
        ], axis=-1)
        
        # Normalize
        norm = np.linalg.norm(normals, axis=-1, keepdims=True)
        normals = normals / (norm + 1e-8)
        
        # Map to [0, 1] for visualization
        normals_vis = (normals + 1) / 2
        
        return normals
    
    def estimate_room_layout(
        self, depth: np.ndarray, img: np.ndarray
    ) -> Dict:
        """
        Estimate room layout from depth map.
        
        Detects:
        - Floor plane (largest horizontal surface below camera)
        - Ceiling plane (horizontal surface above, roughly at fixed height)
        - Wall planes (vertical surfaces)
        
        Returns Manhattan-world layout with planes.
        """
        H, W = depth.shape
        
        # Create point cloud from depth
        # Assume standard camera intrinsics (can be refined later)
        fx = fy = max(W, H)  # approximate focal length
        cx, cy = W / 2, H / 2
        
        u, v = np.meshgrid(np.arange(W), np.arange(H))
        z = depth
        x = (u - cx) * z / fx
        y = (v - cy) * z / fy
        
        points = np.stack([x, y, z], axis=-1)  # [H, W, 3]
        
        # RANSAC plane detection
        floor_plane = self._detect_floor_plane(points)
        ceiling_plane = self._detect_ceiling_plane(points)
        wall_planes = self._detect_wall_planes(points)
        
        return {
            "floor": floor_plane,
            "ceiling": ceiling_plane,
            "walls": wall_planes,
            "point_cloud": points,
            "dimensions": {
                "width": float(np.max(x) - np.min(x)),
                "depth": float(np.max(z) - np.min(z)),
                "height": float(ceiling_plane["height"] - floor_plane["height"]),
            }
        }
    
    def _detect_floor_plane(self, points: np.ndarray) -> Dict:
        """Detect floor plane (lowest y values, near-horizontal normal)."""
        # Simple heuristic: lowest 20% of points
        y_values = points[:, :, 1].flatten()
        threshold = np.percentile(y_values, 20)
        
        floor_mask = points[:, :, 1] < threshold
        floor_points = points[floor_mask]
        
        if len(floor_points) < 100:
            # Fallback: assume floor at y=0
            return {"normal": [0, 1, 0], "height": 0.0, "points": None}
        
        # Fit plane
        centroid = np.mean(floor_points, axis=0)
        centered = floor_points - centroid
        _, _, vh = np.linalg.svd(centered, full_matrices=False)
        normal = vh[-1]  # smallest singular value direction
        
        # Ensure normal points up
        if normal[1] < 0:
            normal = -normal
        
        return {
            "normal": normal.tolist(),
            "height": float(centroid[1]),
            "centroid": centroid.tolist(),
            "points": floor_points.shape[0],
        }
    
    def _detect_ceiling_plane(self, points: np.ndarray) -> Dict:
        """Detect ceiling plane (highest y values)."""
        y_values = points[:, :, 1].flatten()
        threshold = np.percentile(y_values, 90)
        
        ceiling_mask = points[:, :, 1] > threshold
        ceiling_points = points[ceiling_mask]
        
        if len(ceiling_points) < 100:
            # Fallback: typical room height ~2.7m
            return {"normal": [0, -1, 0], "height": 2.7, "points": None}
        
        centroid = np.mean(ceiling_points, axis=0)
        return {
            "normal": [0, -1, 0],
            "height": float(centroid[1]),
            "points": ceiling_points.shape[0],
        }
    
    def _detect_wall_planes(self, points: np.ndarray) -> List[Dict]:
        """Detect wall planes from remaining points."""
        # Simplified: detect 4 walls for rectangular rooms
        # In production, use proper RANSAC or SpatialLM
        
        x = points[:, :, 0]
        z = points[:, :, 2]
        
        walls = []
        
        # Left wall (minimum x)
        x_min = np.percentile(x.flatten(), 5)
        left_mask = np.abs(x - x_min) < 0.3
        if np.sum(left_mask) > 100:
            walls.append({
                "normal": [1, 0, 0],
                "position": float(x_min),
                "direction": "left",
            })
        
        # Right wall (maximum x)
        x_max = np.percentile(x.flatten(), 95)
        right_mask = np.abs(x - x_max) < 0.3
        if np.sum(right_mask) > 100:
            walls.append({
                "normal": [-1, 0, 0],
                "position": float(x_max),
                "direction": "right",
            })
        
        # Back wall (minimum z)
        z_min = np.percentile(z.flatten(), 5)
        back_mask = np.abs(z - z_min) < 0.3
        if np.sum(back_mask) > 100:
            walls.append({
                "normal": [0, 0, 1],
                "position": float(z_min),
                "direction": "back",
            })
        
        # Front wall (maximum z)
        z_max = np.percentile(z.flatten(), 95)
        front_mask = np.abs(z - z_max) < 0.3
        if np.sum(front_mask) > 100:
            walls.append({
                "normal": [0, 0, -1],
                "position": float(z_max),
                "direction": "front",
            })
        
        return walls
    
    def segment_image(self, image: Image.Image) -> np.ndarray:
        """Run semantic segmentation to identify regions."""
        # Placeholder: in production, use fine-tuned indoor segmentation
        # For now, return a simple heuristic segmentation
        
        img_np = np.array(image)
        H, W = img_np.shape[:2]
        
        # Heuristic: classify based on position and color
        # Bottom 30% = floor, top 10% = ceiling, rest = walls + objects
        seg = np.zeros((H, W), dtype=np.int32)
        
        # Floor region
        floor_threshold = int(H * 0.7)
        seg[floor_threshold:] = 1  # floor
        
        # Ceiling region
        ceiling_threshold = int(H * 0.1)
        seg[:ceiling_threshold] = 2  # ceiling
        
        # Wall regions (sides)
        wall_threshold = int(W * 0.15)
        seg[ceiling_threshold:floor_threshold, :wall_threshold] = 3  # left wall
        seg[ceiling_threshold:floor_threshold, -wall_threshold:] = 4  # right wall
        
        return seg
    
    def detect_and_isolate_objects(
        self,
        image: Image.Image,
        depth: np.ndarray,
        semantic_seg: np.ndarray,
    ) -> Dict:
        """
        Detect and isolate furniture objects from the scene.
        
        Returns dict mapping object_id -> {
            crop: PIL Image,
            mask: binary mask,
            bbox: [x1, y1, x2, y2],
            class_name: str,
            depth_range: [min, max],
        }
        """
        # Placeholder: in production, use SAM + fine-tuned detector
        # For now, return a simple grid-based detection
        
        detected_objects = {}
        img_np = np.array(image)
        H, W = img_np.shape[:2]
        
        # Divide room into zones and detect likely objects
        # Floor zone: look for distinct objects above floor
        floor_y = int(H * 0.65)
        
        # Simple heuristic: detect high-gradient regions in floor area
        from scipy.ndimage import label
        
        floor_region = semantic_seg == 1
        depth_floor = depth.copy()
        depth_floor[~floor_region] = 0
        
        # Find objects by depth discontinuities in floor region
        depth_grad = np.abs(np.gradient(depth_floor)[0]) + \
                     np.abs(np.gradient(depth_floor)[1])
        
        object_mask = depth_grad > np.percentile(depth_grad, 85)
        labeled, num_features = label(object_mask)
        
        for i in range(1, min(num_features + 1, 10)):  # max 10 objects
            obj_mask = labeled == i
            ys, xs = np.where(obj_mask)
            
            if len(xs) < 100:  # skip tiny objects
                continue
            
            x1, y1 = int(xs.min()), int(ys.min())
            x2, y2 = int(xs.max()), int(ys.max())
            
            # Pad bbox
            pad = 20
            x1 = max(0, x1 - pad)
            y1 = max(0, y1 - pad)
            x2 = min(W, x2 + pad)
            y2 = min(H, y2 + pad)
            
            crop = image.crop((x1, y1, x2, y2))
            mask_crop = obj_mask[y1:y2, x1:x2]
            
            obj_depth = depth[obj_mask]
            
            detected_objects[i - 1] = {
                "crop": crop,
                "mask": mask_crop,
                "bbox": [x1, y1, x2, y2],
                "class_name": "furniture",  # would be classified in production
                "depth_range": [float(obj_depth.min()), float(obj_depth.max())],
            }
        
        return detected_objects
    
    def classify_room_type(self, image: Image.Image) -> str:
        """Classify room type from image."""
        # Placeholder: use CLIP or fine-tuned classifier
        # For now, return based on simple heuristics
        
        img_np = np.array(image)
        
        # Simple heuristic based on color distribution
        # In production, use fine-tuned model
        mean_color = img_np.mean(axis=(0, 1))
        
        # Very simple heuristic (would be replaced with proper classifier)
        if mean_color[2] > mean_color[0] + 20:  # more blue = maybe kitchen/bathroom
            return "kitchen"
        elif mean_color[0] > mean_color[1] + 20:  # more red = maybe bedroom
            return "bedroom"
        else:
            return "living_room"
    
    def classify_style(self, image: Image.Image) -> str:
        """Classify interior design style."""
        # Placeholder: use fine-tuned style classifier
        # Styles: modern, scandinavian, luxury, indian, commercial, minimalist
        
        img_np = np.array(image)
        mean_color = img_np.mean(axis=(0, 1))
        std_color = img_np.std(axis=(0, 1))
        
        # Simple heuristic (would be replaced with proper classifier)
        if std_color.mean() < 30:  # low color variation = minimalist/scandinavian
            return "scandinavian"
        elif mean_color.mean() > 180:  # bright = modern
            return "modern"
        elif mean_color[0] < 80 and mean_color[1] < 80 and mean_color[2] < 80:  # dark = luxury
            return "luxury"
        else:
            return "modern"