"""
Design Retrieval Module

Provides fast text-to-design retrieval: a text query is encoded with CLIP and
matched against precomputed image features to pick the closest LEGO design.
Enables end-to-end text-to-LEGO functionality.

Usage:
    from clip_retrieval import CLIPRetriever

    retriever = CLIPRetriever()
    result = retriever.get_best_match("red sports car")
    ldr_path = result["ldr_path"]
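
Alternative (module-level singleton; loads the model only once per process):
    from clip_retrieval import get_retriever

    retriever = get_retriever()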
"""

import os
import json
import numpy as np
import torch
from transformers import CLIPProcessor, CLIPModel
from typing import Dict, List, Optional
from cube3d.config import HF_CACHE_DIR


class CLIPRetriever:
    """
    CLIP-based design retrieval engine

    Loads precomputed image features and provides fast text-to-design retrieval.
    """

    def __init__(
        self,
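        # The default folder name below is Chinese for
        # "1313 filtered car structures and paired render images"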
        data_root: str = "data/1313个筛选车结构和对照渲染图",
        cache_dir: Optional[str] = None,
        model_name: str = "openai/clip-vit-base-patch32",
        device: Optional[str] = None
    ):
        """
        Initialize the design retriever

        Args:
            data_root: Path to the data directory
            cache_dir: Path to the feature cache directory (defaults to
                <data_root>/clip_features if None)
            model_name: CLIP model to use (served from the HF cache if preloaded)
            device: Device for the CLIP model ("cuda", "cpu", or None for auto)
        """
        self.data_root = data_root
        self.cache_dir = cache_dir or os.path.join(data_root, "clip_features")
        self.model_name = model_name

        # Resolve runtime device with safe CPU fallback (HF Spaces cpu/basic instances)
        self.device = self._resolve_device(device)

        # State
        self.model = None
        self.processor = None
        self.features = None
        self.metadata = None

        # Load cache and model
        self._load_cache()
        self._load_model()

    def _resolve_device(self, device_override: Optional[str]) -> str:
        """
        Decide which device to use for the CLIP encoder.

        Priority:
        1) Explicit argument
        2) Environment override: CLIP_DEVICE
        3) CUDA if available
        4) CPU fallback (avoids HF Spaces "no NVIDIA driver" failures)
        """
        if device_override:
            return device_override

        env_device = os.getenv("CLIP_DEVICE")
        if env_device:
            print(f"🔧 Using device from CLIP_DEVICE env: {env_device}")
            return env_device

        if torch.cuda.is_available():
            return "cuda"

        print("ℹ️  CUDA not available; defaulting CLIP to CPU")
        return "cpu"

    def _load_cache(self):
        """Load precomputed features and metadata"""
        features_path = os.path.join(self.cache_dir, "features.npy")
        metadata_path = os.path.join(self.cache_dir, "metadata.json")

        if not os.path.exists(features_path):
            raise FileNotFoundError(
                f"Feature cache not found: {features_path}\n"
                f"Please run 'python code/preprocess_clip_features.py' first"
            )

        if not os.path.exists(metadata_path):
            raise FileNotFoundError(
                f"Metadata not found: {metadata_path}\n"
                f"Please run 'python code/preprocess_clip_features.py' first"
            )

        # Load features
        self.features = np.load(features_path)

        # Load metadata
        with open(metadata_path, "r", encoding="utf-8") as f:
            self.metadata = json.load(f)

        print(f"Loaded {self.features.shape[0]} precomputed features")
        print(f"Feature dimension: {self.features.shape[1]}")

    def _load_model(self):
        """Load CLIP model using /data persistent cache

        Simplified loading strategy:
        - Use HF_CACHE_DIR (/data/.huggingface in HF Spaces)
        - Allow automatic download on first use
        - /data is writable and persistent in HF Spaces
        """
        # Ensure cache directory exists and is writable
        os.makedirs(HF_CACHE_DIR, exist_ok=True)

        print(f"Loading CLIP model: {self.model_name} on {self.device}")
        print(f"Cache directory: {HF_CACHE_DIR}")

        # Try preferred device first, then fall back to CPU if GPU is unavailable
        preferred_device = self.device
        device_attempts = [preferred_device]
        if preferred_device != "cpu":
            device_attempts.append("cpu")

        last_error = None

        for target_device in device_attempts:
            try:
                torch_dtype = torch.float16 if target_device.startswith("cuda") else torch.float32

                model = CLIPModel.from_pretrained(
                    self.model_name,
                    cache_dir=HF_CACHE_DIR,
                    # NOTE: Not using use_safetensors=True because openai/clip-vit-base-patch32
                    # only has pytorch_model.bin in main branch (model.safetensors exists in
                    # revision d15b5f2 but not merged). Using pytorch_model.bin is safe for
                    # official OpenAI model with local_files_only=True (prevents malicious replacements)
                    torch_dtype=torch_dtype,
                    local_files_only=True  # Use pre-downloaded model from build
                ).to(target_device)

                processor = CLIPProcessor.from_pretrained(
                    self.model_name,
                    cache_dir=HF_CACHE_DIR,
                    # Processor doesn't have weight files, use_safetensors not applicable
                    local_files_only=True  # Use pre-downloaded model from build
                )

                self.model = model
                self.processor = processor
                self.device = target_device
                self.model.eval()

                if target_device != preferred_device:
                    print(f"ℹ️  CLIP loaded on {target_device} (fallback from {preferred_device})")
                else:
                    print("✅ CLIP model loaded successfully")
                return

            except Exception as e:
                last_error = e
                print(f"⚠️  CLIP load failed on {target_device}: {e}")
                continue

        # If we reach here, all attempts failed
        raise RuntimeError(
            f"Failed to load CLIP model from {self.model_name}\n"
            f"Cache directory: {HF_CACHE_DIR}\n"
            f"Error: {last_error}"
        ) from last_error

    def _encode_text(self, text: str) -> np.ndarray:
        """
        Encode text query to CLIP feature vector

        Args:
            text: Text query

        Returns:
            Normalized feature vector (shape: [feature_dim]; 512 for clip-vit-base-patch32)
        """
        # Preprocess text
        inputs = self.processor(text=[text], return_tensors="pt", padding=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Extract features
        with torch.no_grad():
            text_features = self.model.get_text_features(**inputs)
            # Normalize (important for cosine similarity)
            text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)

        return text_features.cpu().numpy().flatten()

    def search(self, query: str, top_k: int = 5) -> List[Dict]:
        """
        Retrieve design candidates for a text query

        Args:
            query: Text description (e.g., "red sports car")
            top_k: Number of candidates to return

        Returns:
            List of dictionaries containing:
                - car_id: Car ID
                - image_path: Path to the rendering image
                - ldr_path: Path to the LDR file
                - similarity: Cosine similarity to the query (higher is better)
                - rank: Result rank (1-based)
                - ldr_exists: Whether the LDR file exists according to the metadata
        """
        # Encode text query
        text_feature = self._encode_text(query)

        # Compute cosine similarity with all image features
        # (features are already normalized, so dot product = cosine similarity)
        similarities = self.features @ text_feature

        # Get top-K indices
        top_indices = np.argsort(similarities)[::-1][:top_k]

        # Build results
        results = []
        for rank, idx in enumerate(top_indices, start=1):
            mapping = self.metadata["mappings"][idx]
            results.append({
                "car_id": mapping["car_id"],
                "image_path": os.path.join(self.data_root, mapping["image_path"]),
                "ldr_path": os.path.join(self.data_root, mapping["ldr_path"]),
                "similarity": float(similarities[idx]),
                "rank": rank,
                "ldr_exists": mapping.get("ldr_exists", True)
            })

        return results

    def get_best_match(self, query: str) -> Optional[Dict]:
        """
        Get the single best matching result

        Args:
            query: Text description

        Returns:
            Dictionary with best match information, or None if there are no results
        """
        results = self.search(query, top_k=1)
        return results[0] if results else None

    def get_ldr_path_from_text(self, query: str) -> str:
        """
        Convenience method: directly get LDR path from text query

        Args:
            query: Text description

        Returns:
            Path to the best matching LDR file (joined onto data_root)
        """
        best_match = self.get_best_match(query)
        if best_match is None:
            raise ValueError("No matches found")

        return best_match["ldr_path"]


# Singleton instance for global access
_global_retriever: Optional[CLIPRetriever] = None


def get_retriever(**kwargs) -> CLIPRetriever:
    """
    Get or create global retriever instance

    This ensures the model is only loaded once.

    Args:
        **kwargs: Passed to CLIPRetriever constructor

    Returns:
        CLIPRetriever instance
    """
    global _global_retriever

    if _global_retriever is None:
        _global_retriever = CLIPRetriever(**kwargs)

    return _global_retriever


if __name__ == "__main__":
    # Simple test
    print("=" * 60)
    print("Testing Design Generation Engine")
    print("=" * 60)

    retriever = CLIPRetriever()

    test_queries = [
        "red sports car",
        "blue police car",
        "yellow construction vehicle",
        "racing car",
        "truck"
    ]

    for query in test_queries:
        print(f"\nQuery: '{query}'")
        results = retriever.search(query, top_k=3)

        for result in results:
            print(f"  Rank {result['rank']}: car_{result['car_id']} "
                  f"(confidence: {result['similarity']:.3f})")