Spaces:

KinetoLabs
/

SmokeScan

Paused

File size: 20,184 Bytes

14c59e5
88bdcff
706520f
14c59e5
706520f
 
14c59e5
455c786
 
14c59e5
455c786
 
88bdcff
 
ed575b1
 
 
14c59e5
ed575b1
88bdcff
 
 
f3ebc82
88bdcff
 
 
 
706520f
88bdcff
 
 
 
 
 
5f0db1e
 
14c59e5
 
706520f
 
5f0db1e
88bdcff
 
 
 
333c083
88bdcff
5f0db1e
 
f3ebc82
 
5f0db1e
f3ebc82
5f0db1e
 
 
 
 
 
333c083
706520f
5f0db1e
706520f
5f0db1e
333c083
 
5f0db1e
 
706520f
5f0db1e
 
333c083
5f0db1e
14c59e5
706520f
 
 
 
 
 
 
 
14c59e5
333c083
b85b1e0
 
1b7fbd7
333c083
5f0db1e
706520f
 
 
333c083
 
706520f
 
 
 
 
 
 
 
333c083
5f0db1e
706520f
 
 
f3ebc82
 
455c786
 
 
 
88bdcff
 
455c786
f3ebc82
88bdcff
706520f
f3ebc82
 
455c786
 
 
 
88bdcff
 
455c786
f3ebc82
88bdcff
333c083
 
 
5f0db1e
 
 
88bdcff
333c083
 
88bdcff
c190082
706520f
 
333c083
 
706520f
 
 
 
333c083
c190082
 
 
 
333c083
 
c190082
 
 
 
 
333c083
 
c190082
 
88bdcff
706520f
 
333c083
14c59e5
706520f
 
333c083
706520f
333c083
88bdcff
706520f
f3ebc82
 
 
706520f
f3ebc82
 
 
 
 
 
 
 
 
 
 
 
 
706520f
f3ebc82
 
 
 
 
 
706520f
 
 
 
 
 
 
 
 
88bdcff
 
 
706520f
 
88bdcff
 
 
706520f
88bdcff
706520f
 
88bdcff
 
 
706520f
 
88bdcff
706520f
88bdcff
 
 
 
 
 
706520f
88bdcff
 
 
706520f
88bdcff
 
 
706520f
88bdcff
 
706520f
88bdcff
 
 
706520f
 
 
 
88bdcff
 
706520f
333c083
706520f
 
 
 
 
 
333c083
f3ebc82
706520f
f3ebc82
333c083
706520f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333c083
 
 
 
 
 
 
 
706520f
333c083
 
 
 
 
 
 
 
 
706520f
 
88bdcff
706520f
 
 
 
 
 
88bdcff
706520f
88bdcff
 
706520f
88bdcff
 
 
 
706520f
88bdcff
333c083
 
706520f
88bdcff
333c083
706520f
88bdcff
 
 
 
 
 
 
706520f
88bdcff
 
333c083
88bdcff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f3ebc82
 
455c786
 
706520f
 
f3ebc82
88bdcff
 
455c786
 
 
 
 
 
88bdcff
 
 
455c786
 
f3ebc82
455c786
 
 
 
f3ebc82
455c786
 
f3ebc82
455c786
706520f
f3ebc82
88bdcff
455c786
 
 
88bdcff
455c786
88bdcff
 
 
 
706520f
 
88bdcff
 
 
455c786
 
 
 
 
 
 
 
 
 
706520f
455c786
88bdcff
 
 
f3ebc82
 
455c786
 
f3ebc82
 
 
 
706520f
f3ebc82
88bdcff
 
455c786
f3ebc82
455c786
 
 
f3ebc82
455c786
 
f3ebc82
455c786
 
f3ebc82
455c786
 
 
 
 
88bdcff
455c786
 
 
88bdcff
455c786
 
 
88bdcff
 
 
 
 
455c786
 
 
 
 
 
 
 
88bdcff
 
455c786
 
88bdcff

"""Real model loading for production (HuggingFace Spaces).

This module loads the production models:
- Vision: Qwen/Qwen3-VL-4B-Thinking (~10GB via vLLM, single GPU)
- Embedding: Qwen/Qwen3-VL-Embedding-2B (~4GB)
- Reranker: Qwen/Qwen3-VL-Reranker-2B (~4GB)
- Total: ~18GB on single L4 GPU (22GB)

Model Loading:
- Vision: vLLM with single GPU (no tensor parallelism needed)
- Embedding: Qwen3VLEmbedder (official scripts from QwenLM/Qwen3-VL-Embedding)
- Reranker: Qwen3VLReranker (official scripts from QwenLM/Qwen3-VL-Embedding)
"""

import os

# vLLM environment variables - MUST be set before importing vLLM
# Note: Using single GPU (TP=1) so NCCL workarounds are not needed

import json
import logging
import re
import time
import torch
from typing import Any
from PIL import Image

from config.inference import vision_config
from config.settings import settings

logger = logging.getLogger(__name__)


class RealModelStack:
    """Real model stack for production on HuggingFace Spaces.

    Loads all 3 models at initialization (~18GB total on single GPU):
    - Vision 4B via vLLM: ~10GB
    - Embedding 2B: ~4GB
    - Reranker 2B: ~4GB
    """

    def __init__(self):
        self.models: dict[str, Any] = {}
        self.processors: dict[str, Any] = {}
        self._loaded = False

    def _log_gpu_status(self):
        """Log current GPU memory status."""
        if torch.cuda.is_available():
            gpu_count = torch.cuda.device_count()
            logger.info(f"GPU memory status ({gpu_count} devices):")
            for i in range(gpu_count):
                total = torch.cuda.get_device_properties(i).total_memory / (1024**3)
                allocated = torch.cuda.memory_allocated(i) / (1024**3)
                cached = torch.cuda.memory_reserved(i) / (1024**3)
                free = total - allocated
                logger.info(f"  GPU {i}: {allocated:.1f}GB allocated, {cached:.1f}GB cached, {free:.1f}GB free / {total:.1f}GB total")

    def load_all(self) -> "RealModelStack":
        """Load all models.

        Loads FP8 vision model via vLLM and RAG models (Embedding + Reranker).
        """
        if self._loaded:
            logger.debug("Models already loaded, skipping")
            return self

        logger.info("Loading production models...")
        self._log_gpu_status()

        total_start = time.time()

        # Vision model via vLLM (~10GB for 4B model)
        logger.info(f"Loading vision model: {settings.vision_model}")
        vision_start = time.time()

        from vllm import LLM, SamplingParams
        from transformers import AutoProcessor

        self.models["vision"] = LLM(
            model=settings.vision_model,
            tensor_parallel_size=settings.vllm_tensor_parallel_size,  # 1 for single GPU
            trust_remote_code=True,
            gpu_memory_utilization=0.55,  # Leave ~10GB for embedding + reranker
            max_model_len=8192,  # Reduced to save KV cache memory
            enforce_eager=True,  # Skip torch.compile to reduce memory overhead
        )

        # Load processor for chat template formatting
        self.processors["vision"] = AutoProcessor.from_pretrained(
            settings.vision_model,
            trust_remote_code=True,
        )

        # Store sampling params for inference
        self.models["vision_sampling_params"] = SamplingParams(
            max_tokens=vision_config.max_tokens,
            temperature=vision_config.temperature,
            top_p=vision_config.top_p,
            top_k=vision_config.top_k,
            repetition_penalty=vision_config.repetition_penalty,
        )

        logger.info(f"Vision model loaded in {time.time() - vision_start:.2f}s")

        # Embedding model (~4GB in BF16) - Using official Qwen3VLEmbedder
        logger.info(f"Loading embedding model: {settings.embedding_model}")
        embed_start = time.time()
        from scripts.qwen3_vl import Qwen3VLEmbedder

        self.models["embedding"] = Qwen3VLEmbedder(
            model_name_or_path=settings.embedding_model,
            torch_dtype=torch.bfloat16,
        )
        self.processors["embedding"] = self.models["embedding"].processor
        logger.info(f"Embedding model loaded in {time.time() - embed_start:.2f}s")

        # Reranker model (~4GB in BF16) - Using official Qwen3VLReranker
        logger.info(f"Loading reranker model: {settings.reranker_model}")
        reranker_start = time.time()
        from scripts.qwen3_vl import Qwen3VLReranker

        self.models["reranker"] = Qwen3VLReranker(
            model_name_or_path=settings.reranker_model,
            torch_dtype=torch.bfloat16,
        )
        self.processors["reranker"] = self.models["reranker"].processor
        logger.info(f"Reranker model loaded in {time.time() - reranker_start:.2f}s")

        self._loaded = True
        total_time = time.time() - total_start
        logger.info(f"All models loaded in {total_time:.2f}s")
        self._log_gpu_status()
        return self

    def is_loaded(self) -> bool:
        """Check if models are loaded."""
        return self._loaded

    @property
    def vision(self) -> "VisionModel":
        """Return FP8 vision model wrapped for pipeline consumption."""
        if not self._loaded:
            raise RuntimeError("Models not loaded. Call load_all() first.")
        return VisionModel(
            model=self.models["vision"],
            processor=self.processors["vision"],
            sampling_params=self.models["vision_sampling_params"],
        )

    @property
    def embedding(self) -> "RealEmbeddingModel":
        """Return embedding model wrapped for pipeline consumption."""
        if not self._loaded:
            raise RuntimeError("Models not loaded. Call load_all() first.")
        return RealEmbeddingModel(self.models["embedding"], self.processors["embedding"])

    @property
    def reranker(self) -> "RealRerankerModel":
        """Return reranker model wrapped for pipeline consumption."""
        if not self._loaded:
            raise RuntimeError("Models not loaded. Call load_all() first.")
        return RealRerankerModel(self.models["reranker"], self.processors["reranker"])


class VisionModel:
    """Vision model for fire damage analysis.

    Uses Qwen/Qwen3-VL-4B-Thinking via vLLM for inference.
    Reasoning-enhanced model handles analysis with extended thinking
    and outputs structured JSON.

    Pipeline: Image -> Thinking Model (reasoning + JSON) -> Output
    """

    # System prompt for FDAM fire damage assessment
    VISION_SYSTEM_PROMPT = """You are an expert industrial hygienist analyzing fire damage images for the FDAM (Fire Damage Assessment Methodology) framework.

## Your Task
Analyze the provided image and return a structured JSON response with fire damage assessment.

## Zone Classification Criteria
- **Burn Zone**: Direct fire involvement. Look for structural char, complete combustion, exposed/damaged structural elements.
- **Near-Field**: Adjacent to burn zone with heavy smoke/heat exposure. Look for heavy soot deposits, heat damage (warping, discoloration), strong visible contamination.
- **Far-Field**: Smoke migration without direct heat exposure. Look for light to moderate deposits, discoloration, no structural damage.

## Condition Assessment Criteria
- **Background**: No visible contamination; surfaces appear normal/clean.
- **Light**: Faint discoloration; minimal visible deposits; would show faint marks on white wipe test.
- **Moderate**: Visible film or deposits; clear contamination; surface color noticeably altered.
- **Heavy**: Thick deposits; surface texture obscured; heavy coating visible.
- **Structural Damage**: Physical damage requiring repair before cleaning (charring, warping, holes, collapse).

## Material Categories
- **Non-porous**: steel, concrete, glass, metal, CMU (concrete masonry unit)
- **Semi-porous**: painted drywall, sealed wood
- **Porous**: unpainted drywall, carpet, insulation, acoustic tile, upholstery
- **HVAC**: rigid ductwork, flexible ductwork

## Combustion Particle Visual Indicators
- **Soot**: Black/dark gray coating with oily/sticky appearance; fine uniform texture
- **Char**: Black angular fragments; visible wood grain or fibrous structure
- **Ash**: Gray/white powdery residue; crystalline appearance"""

    # JSON output format prompt
    JSON_FORMAT_PROMPT = """Analyze this fire damage image and return a JSON response with this exact structure:

{
    "zone": {
        "classification": "burn" | "near-field" | "far-field",
        "confidence": 0.0-1.0,
        "reasoning": "explanation"
    },
    "condition": {
        "level": "background" | "light" | "moderate" | "heavy" | "structural-damage",
        "confidence": 0.0-1.0,
        "reasoning": "explanation"
    },
    "materials": [
        {
            "type": "material type",
            "category": "non-porous" | "semi-porous" | "porous" | "hvac",
            "confidence": 0.0-1.0,
            "location_description": "where in image",
            "bounding_box": {"x": 0.0-1.0, "y": 0.0-1.0, "width": 0.0-1.0, "height": 0.0-1.0}
        }
    ],
    "combustion_indicators": {
        "soot_visible": true/false,
        "soot_pattern": "description or null",
        "char_visible": true/false,
        "char_description": "description or null",
        "ash_visible": true/false,
        "ash_description": "description or null"
    },
    "structural_concerns": ["list of structural issues if any"],
    "access_issues": ["list of access problems if any"],
    "recommended_sampling_locations": [
        {
            "description": "where to sample",
            "sample_type": "tape_lift" | "surface_wipe" | "air_sample",
            "priority": "high" | "medium" | "low"
        }
    ],
    "flags_for_review": ["any items requiring human review"]
}

IMPORTANT: Return ONLY valid JSON, no additional text."""

    def __init__(self, model, processor, sampling_params):
        self.model = model
        self.processor = processor
        self.sampling_params = sampling_params

    def analyze_image(self, image: Image.Image, context: str = "") -> dict[str, Any]:
        """Analyze an image using the FP8 vision model via vLLM.

        Args:
            image: PIL Image to analyze
            context: Optional context string (room info, etc.)

        Returns:
            Structured dict with zone, condition, materials, etc.
        """
        start_time = time.time()
        logger.debug(f"Starting FP8 vision analysis (context: {len(context)} chars)")

        try:
            # Build messages in Qwen3-VL format
            messages = self._build_messages(image, context)

            # Apply chat template to format prompt correctly
            prompt = self.processor.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )

            # Generate response using vLLM multimodal API
            # Per vLLM docs: pass PIL image directly in multi_modal_data dict
            outputs = self.model.generate(
                prompts=[{
                    "prompt": prompt,
                    "multi_modal_data": {"image": image},  # Single PIL image
                }],
                sampling_params=self.sampling_params,
            )

            response_text = outputs[0].outputs[0].text

            # Parse JSON from response
            result = self._parse_json_response(response_text)

            # Log result summary
            total_time = time.time() - start_time
            zone = result.get("zone", {}).get("classification", "unknown")
            zone_conf = result.get("zone", {}).get("confidence", 0)
            condition = result.get("condition", {}).get("level", "unknown")
            condition_conf = result.get("condition", {}).get("confidence", 0)
            num_materials = len(result.get("materials", []))
            logger.info(f"Vision analysis complete in {total_time:.2f}s: "
                       f"zone={zone} ({zone_conf:.2f}), condition={condition} ({condition_conf:.2f}), "
                       f"materials={num_materials}")

            return result

        except Exception as e:
            logger.error(f"Vision analysis failed: {e}")
            return self._get_fallback_response(str(e))

    def _build_messages(self, image: Image.Image, context: str) -> list[dict]:
        """Build messages in Qwen3-VL format for chat template.

        Qwen3-VL expects:
        - System message with role="system"
        - User message with mixed content [{"type": "image", ...}, {"type": "text", ...}]
        """
        # Build user text content
        user_text = self.JSON_FORMAT_PROMPT
        if context:
            user_text = f"Context: {context}\n\n{user_text}"

        messages = [
            {"role": "system", "content": self.VISION_SYSTEM_PROMPT},
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": user_text},
                ],
            },
        ]
        return messages

    def _parse_json_response(self, response: str) -> dict[str, Any]:
        """Parse JSON response from model."""
        try:
            # Try to extract JSON from response
            json_match = re.search(r'\{[\s\S]*\}', response)
            if json_match:
                json_str = json_match.group()
                return json.loads(json_str)
            else:
                logger.warning("No JSON found in response")
                return self._get_fallback_response("No JSON in response")
        except json.JSONDecodeError as e:
            logger.warning(f"Failed to parse JSON: {e}")
            return self._get_fallback_response(f"JSON parse error: {e}")

    def _get_fallback_response(self, reason: str) -> dict[str, Any]:
        """Return fallback response when analysis fails."""
        return {
            "zone": {
                "classification": "far-field",
                "confidence": 0.3,
                "reasoning": f"Fallback due to: {reason}",
            },
            "condition": {
                "level": "light",
                "confidence": 0.3,
                "reasoning": f"Fallback due to: {reason}",
            },
            "materials": [
                {
                    "type": "general-surface",
                    "category": "semi-porous",
                    "confidence": 0.3,
                    "location_description": "Unable to determine",
                    "bounding_box": {"x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0},
                }
            ],
            "combustion_indicators": {
                "soot_visible": False,
                "soot_pattern": None,
                "char_visible": False,
                "char_description": None,
                "ash_visible": False,
                "ash_description": None,
            },
            "structural_concerns": [],
            "access_issues": [],
            "recommended_sampling_locations": [],
            "flags_for_review": [f"Analysis failed: {reason}"],
            "_fallback_used": True,
        }


class RealEmbeddingModel:
    """Wrapper for real embedding model inference.

    Uses the official Qwen3VLEmbedder from QwenLM/Qwen3-VL-Embedding.
    The model handles last-token pooling and L2 normalization internally.

    Model: Qwen/Qwen3-VL-Embedding-2B (2048-dim output)
    """

    def __init__(self, model, processor):
        """Initialize with Qwen3VLEmbedder instance.

        Args:
            model: Qwen3VLEmbedder instance (official loader)
            processor: Processor (stored for compatibility, but model has its own)
        """
        self.model = model
        self.processor = processor

    def embed(self, text: str) -> list[float]:
        """Generate embedding for text using official Qwen3VLEmbedder.

        The official model.process() handles:
        - Tokenization and preprocessing
        - Last-token pooling
        - L2 normalization

        Args:
            text: Input text to embed

        Returns:
            List of floats representing the embedding (2048-dim for 2B model)
        """
        try:
            # Use official process() API - expects list of dicts
            inputs = [{"text": text}]
            embeddings = self.model.process(inputs, normalize=True)

            # embeddings is a tensor of shape (1, hidden_dim)
            return embeddings[0].cpu().tolist()

        except Exception as e:
            logger.error(f"Embedding generation failed: {e}")
            # Return zero vector as fallback (2048-dim per Qwen3-VL-Embedding-2B)
            hidden_size = getattr(self.model.model.config, "hidden_size", 2048)
            return [0.0] * hidden_size

    def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for a batch of texts.

        Uses official batch processing for efficiency.
        """
        try:
            inputs = [{"text": text} for text in texts]
            embeddings = self.model.process(inputs, normalize=True)
            return [emb.cpu().tolist() for emb in embeddings]
        except Exception as e:
            logger.error(f"Batch embedding generation failed: {e}")
            hidden_size = getattr(self.model.model.config, "hidden_size", 2048)
            return [[0.0] * hidden_size for _ in texts]


class RealRerankerModel:
    """Wrapper for real reranker model inference.

    Uses the official Qwen3VLReranker from QwenLM/Qwen3-VL-Embedding.
    The model handles yes/no scoring internally via:
    - Extracts "yes" and "no" token weights from the LM head
    - Creates a binary linear layer: weight = yes_weight - no_weight
    - Scores = sigmoid(linear(last_token_hidden_state))

    Model: Qwen/Qwen3-VL-Reranker-2B
    """

    def __init__(self, model, processor):
        """Initialize with Qwen3VLReranker instance.

        Args:
            model: Qwen3VLReranker instance (official loader)
            processor: Processor (stored for compatibility, but model has its own)
        """
        self.model = model
        self.processor = processor

    def rerank(self, query: str, documents: list[str]) -> list[float]:
        """Rerank documents by relevance to query using official Qwen3VLReranker.

        The official model.process() handles:
        - Proper message formatting
        - Tokenization
        - Yes/no scoring with LM head weights
        - Sigmoid normalization

        Args:
            query: The search query
            documents: List of documents to rerank

        Returns:
            List of relevance scores (0-1) for each document.
            Higher scores indicate more relevant documents.
        """
        if not documents:
            return []

        try:
            # Use official process() API - expects dict with query and documents
            inputs = {
                "instruction": "Retrieve relevant documents for the query.",
                "query": {"text": query},
                "documents": [{"text": doc} for doc in documents],
            }
            scores = self.model.process(inputs)
            return scores

        except Exception as e:
            logger.error(f"Reranking failed: {e}")
            return [0.0] * len(documents)

    def rerank_with_indices(
        self, query: str, documents: list[str], top_k: int = None
    ) -> list[tuple[int, float]]:
        """Rerank and return sorted (index, score) tuples.

        Args:
            query: The search query
            documents: List of documents to rerank
            top_k: Optional limit on number of results

        Returns:
            List of (original_index, score) tuples, sorted by score descending
        """
        scores = self.rerank(query, documents)

        # Create (index, score) pairs and sort by score descending
        indexed_scores = list(enumerate(scores))
        indexed_scores.sort(key=lambda x: x[1], reverse=True)

        if top_k is not None:
            indexed_scores = indexed_scores[:top_k]

        return indexed_scores