File size: 20,184 Bytes
14c59e5
88bdcff
706520f
14c59e5
706520f
 
14c59e5
455c786
 
14c59e5
455c786
 
88bdcff
 
ed575b1
 
 
14c59e5
ed575b1
88bdcff
 
 
f3ebc82
88bdcff
 
 
 
706520f
88bdcff
 
 
 
 
 
5f0db1e
 
14c59e5
 
706520f
 
5f0db1e
88bdcff
 
 
 
333c083
88bdcff
5f0db1e
 
f3ebc82
 
5f0db1e
f3ebc82
5f0db1e
 
 
 
 
 
333c083
706520f
5f0db1e
706520f
5f0db1e
333c083
 
5f0db1e
 
706520f
5f0db1e
 
333c083
5f0db1e
14c59e5
706520f
 
 
 
 
 
 
 
14c59e5
333c083
b85b1e0
 
1b7fbd7
333c083
5f0db1e
706520f
 
 
333c083
 
706520f
 
 
 
 
 
 
 
333c083
5f0db1e
706520f
 
 
f3ebc82
 
455c786
 
 
 
88bdcff
 
455c786
f3ebc82
88bdcff
706520f
f3ebc82
 
455c786
 
 
 
88bdcff
 
455c786
f3ebc82
88bdcff
333c083
 
 
5f0db1e
 
 
88bdcff
333c083
 
88bdcff
c190082
706520f
 
333c083
 
706520f
 
 
 
333c083
c190082
 
 
 
333c083
 
c190082
 
 
 
 
333c083
 
c190082
 
88bdcff
706520f
 
333c083
14c59e5
706520f
 
333c083
706520f
333c083
88bdcff
706520f
f3ebc82
 
 
706520f
f3ebc82
 
 
 
 
 
 
 
 
 
 
 
 
706520f
f3ebc82
 
 
 
 
 
706520f
 
 
 
 
 
 
 
 
88bdcff
 
 
706520f
 
88bdcff
 
 
706520f
88bdcff
706520f
 
88bdcff
 
 
706520f
 
88bdcff
706520f
88bdcff
 
 
 
 
 
706520f
88bdcff
 
 
706520f
88bdcff
 
 
706520f
88bdcff
 
706520f
88bdcff
 
 
706520f
 
 
 
88bdcff
 
706520f
333c083
706520f
 
 
 
 
 
333c083
f3ebc82
706520f
f3ebc82
333c083
706520f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333c083
 
 
 
 
 
 
 
706520f
333c083
 
 
 
 
 
 
 
 
706520f
 
88bdcff
706520f
 
 
 
 
 
88bdcff
706520f
88bdcff
 
706520f
88bdcff
 
 
 
706520f
88bdcff
333c083
 
706520f
88bdcff
333c083
706520f
88bdcff
 
 
 
 
 
 
706520f
88bdcff
 
333c083
88bdcff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f3ebc82
 
455c786
 
706520f
 
f3ebc82
88bdcff
 
455c786
 
 
 
 
 
88bdcff
 
 
455c786
 
f3ebc82
455c786
 
 
 
f3ebc82
455c786
 
f3ebc82
455c786
706520f
f3ebc82
88bdcff
455c786
 
 
88bdcff
455c786
88bdcff
 
 
 
706520f
 
88bdcff
 
 
455c786
 
 
 
 
 
 
 
 
 
706520f
455c786
88bdcff
 
 
f3ebc82
 
455c786
 
f3ebc82
 
 
 
706520f
f3ebc82
88bdcff
 
455c786
f3ebc82
455c786
 
 
f3ebc82
455c786
 
f3ebc82
455c786
 
f3ebc82
455c786
 
 
 
 
88bdcff
455c786
 
 
88bdcff
455c786
 
 
88bdcff
 
 
 
 
455c786
 
 
 
 
 
 
 
88bdcff
 
455c786
 
88bdcff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
"""Real model loading for production (HuggingFace Spaces).

This module loads the production models:
- Vision: Qwen/Qwen3-VL-4B-Thinking (~10GB via vLLM, single GPU)
- Embedding: Qwen/Qwen3-VL-Embedding-2B (~4GB)
- Reranker: Qwen/Qwen3-VL-Reranker-2B (~4GB)
- Total: ~18GB on single L4 GPU (22GB)

Model Loading:
- Vision: vLLM with single GPU (no tensor parallelism needed)
- Embedding: Qwen3VLEmbedder (official scripts from QwenLM/Qwen3-VL-Embedding)
- Reranker: Qwen3VLReranker (official scripts from QwenLM/Qwen3-VL-Embedding)
"""

import os

# vLLM environment variables - MUST be set before importing vLLM
# Note: Using single GPU (TP=1) so NCCL workarounds are not needed

import json
import logging
import re
import time
import torch
from typing import Any
from PIL import Image

from config.inference import vision_config
from config.settings import settings

logger = logging.getLogger(__name__)


class RealModelStack:
    """Holder for the three production models and their processors.

    After ``load_all()`` the ``models`` / ``processors`` dicts contain:
    - ``"vision"``: a vLLM ``LLM`` built from ``settings.vision_model``, its
      ``AutoProcessor``, and a ``SamplingParams`` stored in ``models`` under
      ``"vision_sampling_params"``
    - ``"embedding"``: a ``Qwen3VLEmbedder`` built from ``settings.embedding_model``
    - ``"reranker"``: a ``Qwen3VLReranker`` built from ``settings.reranker_model``

    The module notes budget roughly 10GB + 4GB + 4GB (~18GB) on a single GPU.
    """

    def __init__(self):
        # Nothing is loaded at construction time; load_all() fills these in.
        self._loaded = False
        self.models: dict[str, Any] = {}
        self.processors: dict[str, Any] = {}

    def _log_gpu_status(self):
        """Log per-device memory figures; does nothing when CUDA is unavailable."""
        if not torch.cuda.is_available():
            return

        gib = 1024**3
        device_total = torch.cuda.device_count()
        logger.info(f"GPU memory status ({device_total} devices):")
        for idx in range(device_total):
            capacity = torch.cuda.get_device_properties(idx).total_memory / gib
            in_use = torch.cuda.memory_allocated(idx) / gib
            reserved = torch.cuda.memory_reserved(idx) / gib
            # "free" is capacity minus allocated memory (reserved is reported separately).
            remaining = capacity - in_use
            logger.info(f"  GPU {idx}: {in_use:.1f}GB allocated, {reserved:.1f}GB cached, {remaining:.1f}GB free / {capacity:.1f}GB total")

    def load_all(self) -> "RealModelStack":
        """Load the vision, embedding and reranker models, in that order.

        A second call after a successful load is a no-op.

        Returns:
            This instance, so the call can be chained.
        """
        if self._loaded:
            logger.debug("Models already loaded, skipping")
            return self

        logger.info("Loading production models...")
        self._log_gpu_status()

        began = time.time()
        self._load_vision()
        self._load_embedding()
        self._load_reranker()

        # Only flag success once every model has been constructed.
        self._loaded = True
        elapsed = time.time() - began
        logger.info(f"All models loaded in {elapsed:.2f}s")
        self._log_gpu_status()
        return self

    def _load_vision(self):
        """Create the vLLM engine, its processor and the sampling parameters."""
        logger.info(f"Loading vision model: {settings.vision_model}")
        began = time.time()

        # Imported lazily so the heavy dependencies are only pulled in on load.
        from vllm import LLM, SamplingParams
        from transformers import AutoProcessor

        engine = LLM(
            model=settings.vision_model,
            tensor_parallel_size=settings.vllm_tensor_parallel_size,  # 1 for single GPU
            trust_remote_code=True,
            gpu_memory_utilization=0.55,  # Leave ~10GB for embedding + reranker
            max_model_len=8192,  # Reduced to save KV cache memory
            enforce_eager=True,  # Skip torch.compile to reduce memory overhead
        )
        self.models["vision"] = engine

        # The processor is used later to apply the chat template to prompts.
        self.processors["vision"] = AutoProcessor.from_pretrained(
            settings.vision_model,
            trust_remote_code=True,
        )

        # Sampling parameters are built once here and reused for every request.
        self.models["vision_sampling_params"] = SamplingParams(
            max_tokens=vision_config.max_tokens,
            temperature=vision_config.temperature,
            top_p=vision_config.top_p,
            top_k=vision_config.top_k,
            repetition_penalty=vision_config.repetition_penalty,
        )

        logger.info(f"Vision model loaded in {time.time() - began:.2f}s")

    def _load_embedding(self):
        """Create the Qwen3VLEmbedder in bfloat16 and keep its processor."""
        logger.info(f"Loading embedding model: {settings.embedding_model}")
        began = time.time()
        from scripts.qwen3_vl import Qwen3VLEmbedder

        embedder = Qwen3VLEmbedder(
            model_name_or_path=settings.embedding_model,
            torch_dtype=torch.bfloat16,
        )
        self.models["embedding"] = embedder
        self.processors["embedding"] = self.models["embedding"].processor
        logger.info(f"Embedding model loaded in {time.time() - began:.2f}s")

    def _load_reranker(self):
        """Create the Qwen3VLReranker in bfloat16 and keep its processor."""
        logger.info(f"Loading reranker model: {settings.reranker_model}")
        began = time.time()
        from scripts.qwen3_vl import Qwen3VLReranker

        ranker = Qwen3VLReranker(
            model_name_or_path=settings.reranker_model,
            torch_dtype=torch.bfloat16,
        )
        self.models["reranker"] = ranker
        self.processors["reranker"] = self.models["reranker"].processor
        logger.info(f"Reranker model loaded in {time.time() - began:.2f}s")

    def is_loaded(self) -> bool:
        """Return True once load_all() has completed."""
        return self._loaded

    def _require_loaded(self):
        """Raise RuntimeError unless load_all() has completed."""
        if not self._loaded:
            raise RuntimeError("Models not loaded. Call load_all() first.")

    @property
    def vision(self) -> "VisionModel":
        """Vision model wrapped for pipeline consumption (new wrapper per access).

        Raises:
            RuntimeError: If load_all() has not completed.
        """
        self._require_loaded()
        return VisionModel(
            model=self.models["vision"],
            processor=self.processors["vision"],
            sampling_params=self.models["vision_sampling_params"],
        )

    @property
    def embedding(self) -> "RealEmbeddingModel":
        """Embedding model wrapped for pipeline consumption (new wrapper per access).

        Raises:
            RuntimeError: If load_all() has not completed.
        """
        self._require_loaded()
        return RealEmbeddingModel(self.models["embedding"], self.processors["embedding"])

    @property
    def reranker(self) -> "RealRerankerModel":
        """Reranker model wrapped for pipeline consumption (new wrapper per access).

        Raises:
            RuntimeError: If load_all() has not completed.
        """
        self._require_loaded()
        return RealRerankerModel(self.models["reranker"], self.processors["reranker"])


class VisionModel:
    """Vision model wrapper for fire damage analysis.

    Wraps a vLLM engine (Qwen/Qwen3-VL-4B-Thinking per the module notes), the
    matching processor used for chat templating, and the sampling parameters.
    The model is prompted to reason and then emit a single JSON object.

    Pipeline: Image -> Thinking Model (reasoning + JSON) -> Output
    """

    # System prompt for FDAM fire damage assessment
    VISION_SYSTEM_PROMPT = """You are an expert industrial hygienist analyzing fire damage images for the FDAM (Fire Damage Assessment Methodology) framework.

## Your Task
Analyze the provided image and return a structured JSON response with fire damage assessment.

## Zone Classification Criteria
- **Burn Zone**: Direct fire involvement. Look for structural char, complete combustion, exposed/damaged structural elements.
- **Near-Field**: Adjacent to burn zone with heavy smoke/heat exposure. Look for heavy soot deposits, heat damage (warping, discoloration), strong visible contamination.
- **Far-Field**: Smoke migration without direct heat exposure. Look for light to moderate deposits, discoloration, no structural damage.

## Condition Assessment Criteria
- **Background**: No visible contamination; surfaces appear normal/clean.
- **Light**: Faint discoloration; minimal visible deposits; would show faint marks on white wipe test.
- **Moderate**: Visible film or deposits; clear contamination; surface color noticeably altered.
- **Heavy**: Thick deposits; surface texture obscured; heavy coating visible.
- **Structural Damage**: Physical damage requiring repair before cleaning (charring, warping, holes, collapse).

## Material Categories
- **Non-porous**: steel, concrete, glass, metal, CMU (concrete masonry unit)
- **Semi-porous**: painted drywall, sealed wood
- **Porous**: unpainted drywall, carpet, insulation, acoustic tile, upholstery
- **HVAC**: rigid ductwork, flexible ductwork

## Combustion Particle Visual Indicators
- **Soot**: Black/dark gray coating with oily/sticky appearance; fine uniform texture
- **Char**: Black angular fragments; visible wood grain or fibrous structure
- **Ash**: Gray/white powdery residue; crystalline appearance"""

    # JSON output format prompt
    JSON_FORMAT_PROMPT = """Analyze this fire damage image and return a JSON response with this exact structure:

{
    "zone": {
        "classification": "burn" | "near-field" | "far-field",
        "confidence": 0.0-1.0,
        "reasoning": "explanation"
    },
    "condition": {
        "level": "background" | "light" | "moderate" | "heavy" | "structural-damage",
        "confidence": 0.0-1.0,
        "reasoning": "explanation"
    },
    "materials": [
        {
            "type": "material type",
            "category": "non-porous" | "semi-porous" | "porous" | "hvac",
            "confidence": 0.0-1.0,
            "location_description": "where in image",
            "bounding_box": {"x": 0.0-1.0, "y": 0.0-1.0, "width": 0.0-1.0, "height": 0.0-1.0}
        }
    ],
    "combustion_indicators": {
        "soot_visible": true/false,
        "soot_pattern": "description or null",
        "char_visible": true/false,
        "char_description": "description or null",
        "ash_visible": true/false,
        "ash_description": "description or null"
    },
    "structural_concerns": ["list of structural issues if any"],
    "access_issues": ["list of access problems if any"],
    "recommended_sampling_locations": [
        {
            "description": "where to sample",
            "sample_type": "tape_lift" | "surface_wipe" | "air_sample",
            "priority": "high" | "medium" | "low"
        }
    ],
    "flags_for_review": ["any items requiring human review"]
}

IMPORTANT: Return ONLY valid JSON, no additional text."""

    # Marker that closes the reasoning section emitted by thinking models.
    _THINK_END = "</think>"

    def __init__(self, model, processor, sampling_params):
        """Store the vLLM engine, chat-template processor and sampling params."""
        self.model = model
        self.processor = processor
        self.sampling_params = sampling_params

    def analyze_image(self, image: "Image.Image", context: str = "") -> dict[str, Any]:
        """Analyze an image with the vision model via vLLM.

        Any exception raised while prompting, generating or parsing is logged
        and converted into the fallback response, so this method does not
        raise for inference failures.

        Args:
            image: PIL Image to analyze
            context: Optional context string (room info, etc.)

        Returns:
            Structured dict with zone, condition, materials, etc. When the
            analysis failed, the dict carries ``"_fallback_used": True``.
        """
        start_time = time.time()
        logger.debug(f"Starting FP8 vision analysis (context: {len(context)} chars)")

        try:
            # Build messages in Qwen3-VL format
            messages = self._build_messages(image, context)

            # Apply chat template to format prompt correctly
            prompt = self.processor.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )

            # Generate response using vLLM multimodal API
            # Per vLLM docs: pass PIL image directly in multi_modal_data dict
            outputs = self.model.generate(
                prompts=[{
                    "prompt": prompt,
                    "multi_modal_data": {"image": image},  # Single PIL image
                }],
                sampling_params=self.sampling_params,
            )

            response_text = outputs[0].outputs[0].text

            # Parse JSON from response
            result = self._parse_json_response(response_text)

        except Exception as e:
            # logger.exception keeps the traceback for diagnosing the failure.
            logger.exception(f"Vision analysis failed: {e}")
            return self._get_fallback_response(str(e))

        # Summary logging is deliberately outside the try block: an oddly
        # shaped (but parsed) result must not be replaced by the fallback
        # just because it could not be formatted for the log line.
        self._log_summary(result, time.time() - start_time)
        return result

    @staticmethod
    def _section(result: dict[str, Any], key: str) -> dict[str, Any]:
        """Return ``result[key]`` when it is a dict, otherwise an empty dict."""
        value = result.get(key)
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _as_float(value: Any) -> float:
        """Coerce a confidence value to float for logging; 0.0 when not numeric."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0.0

    def _log_summary(self, result: dict[str, Any], elapsed: float) -> None:
        """Log a one-line summary of a parsed result without ever raising."""
        zone_info = self._section(result, "zone")
        condition_info = self._section(result, "condition")
        zone = zone_info.get("classification", "unknown")
        zone_conf = self._as_float(zone_info.get("confidence", 0))
        condition = condition_info.get("level", "unknown")
        condition_conf = self._as_float(condition_info.get("confidence", 0))
        materials = result.get("materials")
        # The model may emit null or a non-list here; count only real lists.
        num_materials = len(materials) if isinstance(materials, (list, tuple)) else 0
        logger.info(f"Vision analysis complete in {elapsed:.2f}s: "
                    f"zone={zone} ({zone_conf:.2f}), condition={condition} ({condition_conf:.2f}), "
                    f"materials={num_materials}")

    def _build_messages(self, image: "Image.Image", context: str) -> list[dict]:
        """Build messages in Qwen3-VL format for chat template.

        Produces a system message carrying VISION_SYSTEM_PROMPT and a user
        message whose content is [image part, text part]. When ``context`` is
        non-empty it is prepended to the JSON format prompt.
        """
        # Build user text content
        user_text = self.JSON_FORMAT_PROMPT
        if context:
            user_text = f"Context: {context}\n\n{user_text}"

        messages = [
            {"role": "system", "content": self.VISION_SYSTEM_PROMPT},
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": user_text},
                ],
            },
        ]
        return messages

    def _parse_json_response(self, response: str) -> dict[str, Any]:
        """Extract the JSON object from the model's response text.

        Only the text after the last ``</think>`` marker is considered (the
        whole response when the marker is absent), so braces inside the
        reasoning section cannot corrupt the extraction. The object is decoded
        starting at the first ``{`` and any trailing text after the object is
        ignored.

        Args:
            response: Raw text generated by the model.

        Returns:
            The decoded JSON object, or the fallback response when no object
            is present or it cannot be decoded.
        """
        # rpartition yields the full string as the last element when the
        # marker is missing, so non-thinking output is handled unchanged.
        answer = response.rpartition(self._THINK_END)[2]

        start = answer.find("{")
        if start == -1:
            logger.warning("No JSON found in response")
            return self._get_fallback_response("No JSON in response")

        try:
            # raw_decode parses exactly one JSON value and tolerates trailing
            # text, unlike a greedy first-"{"-to-last-"}" regex span.
            parsed, _ = json.JSONDecoder().raw_decode(answer, start)
        except json.JSONDecodeError as e:
            logger.warning(f"Failed to parse JSON: {e}")
            return self._get_fallback_response(f"JSON parse error: {e}")
        return parsed

    def _get_fallback_response(self, reason: str) -> dict[str, Any]:
        """Return the low-confidence placeholder result used when analysis fails.

        The placeholder is flagged with ``"_fallback_used": True`` and records
        ``reason`` in the reasoning fields and in ``flags_for_review``.
        """
        return {
            "zone": {
                "classification": "far-field",
                "confidence": 0.3,
                "reasoning": f"Fallback due to: {reason}",
            },
            "condition": {
                "level": "light",
                "confidence": 0.3,
                "reasoning": f"Fallback due to: {reason}",
            },
            "materials": [
                {
                    "type": "general-surface",
                    "category": "semi-porous",
                    "confidence": 0.3,
                    "location_description": "Unable to determine",
                    "bounding_box": {"x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0},
                }
            ],
            "combustion_indicators": {
                "soot_visible": False,
                "soot_pattern": None,
                "char_visible": False,
                "char_description": None,
                "ash_visible": False,
                "ash_description": None,
            },
            "structural_concerns": [],
            "access_issues": [],
            "recommended_sampling_locations": [],
            "flags_for_review": [f"Analysis failed: {reason}"],
            "_fallback_used": True,
        }


class RealEmbeddingModel:
    """Wrapper for real embedding model inference.

    Delegates to the wrapped embedder's ``process()`` API (Qwen3VLEmbedder from
    QwenLM/Qwen3-VL-Embedding per the module notes), requesting normalized
    embeddings. On failure the methods log the error and return zero vectors
    instead of raising.

    Model: Qwen/Qwen3-VL-Embedding-2B (2048-dim output)
    """

    # Width of the zero-vector fallback when the model config cannot be read
    # (2048-dim per Qwen3-VL-Embedding-2B).
    _DEFAULT_DIM = 2048

    def __init__(self, model, processor):
        """Initialize with Qwen3VLEmbedder instance.

        Args:
            model: Qwen3VLEmbedder instance (official loader)
            processor: Processor (stored for compatibility, but model has its own)
        """
        self.model = model
        self.processor = processor

    def _fallback_dim(self) -> int:
        """Return the zero-vector width for fallbacks; never raises.

        Reads ``self.model.model.config.hidden_size`` when that attribute chain
        exists and holds a positive int, otherwise uses ``_DEFAULT_DIM``. This
        runs inside ``except`` handlers, so it must not raise itself.
        """
        config = getattr(getattr(self.model, "model", None), "config", None)
        size = getattr(config, "hidden_size", None)
        if isinstance(size, int) and not isinstance(size, bool) and size > 0:
            return size
        return self._DEFAULT_DIM

    def embed(self, text: str) -> list[float]:
        """Generate an embedding for a single text.

        Args:
            text: Input text to embed

        Returns:
            List of floats representing the embedding, or a zero vector of
            the fallback width when embedding fails.
        """
        try:
            # Use official process() API - expects list of dicts
            embeddings = self.model.process([{"text": text}], normalize=True)

            # First (only) row of the batch result
            return embeddings[0].cpu().tolist()

        except Exception as e:
            logger.error(f"Embedding generation failed: {e}")
            return [0.0] * self._fallback_dim()

    def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for a batch of texts in one ``process()`` call.

        Args:
            texts: Input texts to embed; an empty list yields an empty result
                without invoking the model.

        Returns:
            One embedding per input text, or one zero vector per input text
            when embedding fails.
        """
        if not texts:
            # Nothing to embed; avoid pushing an empty batch through the model.
            return []

        try:
            inputs = [{"text": text} for text in texts]
            embeddings = self.model.process(inputs, normalize=True)
            return [emb.cpu().tolist() for emb in embeddings]
        except Exception as e:
            logger.error(f"Batch embedding generation failed: {e}")
            width = self._fallback_dim()
            return [[0.0] * width for _ in texts]


class RealRerankerModel:
    """Wrapper for real reranker model inference.

    Delegates scoring to the wrapped reranker's ``process()`` API
    (Qwen3VLReranker from QwenLM/Qwen3-VL-Embedding per the module notes).
    On failure ``rerank`` logs the error and returns all-zero scores instead
    of raising.

    Model: Qwen/Qwen3-VL-Reranker-2B
    """

    def __init__(self, model, processor):
        """Initialize with Qwen3VLReranker instance.

        Args:
            model: Qwen3VLReranker instance (official loader)
            processor: Processor (stored for compatibility, but model has its own)
        """
        self.model = model
        self.processor = processor

    @staticmethod
    def _to_float_list(scores) -> list[float]:
        """Convert the model's score container into a plain list of floats.

        Accepts anything exposing ``tolist()`` (e.g. a tensor or array) as well
        as ordinary iterables of numbers.
        """
        if hasattr(scores, "tolist"):
            scores = scores.tolist()
        return [float(score) for score in scores]

    def rerank(self, query: str, documents: list[str]) -> list[float]:
        """Score each document for relevance to the query.

        Args:
            query: The search query
            documents: List of documents to rerank

        Returns:
            One score per document, in the same order as ``documents``.
            Higher scores indicate more relevant documents. Returns ``[]`` for
            no documents, and all zeros when scoring fails or the model
            returns a different number of scores than documents.
        """
        if not documents:
            return []

        try:
            # Use official process() API - expects dict with query and documents
            inputs = {
                "instruction": "Retrieve relevant documents for the query.",
                "query": {"text": query},
                "documents": [{"text": doc} for doc in documents],
            }
            scores = self._to_float_list(self.model.process(inputs))
            # Scores are matched to documents by position, so a length
            # mismatch would silently attach scores to the wrong documents.
            if len(scores) != len(documents):
                raise ValueError(
                    f"expected {len(documents)} scores, got {len(scores)}"
                )
            return scores

        except Exception as e:
            logger.error(f"Reranking failed: {e}")
            return [0.0] * len(documents)

    def rerank_with_indices(
        self, query: str, documents: list[str], top_k: "int | None" = None
    ) -> list[tuple[int, float]]:
        """Rerank and return sorted (index, score) tuples.

        Args:
            query: The search query
            documents: List of documents to rerank
            top_k: Optional limit on number of results; ``None`` returns all,
                and values below zero are treated as zero.

        Returns:
            List of (original_index, score) tuples, sorted by score descending.
            Documents with equal scores keep their original relative order.
        """
        scores = self.rerank(query, documents)

        # sorted() is stable, so ties preserve the original document order.
        ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

        if top_k is not None:
            # Clamp so a negative limit cannot slice from the end of the list.
            ranked = ranked[:max(top_k, 0)]

        return ranked