"""Real model loading for production (HuggingFace Spaces).

This module loads the production models:
- Vision: Qwen/Qwen3-VL-4B-Thinking (~10GB via vLLM, single GPU)
- Embedding: Qwen/Qwen3-VL-Embedding-2B (~4GB)
- Reranker: Qwen/Qwen3-VL-Reranker-2B (~4GB)
- Total: ~18GB on single L4 GPU (22GB)

Model Loading:
- Vision: vLLM with single GPU (no tensor parallelism needed)
- Embedding: Qwen3VLEmbedder (official scripts from QwenLM/Qwen3-VL-Embedding)
- Reranker: Qwen3VLReranker (official scripts from QwenLM/Qwen3-VL-Embedding)
"""

import os

# vLLM environment variables - MUST be set before importing vLLM
# Note: Using single GPU (TP=1) so NCCL workarounds are not needed

import json
import logging
import re
import time
import torch
from typing import Any
from PIL import Image

from config.inference import vision_config
from config.settings import settings

logger = logging.getLogger(__name__)


class RealModelStack:
    """Real model stack for production on HuggingFace Spaces.

    Loads all 3 models at initialization (~18GB total on single GPU):
    - Vision 4B via vLLM: ~10GB
    - Embedding 2B: ~4GB
    - Reranker 2B: ~4GB
    """

    def __init__(self):
        self.models: dict[str, Any] = {}
        self.processors: dict[str, Any] = {}
        self._loaded = False

    def _log_gpu_status(self):
        """Log current GPU memory status."""
        if torch.cuda.is_available():
            gpu_count = torch.cuda.device_count()
            logger.info(f"GPU memory status ({gpu_count} devices):")
            for i in range(gpu_count):
                total = torch.cuda.get_device_properties(i).total_memory / (1024**3)
                allocated = torch.cuda.memory_allocated(i) / (1024**3)
                cached = torch.cuda.memory_reserved(i) / (1024**3)
                free = total - allocated
                logger.info(f"  GPU {i}: {allocated:.1f}GB allocated, {cached:.1f}GB cached, {free:.1f}GB free / {total:.1f}GB total")

    def load_all(self) -> "RealModelStack":
        """Load all models.

        Loads FP8 vision model via vLLM and RAG models (Embedding + Reranker).
        """
        if self._loaded:
            logger.debug("Models already loaded, skipping")
            return self

        logger.info("Loading production models...")
        self._log_gpu_status()

        total_start = time.time()

        # Vision model via vLLM (~10GB for 4B model)
        logger.info(f"Loading vision model: {settings.vision_model}")
        vision_start = time.time()

        from vllm import LLM, SamplingParams
        from transformers import AutoProcessor

        self.models["vision"] = LLM(
            model=settings.vision_model,
            tensor_parallel_size=settings.vllm_tensor_parallel_size,  # 1 for single GPU
            trust_remote_code=True,
            gpu_memory_utilization=0.55,  # Leave ~10GB for embedding + reranker
            max_model_len=8192,  # Reduced to save KV cache memory
            enforce_eager=True,  # Skip torch.compile to reduce memory overhead
        )

        # Load processor for chat template formatting
        self.processors["vision"] = AutoProcessor.from_pretrained(
            settings.vision_model,
            trust_remote_code=True,
        )

        # Store sampling params for inference
        self.models["vision_sampling_params"] = SamplingParams(
            max_tokens=vision_config.max_tokens,
            temperature=vision_config.temperature,
            top_p=vision_config.top_p,
            top_k=vision_config.top_k,
            repetition_penalty=vision_config.repetition_penalty,
        )

        logger.info(f"Vision model loaded in {time.time() - vision_start:.2f}s")

        # Embedding model (~4GB in BF16) - Using official Qwen3VLEmbedder
        logger.info(f"Loading embedding model: {settings.embedding_model}")
        embed_start = time.time()
        from scripts.qwen3_vl import Qwen3VLEmbedder

        self.models["embedding"] = Qwen3VLEmbedder(
            model_name_or_path=settings.embedding_model,
            torch_dtype=torch.bfloat16,
        )
        self.processors["embedding"] = self.models["embedding"].processor
        logger.info(f"Embedding model loaded in {time.time() - embed_start:.2f}s")

        # Reranker model (~4GB in BF16) - Using official Qwen3VLReranker
        logger.info(f"Loading reranker model: {settings.reranker_model}")
        reranker_start = time.time()
        from scripts.qwen3_vl import Qwen3VLReranker

        self.models["reranker"] = Qwen3VLReranker(
            model_name_or_path=settings.reranker_model,
            torch_dtype=torch.bfloat16,
        )
        self.processors["reranker"] = self.models["reranker"].processor
        logger.info(f"Reranker model loaded in {time.time() - reranker_start:.2f}s")

        self._loaded = True
        total_time = time.time() - total_start
        logger.info(f"All models loaded in {total_time:.2f}s")
        self._log_gpu_status()
        return self

    def is_loaded(self) -> bool:
        """Check if models are loaded."""
        return self._loaded

    @property
    def vision(self) -> "VisionModel":
        """Return FP8 vision model wrapped for pipeline consumption."""
        if not self._loaded:
            raise RuntimeError("Models not loaded. Call load_all() first.")
        return VisionModel(
            model=self.models["vision"],
            processor=self.processors["vision"],
            sampling_params=self.models["vision_sampling_params"],
        )

    @property
    def embedding(self) -> "RealEmbeddingModel":
        """Return embedding model wrapped for pipeline consumption."""
        if not self._loaded:
            raise RuntimeError("Models not loaded. Call load_all() first.")
        return RealEmbeddingModel(self.models["embedding"], self.processors["embedding"])

    @property
    def reranker(self) -> "RealRerankerModel":
        """Return reranker model wrapped for pipeline consumption."""
        if not self._loaded:
            raise RuntimeError("Models not loaded. Call load_all() first.")
        return RealRerankerModel(self.models["reranker"], self.processors["reranker"])


class VisionModel:
    """Vision model for fire damage analysis.

    Uses Qwen/Qwen3-VL-4B-Thinking via vLLM for inference.
    Reasoning-enhanced model handles analysis with extended thinking
    and outputs structured JSON.

    Pipeline: Image -> Thinking Model (reasoning + JSON) -> Output
    """

    # System prompt for FDAM fire damage assessment
    VISION_SYSTEM_PROMPT = """You are an expert industrial hygienist analyzing fire damage images for the FDAM (Fire Damage Assessment Methodology) framework.

## Your Task
Analyze the provided image and return a structured JSON response with fire damage assessment.

## Zone Classification Criteria
- **Burn Zone**: Direct fire involvement. Look for structural char, complete combustion, exposed/damaged structural elements.
- **Near-Field**: Adjacent to burn zone with heavy smoke/heat exposure. Look for heavy soot deposits, heat damage (warping, discoloration), strong visible contamination.
- **Far-Field**: Smoke migration without direct heat exposure. Look for light to moderate deposits, discoloration, no structural damage.

## Condition Assessment Criteria
- **Background**: No visible contamination; surfaces appear normal/clean.
- **Light**: Faint discoloration; minimal visible deposits; would show faint marks on white wipe test.
- **Moderate**: Visible film or deposits; clear contamination; surface color noticeably altered.
- **Heavy**: Thick deposits; surface texture obscured; heavy coating visible.
- **Structural Damage**: Physical damage requiring repair before cleaning (charring, warping, holes, collapse).

## Material Categories
- **Non-porous**: steel, concrete, glass, metal, CMU (concrete masonry unit)
- **Semi-porous**: painted drywall, sealed wood
- **Porous**: unpainted drywall, carpet, insulation, acoustic tile, upholstery
- **HVAC**: rigid ductwork, flexible ductwork

## Combustion Particle Visual Indicators
- **Soot**: Black/dark gray coating with oily/sticky appearance; fine uniform texture
- **Char**: Black angular fragments; visible wood grain or fibrous structure
- **Ash**: Gray/white powdery residue; crystalline appearance"""

    # JSON output format prompt
    JSON_FORMAT_PROMPT = """Analyze this fire damage image and return a JSON response with this exact structure:

{
    "zone": {
        "classification": "burn" | "near-field" | "far-field",
        "confidence": 0.0-1.0,
        "reasoning": "explanation"
    },
    "condition": {
        "level": "background" | "light" | "moderate" | "heavy" | "structural-damage",
        "confidence": 0.0-1.0,
        "reasoning": "explanation"
    },
    "materials": [
        {
            "type": "material type",
            "category": "non-porous" | "semi-porous" | "porous" | "hvac",
            "confidence": 0.0-1.0,
            "location_description": "where in image",
            "bounding_box": {"x": 0.0-1.0, "y": 0.0-1.0, "width": 0.0-1.0, "height": 0.0-1.0}
        }
    ],
    "combustion_indicators": {
        "soot_visible": true/false,
        "soot_pattern": "description or null",
        "char_visible": true/false,
        "char_description": "description or null",
        "ash_visible": true/false,
        "ash_description": "description or null"
    },
    "structural_concerns": ["list of structural issues if any"],
    "access_issues": ["list of access problems if any"],
    "recommended_sampling_locations": [
        {
            "description": "where to sample",
            "sample_type": "tape_lift" | "surface_wipe" | "air_sample",
            "priority": "high" | "medium" | "low"
        }
    ],
    "flags_for_review": ["any items requiring human review"]
}

IMPORTANT: Return ONLY valid JSON, no additional text."""

    def __init__(self, model, processor, sampling_params):
        self.model = model
        self.processor = processor
        self.sampling_params = sampling_params

    def analyze_image(self, image: Image.Image, context: str = "") -> dict[str, Any]:
        """Analyze an image using the FP8 vision model via vLLM.

        Args:
            image: PIL Image to analyze
            context: Optional context string (room info, etc.)

        Returns:
            Structured dict with zone, condition, materials, etc.
        """
        start_time = time.time()
        logger.debug(f"Starting FP8 vision analysis (context: {len(context)} chars)")

        try:
            # Build messages in Qwen3-VL format
            messages = self._build_messages(image, context)

            # Apply chat template to format prompt correctly
            prompt = self.processor.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )

            # Generate response using vLLM multimodal API
            # Per vLLM docs: pass PIL image directly in multi_modal_data dict
            outputs = self.model.generate(
                prompts=[{
                    "prompt": prompt,
                    "multi_modal_data": {"image": image},  # Single PIL image
                }],
                sampling_params=self.sampling_params,
            )

            response_text = outputs[0].outputs[0].text

            # Parse JSON from response
            result = self._parse_json_response(response_text)

            # Log result summary
            total_time = time.time() - start_time
            zone = result.get("zone", {}).get("classification", "unknown")
            zone_conf = result.get("zone", {}).get("confidence", 0)
            condition = result.get("condition", {}).get("level", "unknown")
            condition_conf = result.get("condition", {}).get("confidence", 0)
            num_materials = len(result.get("materials", []))
            logger.info(f"Vision analysis complete in {total_time:.2f}s: "
                       f"zone={zone} ({zone_conf:.2f}), condition={condition} ({condition_conf:.2f}), "
                       f"materials={num_materials}")

            return result

        except Exception as e:
            logger.error(f"Vision analysis failed: {e}")
            return self._get_fallback_response(str(e))

    def _build_messages(self, image: Image.Image, context: str) -> list[dict]:
        """Build messages in Qwen3-VL format for chat template.

        Qwen3-VL expects:
        - System message with role="system"
        - User message with mixed content [{"type": "image", ...}, {"type": "text", ...}]
        """
        # Build user text content
        user_text = self.JSON_FORMAT_PROMPT
        if context:
            user_text = f"Context: {context}\n\n{user_text}"

        messages = [
            {"role": "system", "content": self.VISION_SYSTEM_PROMPT},
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": user_text},
                ],
            },
        ]
        return messages

    def _parse_json_response(self, response: str) -> dict[str, Any]:
        """Parse JSON response from model."""
        try:
            # Try to extract JSON from response
            json_match = re.search(r'\{[\s\S]*\}', response)
            if json_match:
                json_str = json_match.group()
                return json.loads(json_str)
            else:
                logger.warning("No JSON found in response")
                return self._get_fallback_response("No JSON in response")
        except json.JSONDecodeError as e:
            logger.warning(f"Failed to parse JSON: {e}")
            return self._get_fallback_response(f"JSON parse error: {e}")

    def _get_fallback_response(self, reason: str) -> dict[str, Any]:
        """Return fallback response when analysis fails."""
        return {
            "zone": {
                "classification": "far-field",
                "confidence": 0.3,
                "reasoning": f"Fallback due to: {reason}",
            },
            "condition": {
                "level": "light",
                "confidence": 0.3,
                "reasoning": f"Fallback due to: {reason}",
            },
            "materials": [
                {
                    "type": "general-surface",
                    "category": "semi-porous",
                    "confidence": 0.3,
                    "location_description": "Unable to determine",
                    "bounding_box": {"x": 0.0, "y": 0.0, "width": 1.0, "height": 1.0},
                }
            ],
            "combustion_indicators": {
                "soot_visible": False,
                "soot_pattern": None,
                "char_visible": False,
                "char_description": None,
                "ash_visible": False,
                "ash_description": None,
            },
            "structural_concerns": [],
            "access_issues": [],
            "recommended_sampling_locations": [],
            "flags_for_review": [f"Analysis failed: {reason}"],
            "_fallback_used": True,
        }


class RealEmbeddingModel:
    """Wrapper for real embedding model inference.

    Uses the official Qwen3VLEmbedder from QwenLM/Qwen3-VL-Embedding.
    The model handles last-token pooling and L2 normalization internally.

    Model: Qwen/Qwen3-VL-Embedding-2B (2048-dim output)
    """

    def __init__(self, model, processor):
        """Initialize with Qwen3VLEmbedder instance.

        Args:
            model: Qwen3VLEmbedder instance (official loader)
            processor: Processor (stored for compatibility, but model has its own)
        """
        self.model = model
        self.processor = processor

    def embed(self, text: str) -> list[float]:
        """Generate embedding for text using official Qwen3VLEmbedder.

        The official model.process() handles:
        - Tokenization and preprocessing
        - Last-token pooling
        - L2 normalization

        Args:
            text: Input text to embed

        Returns:
            List of floats representing the embedding (2048-dim for 2B model)
        """
        try:
            # Use official process() API - expects list of dicts
            inputs = [{"text": text}]
            embeddings = self.model.process(inputs, normalize=True)

            # embeddings is a tensor of shape (1, hidden_dim)
            return embeddings[0].cpu().tolist()

        except Exception as e:
            logger.error(f"Embedding generation failed: {e}")
            # Return zero vector as fallback (2048-dim per Qwen3-VL-Embedding-2B)
            hidden_size = getattr(self.model.model.config, "hidden_size", 2048)
            return [0.0] * hidden_size

    def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for a batch of texts.

        Uses official batch processing for efficiency.
        """
        try:
            inputs = [{"text": text} for text in texts]
            embeddings = self.model.process(inputs, normalize=True)
            return [emb.cpu().tolist() for emb in embeddings]
        except Exception as e:
            logger.error(f"Batch embedding generation failed: {e}")
            hidden_size = getattr(self.model.model.config, "hidden_size", 2048)
            return [[0.0] * hidden_size for _ in texts]


class RealRerankerModel:
    """Wrapper for real reranker model inference.

    Uses the official Qwen3VLReranker from QwenLM/Qwen3-VL-Embedding.
    The model handles yes/no scoring internally via:
    - Extracts "yes" and "no" token weights from the LM head
    - Creates a binary linear layer: weight = yes_weight - no_weight
    - Scores = sigmoid(linear(last_token_hidden_state))

    Model: Qwen/Qwen3-VL-Reranker-2B
    """

    def __init__(self, model, processor):
        """Initialize with Qwen3VLReranker instance.

        Args:
            model: Qwen3VLReranker instance (official loader)
            processor: Processor (stored for compatibility, but model has its own)
        """
        self.model = model
        self.processor = processor

    def rerank(self, query: str, documents: list[str]) -> list[float]:
        """Rerank documents by relevance to query using official Qwen3VLReranker.

        The official model.process() handles:
        - Proper message formatting
        - Tokenization
        - Yes/no scoring with LM head weights
        - Sigmoid normalization

        Args:
            query: The search query
            documents: List of documents to rerank

        Returns:
            List of relevance scores (0-1) for each document.
            Higher scores indicate more relevant documents.
        """
        if not documents:
            return []

        try:
            # Use official process() API - expects dict with query and documents
            inputs = {
                "instruction": "Retrieve relevant documents for the query.",
                "query": {"text": query},
                "documents": [{"text": doc} for doc in documents],
            }
            scores = self.model.process(inputs)
            return scores

        except Exception as e:
            logger.error(f"Reranking failed: {e}")
            return [0.0] * len(documents)

    def rerank_with_indices(
        self, query: str, documents: list[str], top_k: int = None
    ) -> list[tuple[int, float]]:
        """Rerank and return sorted (index, score) tuples.

        Args:
            query: The search query
            documents: List of documents to rerank
            top_k: Optional limit on number of results

        Returns:
            List of (original_index, score) tuples, sorted by score descending
        """
        scores = self.rerank(query, documents)

        # Create (index, score) pairs and sort by score descending
        indexed_scores = list(enumerate(scores))
        indexed_scores.sort(key=lambda x: x[1], reverse=True)

        if top_k is not None:
            indexed_scores = indexed_scores[:top_k]

        return indexed_scores