"""SAM3 segmenter: text-prompted instance segmentation via facebook/sam3."""

import logging
from typing import Optional, Sequence

import numpy as np
import torch
from PIL import Image
from transformers import Sam3Model, Sam3Processor

from .base import Segmenter, SegmentationResult

logger = logging.getLogger(__name__)


class SAM3Segmenter(Segmenter):
    """
    SAM3 (Segment Anything Model 3) segmenter.

    Performs text-prompted instance segmentation on images using the
    facebook/sam3 model from HuggingFace.
    """

    name = "sam3"
    supports_batch = True
    max_batch_size = 8

    def __init__(
        self,
        model_id: str = "facebook/sam3",
        device: Optional[str] = None,
        threshold: float = 0.5,
        mask_threshold: float = 0.5,
    ):
        """
        Initialize SAM3 segmenter.

        Args:
            model_id: HuggingFace model ID
            device: Device to run on (cuda/cpu), auto-detected if None
            threshold: Confidence threshold for filtering instances
            mask_threshold: Threshold for binarizing masks

        Raises:
            Exception: re-raised from HuggingFace if model/processor loading fails.
        """
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.threshold = threshold
        self.mask_threshold = mask_threshold

        logger.info("Loading SAM3 model %s on device %s", model_id, self.device)
        try:
            self.model = Sam3Model.from_pretrained(model_id).to(self.device)
            self.processor = Sam3Processor.from_pretrained(model_id)
            self.model.eval()
        except Exception:
            logger.exception("Failed to load SAM3 model")
            raise
        logger.info("SAM3 model loaded successfully")

    @staticmethod
    def _to_pil(frame: np.ndarray) -> Image.Image:
        """Convert an RGB numpy frame to a PIL image.

        uint8 frames are wrapped as-is; other dtypes are assumed to be in
        [0, 1] and scaled to uint8 (TODO confirm range against callers).
        """
        if frame.dtype == np.uint8:
            return Image.fromarray(frame)
        return Image.fromarray((frame * 255).astype(np.uint8))

    def _parse_single_result(self, results, frame_shape) -> SegmentationResult:
        """
        Convert one post-processed result dict into a SegmentationResult.

        Args:
            results: dict with optional "masks" (list of HxW tensors),
                "scores" and "boxes" tensors, as produced by
                ``post_process_instance_segmentation``.
            frame_shape: (H, W, ...) of the source frame, used to size the
                empty mask array when nothing was detected.

        Returns:
            SegmentationResult with masks stacked to (N, H, W); scores/boxes
            are None when absent from ``results``.
        """
        masks = results.get("masks", [])
        scores = results.get("scores", None)
        boxes = results.get("boxes", None)

        if len(masks) > 0:
            # Stack masks: list of (H, W) -> (N, H, W)
            masks_array = np.stack([m.cpu().numpy() for m in masks])
        else:
            # No objects detected: empty (0, H, W) boolean array
            masks_array = np.zeros((0, frame_shape[0], frame_shape[1]), dtype=bool)

        scores_array = scores.cpu().numpy() if scores is not None else None
        boxes_array = boxes.cpu().numpy() if boxes is not None else None

        return SegmentationResult(
            masks=masks_array,
            scores=scores_array,
            boxes=boxes_array,
        )

    def _expand_inputs_if_needed(self, inputs) -> None:
        """
        Expand vision inputs in place so the image batch matches the text batch.

        Handles:
          1. 1 image, N texts      -> expand 1 -> N
          2. N images, N*M texts   -> expand N -> N*M

        When expansion applies, vision features are computed once for the
        original images and repeated with ``repeat_interleave``, replacing
        ``pixel_values`` with ``vision_embeds`` (the two are mutually
        exclusive model inputs). Per-image metadata tensors are expanded to
        match. No-op when no expansion is needed.
        """
        pixel_values = inputs.get("pixel_values")
        input_ids = inputs.get("input_ids")
        if pixel_values is None or input_ids is None:
            return

        img_batch = pixel_values.shape[0]
        text_batch = input_ids.shape[0]

        # Decide the expansion factor; bail out early when batches already match
        # or the text batch is not an exact multiple of the image batch.
        if img_batch == 1 and text_batch > 1:
            expansion_factor = text_batch
        elif img_batch > 1 and text_batch > img_batch and text_batch % img_batch == 0:
            expansion_factor = text_batch // img_batch
        else:
            return

        logger.debug(
            "Expanding SAM3 vision inputs from %d to %d (factor %d) using embeddings reuse.",
            img_batch,
            text_batch,
            expansion_factor,
        )

        # 1. Compute vision embeddings once for the original images.
        with torch.no_grad():
            vision_outputs = self.model.get_vision_features(pixel_values=pixel_values)

        # 2. Expand every batch-first tensor (or list/tuple of tensors) held by
        #    the model-output container along dim 0.
        for key in list(vision_outputs.keys()):
            value = getattr(vision_outputs, key, None)
            if value is None:
                # Fall back to item access; skip keys the container cannot serve.
                try:
                    value = vision_outputs[key]
                except (KeyError, IndexError, TypeError):
                    continue

            new_value = None
            if isinstance(value, torch.Tensor):
                # Only expand along the batch dimension (dim 0).
                if value.shape[0] == img_batch:
                    new_value = value.repeat_interleave(expansion_factor, dim=0)
            elif isinstance(value, (list, tuple)):
                expanded = []
                any_expanded = False
                for v in value:
                    if isinstance(v, torch.Tensor) and v.shape[0] == img_batch:
                        expanded.append(v.repeat_interleave(expansion_factor, dim=0))
                        any_expanded = True
                    else:
                        expanded.append(v)
                if any_expanded:
                    # Preserve the container type (list vs tuple).
                    new_value = type(value)(expanded)

            if new_value is not None:
                # ModelOutput containers may support item assignment, attribute
                # assignment, or both — write through every channel that works.
                try:
                    vision_outputs[key] = new_value
                except (KeyError, TypeError):
                    pass
                if hasattr(vision_outputs, key):
                    setattr(vision_outputs, key, new_value)

        # 3. Swap pixel_values for precomputed embeddings for the model call.
        inputs["vision_embeds"] = vision_outputs
        del inputs["pixel_values"]  # Mutually exclusive with vision_embeds

        # 4. Expand per-image metadata tensors to the new batch size.
        # NOTE(review): transformers SAM processors usually name the second key
        # "reshaped_input_sizes" — confirm "reshape_input_sizes" is correct here.
        for meta_key in ("original_sizes", "reshape_input_sizes"):
            if meta_key in inputs and inputs[meta_key].shape[0] == img_batch:
                inputs[meta_key] = inputs[meta_key].repeat_interleave(
                    expansion_factor, dim=0
                )

    def predict(
        self, frame: np.ndarray, text_prompts: Optional[list] = None
    ) -> SegmentationResult:
        """
        Run SAM3 segmentation on a frame.

        Args:
            frame: Input image (HxWx3 numpy array in RGB)
            text_prompts: List of text prompts for segmentation; defaults
                to ["object"] when empty or None.

        Returns:
            SegmentationResult with instance masks. On post-processing
            failure an empty result is returned (best-effort); inference
            RuntimeErrors are logged and re-raised.
        """
        pil_image = self._to_pil(frame)

        if not text_prompts:
            text_prompts = ["object"]

        inputs = self.processor(
            images=pil_image, text=text_prompts, return_tensors="pt"
        ).to(self.device)

        # Align vision batch with text batch if needed.
        self._expand_inputs_if_needed(inputs)

        try:
            if "pixel_values" in inputs:
                logger.debug(
                    "SAM3 Input pixel_values shape: %s", inputs["pixel_values"].shape
                )
            with torch.no_grad():
                outputs = self.model(**inputs)
        except RuntimeError:
            logger.exception(
                "RuntimeError during SAM3 inference; input keys: %s", list(inputs.keys())
            )
            if "pixel_values" in inputs:
                logger.error("Pixel values shape: %s", inputs["pixel_values"].shape)
            # Re-raise to let user know
            raise

        try:
            results = self.processor.post_process_instance_segmentation(
                outputs,
                threshold=self.threshold,
                mask_threshold=self.mask_threshold,
                target_sizes=inputs.get("original_sizes").tolist(),
            )[0]
            return self._parse_single_result(results, frame.shape)
        except Exception:
            # Best-effort: a post-processing failure yields an empty result
            # rather than crashing the caller.
            logger.exception("SAM3 post-processing failed")
            return SegmentationResult(
                masks=np.zeros((0, frame.shape[0], frame.shape[1]), dtype=bool),
                scores=None,
                boxes=None,
            )

    def predict_batch(
        self, frames: Sequence[np.ndarray], text_prompts: Optional[list] = None
    ) -> Sequence[SegmentationResult]:
        """
        Run SAM3 segmentation on a batch of frames.

        Each prompt is applied to every frame; prompts are flattened as
        [img1_p1, img1_p2, img2_p1, img2_p2, ...] and the vision batch is
        expanded accordingly.

        Args:
            frames: Sequence of HxWx3 RGB numpy arrays.
            text_prompts: Prompts applied to each frame; defaults to ["object"].

        Returns:
            One SegmentationResult per frame. On post-processing failure,
            empty results are returned for all frames (best-effort).
        """
        pil_images = [self._to_pil(f) for f in frames]

        prompts = list(text_prompts) if text_prompts else ["object"]
        # Replicate the prompt list once per frame.
        flattened_prompts = prompts * len(frames)

        inputs = self.processor(
            images=pil_images, text=flattened_prompts, return_tensors="pt"
        ).to(self.device)

        # Align vision batch with the flattened text batch.
        self._expand_inputs_if_needed(inputs)

        with torch.no_grad():
            outputs = self.model(**inputs)

        try:
            results_list = self.processor.post_process_instance_segmentation(
                outputs,
                threshold=self.threshold,
                mask_threshold=self.mask_threshold,
                target_sizes=inputs.get("original_sizes").tolist(),
            )
            return [
                self._parse_single_result(r, f.shape)
                for r, f in zip(results_list, frames)
            ]
        except Exception:
            logger.exception("SAM3 batch post-processing failed")
            return [
                SegmentationResult(
                    masks=np.zeros((0, f.shape[0], f.shape[1]), dtype=bool),
                    scores=None,
                    boxes=None,
                )
                for f in frames
            ]