import os
import logging
import hashlib
import sys
import traceback
import tempfile

import cv2
import numpy as np
import torch
import torch.nn.functional as F
import gradio as gr
from PIL import Image, ImageFilter, ImageChops
from huggingface_hub import hf_hub_download
import spaces

# --- IMPORT YOUR CUSTOM MODULES ---
# Ensure these files are present in your file structure
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor
from plm_adapter_lora_with_image_input_only_text_positions import PLMLanguageAdapter

# ----------------- Configuration -----------------
logging.basicConfig(level=logging.INFO)

# Single Model Configuration
REPO_ID = "aadarsh99/ConvSeg-Stage2"
SAM2_CONFIG = "sam2_hiera_l.yaml"
BASE_CKPT_NAME = "sam2_hiera_large.pt"
FINE_TUNED_SAM = "fine_tuned_sam2_batched_18000.torch"
FINE_TUNED_PLM = "fine_tuned_sam2_batched_plm_18000.torch"
SQUARE_DIM = 1024

# Global Cache
MODEL_CACHE = {"sam": None, "plm": None}


# ----------------- Helper Functions -----------------
def download_if_needed(repo_id, filename):
    try:
        logging.info(f"Checking {filename} in {repo_id}...")
        return hf_hub_download(repo_id=repo_id, filename=filename)
    except Exception as e:
        raise FileNotFoundError(f"Could not find {filename} in {repo_id}. Error: {e}")


def stable_color(key: str):
    h = int(hashlib.sha256(str(key).encode("utf-8")).hexdigest(), 16)
    # Bright, distinct colors for overlays
    EDGE_COLORS_HEX = ["#3A86FF", "#FF006E", "#43AA8B", "#F3722C", "#8338EC", "#90BE6D"]
    colors = [tuple(int(c.lstrip("#")[i:i + 2], 16) for i in (0, 2, 4)) for c in EDGE_COLORS_HEX]
    return colors[h % len(colors)]


def make_overlay(rgb: np.ndarray, mask: np.ndarray, key: str = "mask") -> Image.Image:
    base = Image.fromarray(rgb.astype(np.uint8)).convert("RGBA")
    mask_bool = mask > 0
    color = stable_color(key)

    # Fill layer (Semi-transparent)
    fill_layer = Image.new("RGBA", base.size, color + (0,))
    fill_alpha = Image.fromarray((mask_bool.astype(np.uint8) * 140), "L")
    fill_layer.putalpha(fill_alpha)

    # Stroke/Edge layer
    m = Image.fromarray((mask_bool.astype(np.uint8) * 255), "L")
    edges = ImageChops.difference(m.filter(ImageFilter.MaxFilter(3)), m.filter(ImageFilter.MinFilter(3)))
    stroke_layer = Image.new("RGBA", base.size, color + (255,))
    stroke_layer.putalpha(edges)

    # Composite
    out = Image.alpha_composite(base, fill_layer)
    out = Image.alpha_composite(out, stroke_layer)
    return out.convert("RGB")


def ensure_models_loaded():
    global MODEL_CACHE
    if MODEL_CACHE["sam"] is not None:
        return
    logging.info(f"Loading models from {REPO_ID}...")

    # 1. Load SAM2 Base & Fine-tuned weights
    base_path = download_if_needed(REPO_ID, BASE_CKPT_NAME)
    model = build_sam2(SAM2_CONFIG, base_path, device="cpu")
    sam_ckpt_path = download_if_needed(REPO_ID, FINE_TUNED_SAM)
    sd = torch.load(sam_ckpt_path, map_location="cpu")
    model.load_state_dict(sd.get("model", sd), strict=True)
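    # Note: the fine-tuned checkpoint may store its weights under a "model" key or as
    # a bare state dict; sd.get("model", sd) accepts either layout.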

    # 2. Load PLM Adapter
    plm_path = download_if_needed(REPO_ID, FINE_TUNED_PLM)
    plm = PLMLanguageAdapter(
        model_name="Qwen/Qwen2.5-VL-3B-Instruct",
        transformer_dim=model.sam_mask_decoder.transformer_dim,
        n_sparse_tokens=0,
        use_dense_bias=True,
        use_lora=True,
        lora_r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        dtype=torch.bfloat16,
        device="cpu",
    )
    plm_sd = torch.load(plm_path, map_location="cpu")
    plm.load_state_dict(plm_sd["plm"], strict=True)
    plm.eval()

    MODEL_CACHE["sam"] = model
    MODEL_CACHE["plm"] = plm
    logging.info("Models loaded successfully.")


# ----------------- GPU Inference -----------------
@spaces.GPU(duration=120)
def run_prediction(image_pil, user_text, threshold=0.5):
    if image_pil is None or not user_text:
        return None, None, None

    # --- Prepend the required prefix ---
    full_prompt = f"Segment the {user_text.strip()}"
    # remove trailing punctuation for consistency
    if full_prompt[-1] in {".", "!", "?"}:
        full_prompt = full_prompt[:-1]
    logging.info(f"Processing prompt: {full_prompt}")

    ensure_models_loaded()
    sam_model = MODEL_CACHE["sam"]
    plm_model = MODEL_CACHE["plm"]

    # Move to GPU
    sam_model.to("cuda")
    plm_model.to("cuda")

    try:
        with torch.inference_mode():
            predictor = SAM2ImagePredictor(sam_model)
            rgb_orig = np.array(image_pil.convert("RGB"))
            H, W = rgb_orig.shape[:2]

            # Smart Resizing & Padding
            scale = SQUARE_DIM / max(H, W)
            nw, nh = int(W * scale), int(H * scale)
            top, left = (SQUARE_DIM - nh) // 2, (SQUARE_DIM - nw) // 2
            rgb_sq = cv2.resize(rgb_orig, (nw, nh), interpolation=cv2.INTER_LINEAR)
            rgb_sq = cv2.copyMakeBorder(rgb_sq, top, SQUARE_DIM - nh - top, left, SQUARE_DIM - nw - left,
                                        cv2.BORDER_CONSTANT, value=0)
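            # Worked example (illustrative): a 2048x1024 input (W=2048, H=1024) gives
            # scale = 0.5, so (nw, nh) = (1024, 512); left = 0, top = 256, and the
            # border call adds 256px black bars above and below (256 + 512 + 256 = 1024).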

            # Image Encoder
            predictor.set_image(rgb_sq)
            image_emb = predictor._features["image_embed"][-1].unsqueeze(0)
            hi = [lvl[-1].unsqueeze(0) for lvl in predictor._features["high_res_feats"]]

            # PLM Adapter (Text + Image processing)
            with tempfile.NamedTemporaryFile(suffix=".jpg") as tmp:
                image_pil.save(tmp.name)
                # Qwen/PLM processes the text prompt here
                sp, dp = plm_model([full_prompt], image_emb.shape[2], image_emb.shape[3], [tmp.name])

            # SAM2 Mask Decoder
            dec = sam_model.sam_mask_decoder
            dev, dtype = next(dec.parameters()).device, next(dec.parameters()).dtype
            low, scores, _, _ = dec(
                image_embeddings=image_emb.to(dev, dtype),
                image_pe=sam_model.sam_prompt_encoder.get_dense_pe().to(dev, dtype),
                sparse_prompt_embeddings=sp.to(dev, dtype),
                dense_prompt_embeddings=dp.to(dev, dtype),
                multimask_output=True,
                repeat_image=False,
                high_res_features=[h.to(dev, dtype) for h in hi],
            )

            # Post-processing
            logits = predictor._transforms.postprocess_masks(low, (SQUARE_DIM, SQUARE_DIM))
            best_idx = scores.argmax().item()
            logit_crop = logits[0, best_idx, top:top + nh, left:left + nw].unsqueeze(0).unsqueeze(0)
            logit_full = F.interpolate(logit_crop, size=(H, W), mode="bilinear", align_corners=False)[0, 0]
            prob = torch.sigmoid(logit_full).float().cpu().numpy()

            # Visuals
            heatmap_cv = cv2.applyColorMap((prob * 255).astype(np.uint8), cv2.COLORMAP_JET)
            heatmap_rgb = cv2.cvtColor(heatmap_cv, cv2.COLOR_BGR2RGB)
            mask = (prob > threshold).astype(np.uint8) * 255
            # Use full_prompt for key to ensure consistent colors
            overlay = make_overlay(rgb_orig, mask, key=full_prompt)

            return overlay, Image.fromarray(heatmap_rgb), prob
    except Exception:
        traceback.print_exc()
        raise gr.Error("Inference failed. Please check logs.")
    finally:
        # Cleanup memory
        sam_model.to("cpu")
        plm_model.to("cpu")
        torch.cuda.empty_cache()


def update_threshold_ui(image_pil, user_text, threshold, cached_prob):
    """Real-time update using CPU only (no GPU quota usage)."""
    if image_pil is None or cached_prob is None:
        return None
    rgb_orig = np.array(image_pil.convert("RGB"))
    mask = (cached_prob > threshold).astype(np.uint8) * 255
    # Reconstruct full prompt to maintain consistent color hashing
    full_prompt = f"Segment the {user_text.strip()}" if user_text else "mask"
    return make_overlay(rgb_orig, mask, key=full_prompt)
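
# Note on the split above: run_prediction is the only GPU path (under ZeroGPU, the
# @spaces.GPU(duration=120) decorator requests up to 120 seconds of GPU time per
# call), while update_threshold_ui only re-thresholds the cached probability map
# on CPU, so dragging the slider never spends GPU quota.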

# ----------------- UI Styling & Layout -----------------
custom_css = """
h1 { text-align: center; display: block; }
.subtitle { text-align: center; font-size: 1.1em; margin-bottom: 20px; }
.prefix-container {
    display: flex;
    align-items: center;
    justify-content: center;
    height: 100%;
    /* Match Gradio Textbox font style */
    font-family: var(--font-sans);
    font-size: var(--input-text-size);
    font-weight: 400;
    color: var(--body-text-color);
}
/* Force the HTML container to match height of neighbor */
.prefix-box {
    display: flex;
    flex-direction: column;
    justify-content: center;
    height: 100% !important;
    min-height: 42px; /* Standard Gradio input height fallback */
}
"""

theme = gr.themes.Soft(
    primary_hue="blue",
    neutral_hue="slate",
).set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
)


def example_handler(text):
    """Callback to strip the prefix when an example is clicked."""
    prefix = "Segment the "
    if text and text.startswith(prefix):
        return text[len(prefix):]
    return text


with gr.Blocks(theme=theme, css=custom_css, title="ConvSeg-Net Demo") as demo:
    prob_state = gr.State()

    # Header
    gr.Markdown("# 🧩 Conversational Image Segmentation")
    gr.Markdown(
        "<div class='subtitle'>Grounding abstract concepts and physics-based reasoning into pixel-accurate masks.<br>"
        "Powered by <b>SAM2 + Qwen2.5-VL</b></div>"
    )

    with gr.Row():
        # --- Left Column: Inputs ---
        with gr.Column(scale=1):
            input_image = gr.Image(type="pil", label="Input Image", height=400)

            # Custom prompt input layout
            gr.Markdown("**Conversational Prompt**")
            with gr.Group():
                with gr.Row(equal_height=True):
                    # Fixed Prefix
                    gr.HTML(
                        "<div class='prefix-container'>Segment the</div>",
                        elem_classes="prefix-box",
                        min_width=100,
                        max_width=100,
                    )
                    # User Input
                    text_prompt = gr.Textbox(
                        show_label=False,
                        container=False,
                        placeholder="object that is prone to rolling...",
                        lines=1,
                        scale=5,
                    )

            with gr.Accordion("⚙️ Advanced Options", open=False):
                threshold_slider = gr.Slider(
                    0.0, 1.0, value=0.5, step=0.01,
                    label="Mask Confidence Threshold",
                    info="Adjust after running to refine the mask edges.",
                )

            run_btn = gr.Button("🚀 Run Segmentation", variant="primary", size="lg")

        # --- Right Column: Outputs ---
        with gr.Column(scale=1):
            out_overlay = gr.Image(label="Segmentation Result", type="pil")
            out_heatmap = gr.Image(label="Confidence Heatmap", type="pil")

    # --- Examples Section ---
    # Hidden textbox to capture the full prompt from the example gallery
    hidden_example_text = gr.Textbox(visible=False)

    gr.Markdown("### 📝 Try Examples")
    gr.Examples(
        examples=[
            ["./examples/elephants.png", "Segment the elephant acting as the vanguard of the herd."],
            ["./examples/luggage.png", "Segment the luggage resting precariously."],
            ["./examples/veggies.png", "Segment the produce harvested from underground."],
        ],
        inputs=[input_image, hidden_example_text],  # Output full text to hidden box
    )

    # When hidden box updates (from click), strip the prefix and update the visible box
    hidden_example_text.change(
        fn=example_handler,
        inputs=hidden_example_text,
        outputs=text_prompt,
    )

    # --- Event Handling ---
    # 1. Run Inference (GPU)
    run_btn.click(
        fn=run_prediction,
        inputs=[input_image, text_prompt, threshold_slider],
        outputs=[out_overlay, out_heatmap, prob_state],
    )

    # 2. Update Threshold (CPU - Instant)
    threshold_slider.change(
        fn=update_threshold_ui,
        inputs=[input_image, text_prompt, threshold_slider, prob_state],
        outputs=[out_overlay],
    )


if __name__ == "__main__":
    demo.queue().launch()