"""
caption-space — vision-language descriptions of inspiration images.

What this does:
  Takes one or more inspiration images (base64, or a JSON array of base64),
  returns a natural-language style description (interior design vocabulary).

Why LLaVA-1.5-7B:
  - Open source, no licensing issues
  - Strong style-vocabulary captioning (trained on web image-text pairs that
    include design/decor blogs)
  - Standard transformers pipeline (well-tested, stable API)
  - 7B params, ~14 GB at fp16 — fits ZeroGPU A10G comfortably

Use case:
  Frontend "Describe this mood board →" button →
    POST /describe-inspiration (FastAPI) →
      this Space's /describe →
        returns prose for user to edit before submitting the redesign job.

Multi-image inputs:
  LLaVA-1.5 only takes one image per call. If the user passes a JSON array of
  N images, we caption each separately and concatenate with " Also: ".
  The user can manually edit the result to merge — fine for POC. Future:
  swap to LLaVA-OneVision or Qwen2-VL which handle multi-image natively.

API contract:
  api_name="/describe"
  Inputs (positional):
    1. images_b64:  str   — single base64 PNG/JPEG OR JSON-array string
    2. instruction: str   — optional override, "" = use default
  Output: {"description": str}
"""

import base64
import io
import json
import os
import traceback

import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration


MODEL_ID = "llava-hf/llava-1.5-7b-hf"

DEFAULT_INSTRUCTION = (
    "You are an interior design expert. Describe the design style of this room "
    "in 2-3 sentences. Mention: style/era (e.g. Japandi, Parisian classical, "
    "industrial loft, mid-century modern), color palette, materials, key "
    "furniture pieces, patterns/textures, and overall vibe. Be specific with "
    "design vocabulary."
)

HF_TOKEN = os.environ.get("HF_TOKEN", "")


# ---------------------------------------------------------------------------
# Model load (CPU at startup, moves to GPU inside @spaces.GPU function)
# ---------------------------------------------------------------------------

print(f"[caption-space] loading {MODEL_ID}...")
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    token=HF_TOKEN if HF_TOKEN else None,
)
model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    token=HF_TOKEN if HF_TOKEN else None,
)
print("[caption-space] model loaded on CPU (will move to GPU on first call).")


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _b64_to_pil(b64: str) -> Image.Image:
    return Image.open(io.BytesIO(base64.b64decode(b64))).convert("RGB")


def _parse_images(images_b64: str) -> list[Image.Image]:
    """Accept either a single base64 string or a JSON array of base64 strings."""
    try:
        parsed = json.loads(images_b64)
        if isinstance(parsed, list):
            return [_b64_to_pil(s) for s in parsed]
    except (json.JSONDecodeError, TypeError, ValueError):
        pass
    return [_b64_to_pil(images_b64)]


# ---------------------------------------------------------------------------
# Inference
# ---------------------------------------------------------------------------

@spaces.GPU(duration=60)
def _generate_one(image: Image.Image, instruction: str) -> str:
    model.to("cuda")

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": instruction},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    inputs = processor(
        images=image,
        text=prompt,
        return_tensors="pt",
    ).to("cuda", torch.float16)

    out = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=False,
    )
    # Decode only the newly generated tokens (skip the prompt)
    prompt_len = inputs.input_ids.shape[1]
    text = processor.decode(out[0][prompt_len:], skip_special_tokens=True)

    return text.strip()


# ---------------------------------------------------------------------------
# Main endpoint
# ---------------------------------------------------------------------------

def describe(images_b64: str, instruction: str = "") -> dict:
    try:
        instr = instruction.strip() if instruction else DEFAULT_INSTRUCTION
        images = _parse_images(images_b64)
        print(f"[describe] {len(images)} image(s), instr={instr[:80]!r}")

        descriptions = []
        for i, img in enumerate(images):
            desc = _generate_one(img, instr)
            print(f"[describe] image {i}: {desc[:100]!r}")
            descriptions.append(desc)

        final = descriptions[0] if len(descriptions) == 1 else " Also: ".join(descriptions)
        return {"description": final}

    except Exception as e:
        traceback.print_exc()
        raise ValueError(f"describe failed: {type(e).__name__}: {e}") from e


# ---------------------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------------------

with gr.Blocks(title="Caption Space (LLaVA-1.5-7B)") as demo:
    gr.Markdown(
        "## LLaVA-1.5-7B — Inspiration Image Captioner\n\n"
        "Describes interior-design style, colors, materials, and key furniture in "
        "natural language. Used by the main pipeline to let users seed a style "
        "prompt from a mood board."
    )

    images_in = gr.Textbox(
        label="images_b64 (single base64 OR JSON array of base64 strings)",
        lines=4,
    )
    instr_in = gr.Textbox(
        label="instruction (optional override)",
        value=DEFAULT_INSTRUCTION,
        lines=3,
    )
    gr.Button("Describe").click(
        describe,
        inputs=[images_in, instr_in],
        outputs=gr.JSON(label="result"),
        api_name="describe",
    )

demo.launch(show_error=True)