"""
tricket — AI product-ad video studio (Gradio frontend, runs on a HF Space).

Tab 1  🎬 Ad Studio : one prompt -> ad script -> per-scene image + voiceover ->
                       Ken Burns slideshow with burned captions -> 9:16 MP4.
Tab 2  🖼 Single Image : plain FLUX.1-schnell text-to-image.

Heavy lifting runs on Modal (FLUX / Kokoro / optional LTX-Video / ffmpeg). The
script is written by a GLM model (default: zai-org/GLM-4.6, see SCRIPT_MODEL)
through the Hugging Face inference router.

Required Space secrets (Settings -> Variables and secrets):
    MODAL_TOKEN_ID
    MODAL_TOKEN_SECRET
    HF_TOKEN            (for the GLM script-writer via router.huggingface.co)
"""

import io
import json
import os
import re
import tempfile

import gradio as gr
import modal
from PIL import Image

# Name of the Modal app that every Modal class/function lookup in this file targets.
MODAL_APP = "tricket-flux"
# Default chat model used by the script writer.
SCRIPT_MODEL = "zai-org/GLM-4.6"  # via HF router; overridable with SCRIPT_MODEL env

# 9:16 generation size for FLUX (multiples of 16).
# NOTE: 768x1344 is exactly 4:7 (~0.571), i.e. slightly wider than a true
# 9:16 frame (0.5625).
GEN_W, GEN_H = 768, 1344

# Kokoro voices grouped by language.
# Outer keys are the language labels offered in the "Narration language"
# dropdown. Each entry carries the `lang_code` forwarded to the TTS call and a
# mapping of UI voice label -> voice id forwarded as `voice`.
LANGS = {
    "中文": {
        "lang_code": "z",
        "voices": {
            "晓晓 · 女声": "zf_xiaoxiao",
            "小贝 · 女声": "zf_xiaobei",
            "云健 · 男声": "zm_yunjian",
            "云希 · 男声": "zm_yunxi",
        },
    },
    "English": {
        "lang_code": "a",
        "voices": {
            "Heart · F": "af_heart",
            "Bella · F": "af_bella",
            "Michael · M": "am_michael",
            "Puck · M": "am_puck",
        },
    },
}

# ---------------------------------------------------------------------------
# Modal handles (resolved lazily so import never crashes the Space).
# ---------------------------------------------------------------------------
def _modal_handles():
    """Look up the Modal entry points used by the Ad Studio pipeline.

    Returns:
        A 4-tuple ``(model, tts, animate, assemble)``: instances of the
        ``Model``, ``TTS`` and ``Animate`` classes plus a handle to the
        ``assemble_video`` function, all looked up in the ``MODAL_APP`` app.
    """
    model, tts, animate = (
        modal.Cls.from_name(MODAL_APP, cls_name)()
        for cls_name in ("Model", "TTS", "Animate")
    )
    assemble = modal.Function.from_name(MODAL_APP, "assemble_video")
    return model, tts, animate, assemble


def _need_modal():
    """Abort with a user-facing error unless both Modal credentials are set.

    Raises:
        gr.Error: if ``MODAL_TOKEN_ID`` or ``MODAL_TOKEN_SECRET`` is missing
            or empty in the environment.
    """
    required = ("MODAL_TOKEN_ID", "MODAL_TOKEN_SECRET")
    if all(os.environ.get(name) for name in required):
        return
    raise gr.Error(
        "Modal credentials missing — add MODAL_TOKEN_ID and MODAL_TOKEN_SECRET "
        "to this Space's secrets."
    )


def _need_hf():
    """Abort with a user-facing error unless ``HF_TOKEN`` is set.

    Raises:
        gr.Error: if ``HF_TOKEN`` is missing or empty in the environment.
    """
    token = os.environ.get("HF_TOKEN")
    if token:
        return
    raise gr.Error(
        "HF_TOKEN missing — add it to this Space's secrets (used by the "
        "GLM script writer)."
    )


# ---------------------------------------------------------------------------
# Script generation (GLM via HF router)
# ---------------------------------------------------------------------------
def write_script(product, selling_points, language, num_scenes):
    """Ask the GLM script writer for a structured ad script.

    The model is reached through the Hugging Face router using the OpenAI
    client; the model id comes from the ``SCRIPT_MODEL`` environment variable
    and falls back to the module-level ``SCRIPT_MODEL`` constant.

    Args:
        product: Free-text product description, interpolated into the prompt.
        selling_points: Optional selling points; when empty the model is told
            to infer sensible ones.
        language: UI language label. ``"中文"`` selects Simplified Chinese,
            any other value selects English.
        num_scenes: Number of scenes requested; at most this many are returned.

    Returns:
        ``(title, scenes)``. ``scenes`` is a non-empty list of dicts that each
        contain string values for ``narration``, ``caption`` and
        ``image_prompt`` (extra keys from the model are kept). ``title`` falls
        back to ``product`` when the model gives none.

    Raises:
        gr.Error: if the request fails, the reply cannot be parsed, or it
            contains no usable scenes.
        KeyError: if ``HF_TOKEN`` is not set in the environment.
    """
    from openai import OpenAI

    client = OpenAI(
        base_url="https://router.huggingface.co/v1",
        api_key=os.environ["HF_TOKEN"],
    )
    lang_name = "Simplified Chinese" if language == "中文" else "English"
    system_prompt = (
        "You are an award-winning short-form video ad creative. "
        "You write punchy vertical (9:16) product ads."
    )
    user_prompt = f"""Create a {num_scenes}-scene vertical product ad.

Product: {product}
Key selling points: {selling_points or "(infer sensible ones)"}

Rules:
- Narration language: {lang_name}. Each scene narration is ONE short spoken sentence (max ~16 words), energetic.
- caption: a VERY short on-screen text overlay in {lang_name} (max ~6 words).
- image_prompt: a vivid ENGLISH text-to-image prompt for that scene, cinematic, vertical composition, no text in image.
- Scene 1 = hook / hero shot. Last scene = call to action.
- Return STRICT JSON only, no markdown, schema:
{{"title": str, "scenes": [{{"narration": str, "caption": str, "image_prompt": str}}]}}"""

    # Surface transport/API failures as a readable UI error instead of a raw
    # traceback, mirroring how the single-image tab reports failures.
    try:
        resp = client.chat.completions.create(
            model=os.environ.get("SCRIPT_MODEL", SCRIPT_MODEL),
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.8,
        )
        raw = resp.choices[0].message.content or ""
    except Exception as exc:  # noqa: BLE001
        raise gr.Error(f"Script generation failed: {exc}") from exc

    data = _parse_json(raw)

    # The model is asked for an object, but LLM output is not guaranteed to
    # follow the schema: tolerate a non-dict top level and malformed entries.
    raw_scenes = data.get("scenes") if isinstance(data, dict) else None
    if not isinstance(raw_scenes, list):
        raw_scenes = []

    scenes = []
    for item in raw_scenes:
        if len(scenes) >= num_scenes:
            break
        if not isinstance(item, dict):
            continue
        # Normalize the three fields downstream code relies on so a missing
        # or null value cannot crash image/TTS job submission later.
        scenes.append(
            {
                **item,
                "narration": str(item.get("narration") or ""),
                "caption": str(item.get("caption") or ""),
                "image_prompt": str(item.get("image_prompt") or product),
            }
        )
    if not scenes:
        raise gr.Error("Script generation returned no scenes; try again.")

    title = data.get("title") if isinstance(data, dict) else None
    if not isinstance(title, str) or not title.strip():
        title = product
    return title, scenes


def _parse_json(raw):
    """Extract a JSON value from a model reply.

    The reply is first stripped of a leading/trailing Markdown code fence and
    parsed as a whole. If that fails, the text is scanned for the first
    position where a complete JSON object can be decoded, which tolerates
    prose (or stray braces) before and after the object.

    Args:
        raw: The text returned by the model; ``None`` is treated as empty.

    Returns:
        The decoded JSON value (a dict when recovered by the fallback scan).

    Raises:
        gr.Error: if no JSON can be decoded from the text.
    """
    text = (raw or "").strip()
    text = re.sub(r"^```(?:json)?", "", text).strip()
    text = re.sub(r"```$", "", text).strip()
    try:
        return json.loads(text)
    except ValueError:  # json.JSONDecodeError subclasses ValueError
        pass

    # Fallback: try every "{" as a potential object start. raw_decode stops at
    # the end of the first complete value, so trailing chatter is ignored and
    # an undecodable brace in leading prose just moves us to the next one.
    decoder = json.JSONDecoder()
    for brace in re.finditer(r"\{", text):
        try:
            obj, _end = decoder.raw_decode(text, brace.start())
        except ValueError:
            continue
        return obj
    raise gr.Error("Could not parse script JSON from the model.")


# ---------------------------------------------------------------------------
# Ad Studio orchestration (streaming generator)
# ---------------------------------------------------------------------------
def make_ad(product, selling_points, language, voice_label, num_scenes, product_image, motion):
    """Generate a complete product-ad video, streaming progress to the UI.

    This is a generator. Every yield is a ``(status_markdown, gallery_images,
    video_path)`` triple matching the three Gradio outputs; ``video_path`` is
    ``None`` until the final yield, which carries the path of the MP4.

    Pipeline: write the script -> fan out image and TTS jobs on Modal ->
    optionally animate each still -> assemble the video -> save it to a
    temporary ``.mp4`` file.

    Args:
        product: Free-text product description (required, non-blank).
        selling_points: Optional selling points passed to the script writer.
        language: Key into ``LANGS`` selecting narration language and voices.
        voice_label: UI voice label; an unknown label falls back to the first
            voice of ``language``.
        num_scenes: Requested scene count (converted with ``int``).
        product_image: Optional file path of a product photo. When it can be
            read it replaces the first and last scene images.
        motion: Motion mode label; any label containing ``"LTX"`` enables the
            per-scene animation step.

    Raises:
        gr.Error: on missing credentials, blank product, script failures, or
            any failure while running the Modal jobs.
    """
    _need_modal()
    _need_hf()
    if not product or not product.strip():
        raise gr.Error("Please describe your product.")

    num_scenes = int(num_scenes)
    lang_cfg = LANGS[language]
    lang_code = lang_cfg["lang_code"]
    # The voice label may not belong to the selected language; fall back to
    # that language's first voice.
    voice = lang_cfg["voices"].get(voice_label) or next(iter(lang_cfg["voices"].values()))
    use_video = "LTX" in (motion or "")
    # LTX needs /32; FLUX needs /16 — 704x1216 satisfies both and is LTX-native.
    gen_w, gen_h = (704, 1216) if use_video else (GEN_W, GEN_H)
    mode_label = "LTX motion" if use_video else "Ken Burns"

    log = []

    def status(msg):
        # Accumulate messages so the UI shows the full history, not just the
        # latest step.
        log.append(msg)
        return "\n\n".join(log)

    yield status("📝 Writing ad script with GLM…"), [], None

    title, scenes = write_script(product, selling_points, language, num_scenes)
    total = len(scenes)
    script_preview = "\n".join(
        f"**{i+1}. {s.get('caption','')}** — {s.get('narration','')}"
        for i, s in enumerate(scenes)
    )
    yield status(f"🎬 **{title}**\n\n{script_preview}\n\n🖼 Generating scenes on GPU…"), [], None

    # Optional: use the uploaded product photo as hero (scene 1) and CTA (last).
    # Resolved before submitting jobs so no image is generated for scenes that
    # the photo will replace anyway.
    hero_bytes = _image_to_png_bytes(product_image) if product_image else None
    hero_slots = {0, total - 1} if hero_bytes else set()
    # A scene without an image prompt falls back to the product description
    # rather than raising KeyError.
    prompts = [s.get("image_prompt") or product for s in scenes]

    gallery = []
    try:
        model, tts, animate, assemble = _modal_handles()

        # Fan out image + TTS jobs in parallel across Modal containers.
        img_calls = [
            None
            if i in hero_slots
            else model.generate.spawn(prompt=prompts[i], width=gen_w, height=gen_h)
            for i in range(total)
        ]
        tts_calls = [
            tts.synth.spawn(text=s.get("narration", ""), voice=voice, lang_code=lang_code)
            for s in scenes
        ]

        images = []
        for i, call in enumerate(img_calls):
            png = hero_bytes if call is None else call.get()
            images.append(png)
            gallery.append(Image.open(io.BytesIO(png)))
            yield status(f"🖼 Scene {i+1}/{total} ready…"), list(gallery), None

        # Optional: animate each still into a motion clip with LTX-Video.
        videos = [None] * total
        if use_video:
            yield status("🎥 Animating scenes with LTX-Video (this is the slow part)…"), list(gallery), None
            anim_calls = [
                animate.animate.spawn(
                    image_png=png,
                    prompt=prompt,
                    width=gen_w,
                    height=gen_h,
                )
                for png, prompt in zip(images, prompts)
            ]
            for i, call in enumerate(anim_calls):
                videos[i] = call.get()
                yield status(f"🎥 Motion clip {i+1}/{total} ready…"), list(gallery), None

        yield status("🔊 Synthesizing voiceover…"), list(gallery), None
        audios = [call.get() for call in tts_calls]

        yield status(f"🎞 Assembling video ({mode_label} + captions + voiceover)…"), list(gallery), None
        scene_payload = [
            {
                "image": png,
                "video": clip,
                "audio": audio,
                "caption": s.get("caption", ""),
            }
            for s, png, clip, audio in zip(scenes, images, videos, audios)
        ]
        mp4 = assemble.remote(scene_payload)
    except gr.Error:
        raise
    except Exception as exc:  # noqa: BLE001
        # Same user-facing treatment as the single-image tab: report the
        # failure in the UI instead of leaking a raw traceback.
        raise gr.Error(f"Ad generation failed: {exc}") from exc

    # delete=False: the file must outlive this function so Gradio can serve it.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as out:
        out.write(mp4)
    yield status(f"✅ Done! **{title}** — {total} scenes ({mode_label})."), list(gallery), out.name


def _image_to_png_bytes(path_or_img):
    """Best-effort conversion of an uploaded image to RGB PNG bytes.

    Args:
        path_or_img: A file path (``str``), opened with ``Image.open``; any
            other object is handed to ``Image.fromarray``.

    Returns:
        The PNG-encoded bytes, or ``None`` if loading or encoding fails for
        any reason.
    """
    try:
        if isinstance(path_or_img, str):
            source = Image.open(path_or_img)
        else:
            source = Image.fromarray(path_or_img)
        rgb = source.convert("RGB")
        encoded = io.BytesIO()
        rgb.save(encoded, format="PNG")
        return encoded.getvalue()
    except Exception:
        # Deliberately silent: the caller treats None as "no hero image".
        return None


# ---------------------------------------------------------------------------
# Single-image tab
# ---------------------------------------------------------------------------
def single_image(prompt, steps, width, height, seed):
    """Generate one image from a text prompt with the Modal ``Model`` class.

    Args:
        prompt: Text-to-image prompt (required, non-blank).
        steps: Number of inference steps (converted with ``int``).
        width: Output width in pixels (converted with ``int``).
        height: Output height in pixels (converted with ``int``).
        seed: Seed value; the UI labels ``-1`` as random. ``None`` (an emptied
            number field) is treated as ``-1``.

    Returns:
        A PIL image decoded from the PNG bytes returned by the remote call.

    Raises:
        gr.Error: on missing Modal credentials, a blank prompt, or any failure
            of the remote generation call.
    """
    _need_modal()
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt.")
    # An emptied Number field arrives as None; use the UI's "-1 = random"
    # convention instead of failing on int(None).
    if seed is None:
        seed = -1
    model = modal.Cls.from_name(MODAL_APP, "Model")()
    try:
        png = model.generate.remote(
            prompt=prompt,
            num_inference_steps=int(steps),
            width=int(width),
            height=int(height),
            seed=int(seed),
        )
    except Exception as exc:  # noqa: BLE001
        # Chain the cause so the original traceback stays in the server logs.
        raise gr.Error(f"Generation failed: {exc}") from exc
    return Image.open(io.BytesIO(png))


# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
def _on_lang_change(language):
    """Refresh the voice dropdown after the narration language changes.

    Args:
        language: Key into ``LANGS``.

    Returns:
        A ``gr.update`` that sets the dropdown choices to the voice labels of
        ``language`` and selects the first one.
    """
    labels = [*LANGS[language]["voices"]]
    return gr.update(choices=labels, value=labels[0])


# Build the two-tab Gradio UI. `demo` is the module-level app object that is
# launched at the bottom of the file.
with gr.Blocks(title="tricket", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎬 tricket — one prompt → a product ad video
        Describe a product → GLM writes the script → **FLUX** paints each scene →
        **Kokoro** voices it → **ffmpeg** cuts a vertical (9:16) ad.
        GPU runs on-demand on **Modal**. First run after idle ~30–60s (cold start).
        """
    )

    # --- Tab 1: Ad Studio — inputs on the left, streamed results on the right.
    with gr.Tab("🎬 Ad Studio"):
        with gr.Row():
            with gr.Column(scale=3):
                product = gr.Textbox(
                    label="Product",
                    placeholder="Aura Buds — wireless noise-cancelling earbuds with 30h battery",
                    lines=2,
                )
                selling = gr.Textbox(
                    label="Key selling points (optional)",
                    placeholder="noise cancelling, 30h battery, sweat-proof, instant pairing",
                    lines=2,
                )
                with gr.Row():
                    language = gr.Dropdown(
                        list(LANGS.keys()), value="中文", label="Narration language"
                    )
                    # Initial choices match the default language above; they
                    # are swapped by `_on_lang_change` when the language changes.
                    voice = gr.Dropdown(
                        list(LANGS["中文"]["voices"].keys()),
                        value=list(LANGS["中文"]["voices"].keys())[0],
                        label="Voice",
                    )
                    scenes_n = gr.Slider(3, 6, value=4, step=1, label="Scenes")
                # type="filepath": `make_ad` receives the upload as a path string.
                product_img = gr.Image(
                    label="Product photo (optional — used as hero & end frame)",
                    type="filepath",
                    height=160,
                )
                # `make_ad` enables the animation step when the selected
                # label contains "LTX".
                motion = gr.Radio(
                    ["Ken Burns（快）", "AI 视频 · LTX（慢，更炫）"],
                    value="Ken Burns（快）",
                    label="Motion / 运镜",
                    info="AI 视频会为每个场景生成真实动态片段,明显更慢、更费 GPU。",
                )
                make_btn = gr.Button("🎬 Make Ad Video", variant="primary")
            with gr.Column(scale=4):
                status = gr.Markdown("Ready.")
                gallery = gr.Gallery(label="Scenes", columns=3, height=240)
                video = gr.Video(label="Ad video (9:16)")

        # Keep the voice list in sync with the selected language.
        language.change(_on_lang_change, inputs=language, outputs=voice)
        # `make_ad` is a generator; each yield updates status, gallery and
        # video in that order.
        make_btn.click(
            make_ad,
            inputs=[product, selling, language, voice, scenes_n, product_img, motion],
            outputs=[status, gallery, video],
        )

    # --- Tab 2: Single Image — plain text-to-image via `single_image`.
    with gr.Tab("🖼 Single Image"):
        with gr.Row():
            with gr.Column(scale=3):
                s_prompt = gr.Textbox(label="Prompt", lines=3,
                                      placeholder="a cinematic photo of a red panda barista")
                with gr.Row():
                    s_steps = gr.Slider(1, 8, value=4, step=1, label="Steps")
                    s_seed = gr.Number(value=-1, label="Seed (-1=random)", precision=0)
                with gr.Row():
                    s_w = gr.Slider(512, 1024, value=1024, step=64, label="Width")
                    s_h = gr.Slider(512, 1024, value=1024, step=64, label="Height")
                s_btn = gr.Button("Generate", variant="primary")
            with gr.Column(scale=4):
                s_out = gr.Image(label="Result", type="pil", height=512)
        # Input order must match the `single_image` signature:
        # (prompt, steps, width, height, seed).
        s_btn.click(single_image, inputs=[s_prompt, s_steps, s_w, s_h, s_seed], outputs=s_out)


# Launch the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()