import os
import subprocess
import sys
from pathlib import Path

# Disable torch.compile / dynamo before any torch import
os.environ["TORCH_COMPILE_DISABLE"] = "1"
os.environ["TORCHDYNAMO_DISABLE"] = "1"
# OpenImageIO / OpenEXR readers need this to be set before cv2 import
os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"

# ── Runtime dependency bootstrap ─────────────────────────────────────────────
# Everything below pip-installs into the *running* interpreter
# (sys.executable) at import time, before the heavy imports further down.

# Install xformers for memory-efficient attention
# check=False: a failed install is tolerated here; the attention patch later
# in this file reports the failure and the app carries on without xformers.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2", "--no-build-isolation"],
    check=False,
)

# Video / HDR dependencies
# Best-effort batch (check=False): a failure of this pip invocation does not
# abort startup.
subprocess.run(
    [sys.executable, "-m", "pip", "install",
     "imageio[ffmpeg]", "scikit-image", "opencv-python-headless",
     "decord", "num2words", "OpenImageIO"],
    check=False,
)
# num2words is installed again on its own with check=True, so startup aborts
# if it cannot be installed even when the best-effort batch above failed.
# NOTE(review): presumably a hard requirement of a later import — confirm.
subprocess.run([sys.executable, "-m", "pip", "install", "num2words"], check=True)

# Reinstall torchaudio to match the torch CUDA version on this space.
# The torch version is queried in a child interpreter so that torch is not
# imported into this process at this point.
_tv = subprocess.run(
    [sys.executable, "-c", "import torch; print(torch.__version__)"],
    capture_output=True, text=True,
)
# If the probe fails (non-zero exit), the torchaudio reinstall is skipped.
if _tv.returncode == 0:
    _full_ver = _tv.stdout.strip()
    # Part after "+" in the version string (e.g. "cu124"); falls back to
    # "cu124" when the version string has no "+" part.
    _cuda_suffix = _full_ver.split("+")[-1] if "+" in _full_ver else "cu124"
    # Part before "+", used as the torchaudio version pin.
    _base_ver = _full_ver.split("+")[0]
    print(f"Detected torch {_full_ver}, reinstalling matching torchaudio...")
    # --no-deps keeps pip from touching the already-installed torch;
    # check=False makes a missing matching wheel non-fatal.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-deps",
         f"torchaudio=={_base_ver}",
         "--index-url", f"https://download.pytorch.org/whl/{_cuda_suffix}"],
        check=False,
    )

# ─────────────────────────────────────────────────────────────────────────────
# ltx-core / ltx-pipelines source
#
# The HDRICLoraPipeline and its supporting modules (ltx_core.hdr,
# ltx_pipelines.utils.blocks, load_video_conditioning_hdr, apply_hdr_decode_postprocess,
# save_exr_tensor, encode_exr_sequence_to_mp4) are NOT on the public main
# branch at the pinned commit used by the outpaint app. We install from the
# local ltx-2-internal checkout so the HDR code path actually exists.
# ─────────────────────────────────────────────────────────────────────────────
# Checkout location: the LTX_INTERNAL_PATH environment variable if set,
# otherwise an "ltx-2-internal" directory next to this file.
LTX_INTERNAL = Path(os.environ.get(
    "LTX_INTERNAL_PATH",
    str(Path(__file__).resolve().parent / "ltx-2-internal"),
))
LTX_CORE_PKG = LTX_INTERNAL / "packages" / "ltx-core"
LTX_PIPELINES_PKG = LTX_INTERNAL / "packages" / "ltx-pipelines"

print(f"Installing ltx-core + ltx-pipelines from {LTX_INTERNAL}...")
# Editable install of both packages without dependency resolution.
# check=True: unlike the optional installs above, a failure here aborts
# startup with CalledProcessError.
subprocess.run(
    [sys.executable, "-m", "pip", "install",
     "--force-reinstall", "--no-deps",
     "-e", str(LTX_CORE_PKG),
     "-e", str(LTX_PIPELINES_PKG)],
    check=True,
)
# Put both "src" directories at the front of sys.path for this process
# (ltx-core ends up first, ltx-pipelines second).
sys.path.insert(0, str(LTX_PIPELINES_PKG / "src"))
sys.path.insert(0, str(LTX_CORE_PKG / "src"))

import logging
import random
import tempfile
import zipfile

import torch
# Belt and braces: besides the TORCH_COMPILE_DISABLE / TORCHDYNAMO_DISABLE
# environment variables set at the top of this file, also switch dynamo off
# through its runtime config once torch is imported.
torch._dynamo.config.suppress_errors = True
torch._dynamo.config.disable = True

import spaces
import gradio as gr
import numpy as np
from huggingface_hub import hf_hub_download, snapshot_download

from ltx_core.model.video_vae import TilingConfig
from ltx_core.quantization import QuantizationPolicy
from ltx_pipelines.hdr_ic_lora import HDRICLoraPipeline, _make_tiling_config
from ltx_pipelines.utils.blocks import PromptEncoder
from ltx_pipelines.utils.media_io import (
    encode_exr_sequence_to_mp4,
    get_videostream_metadata,
    save_exr_tensor,
)
from ltx_pipelines.utils.types import OffloadMode

# xformers attention patch (same as the outpaint app).
# Rebinds the module-level `memory_efficient_attention` name inside
# ltx_core's attention module to the xformers implementation. The before /
# after prints make it visible in the logs which implementation is active.
from ltx_core.model.transformer import attention as _attn_mod
print(f"[ATTN] Before patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
try:
    from xformers.ops import memory_efficient_attention as _mea
    _attn_mod.memory_efficient_attention = _mea
    print(f"[ATTN] After patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
except Exception as e:
    # Best effort: xformers may be missing (its install above is optional).
    # The failure is logged and whatever ltx_core bound originally stays.
    print(f"[ATTN] xformers patch FAILED: {type(e).__name__}: {e}")

# Raise the root logger to INFO.
logging.getLogger().setLevel(logging.INFO)


# ─────────────────────────────────────────────────────────────────────────────
# Constants + model downloads
# ─────────────────────────────────────────────────────────────────────────────
# Upper bound for the seed slider and for randomly drawn seeds (int32 max).
MAX_SEED = np.iinfo(np.int32).max

# Frames must satisfy (n-1) % 8 == 0. Aspect-ratio canvas sizes (divisible by 32).
# Values are (width, height) per tier and aspect label. Some sizes only
# approximate their label (e.g. "16:9" low is 768x512, i.e. 3:2).
# NOTE(review): "1:1" is 768x768 in "low" but 1024x1024 in "high", which is
# not a 2x step like the other entries — confirm this is intended.
RESOLUTIONS = {
    "low":  {"16:9": (768, 512), "9:16": (512, 768), "1:1": (768, 768),
             "4:3": (768, 576), "3:4": (576, 768), "21:9": (768, 384)},
    "high": {"16:9": (1536, 1024), "9:16": (1024, 1536), "1:1": (1024, 1024),
             "4:3": (1536, 1152), "3:4": (1152, 1536), "21:9": (1536, 768)},
}

# Hugging Face Hub repositories and file names of the model assets.
LTX_MODEL_REPO = "Lightricks/LTX-2.3"
DISTILLED_CHECKPOINT = "ltx-2.3-22b-distilled-1.1.safetensors"
SPATIAL_UPSCALER = "ltx-2.3-spatial-upscaler-x2-1.1.safetensors"
GEMMA_REPO = "google/gemma-3-12b-it-qat-q4_0-unquantized"

HDR_LORA_REPO = "Lightricks/LTX-2.3-22b-IC-LoRA-HDR"
HDR_LORA_FILENAME = "ltx-2.3-22b-ic-lora-hdr-0.9.safetensors"

print("=" * 80)
print("Downloading LTX-2.3 distilled + spatial upsampler + Gemma + HDR IC-LoRA...")
print("=" * 80)

# Downloads happen at import time; the returned local paths are passed to
# the prompt encoder and the pipeline below.
checkpoint_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename=DISTILLED_CHECKPOINT)
spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename=SPATIAL_UPSCALER)
hdr_lora_path = hf_hub_download(repo_id=HDR_LORA_REPO, filename=HDR_LORA_FILENAME)
# The text encoder is fetched as a whole repository snapshot.
gemma_root = snapshot_download(repo_id=GEMMA_REPO)

print(f"Checkpoint: {checkpoint_path}")
print(f"Spatial upsampler: {spatial_upsampler_path}")
print(f"HDR IC-LoRA: {hdr_lora_path}")
print(f"Gemma root: {gemma_root}")


# ─────────────────────────────────────────────────────────────────────────────
# Text encoding: on-the-fly Gemma -> (video_context, audio_context) for each
# prompt. HDRICLoraPipeline expects a `.pt` path at __init__, so a placeholder
# embeddings file is written at module load (no Gemma run), and
# `pipeline.text_embeddings` is overwritten in memory on each generate call.
# ─────────────────────────────────────────────────────────────────────────────
# Falls back to CPU when CUDA is not visible at import time.
_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_DTYPE = torch.bfloat16

# Shared encoder instance used by encode_prompt_to_contexts() for every
# request; built from the distilled checkpoint and the Gemma snapshot.
prompt_encoder = PromptEncoder(
    checkpoint_path=checkpoint_path,
    gemma_root=gemma_root,
    dtype=_DTYPE,
    device=_DEVICE,
)


def encode_prompt_to_contexts(prompt: str) -> tuple[torch.Tensor, torch.Tensor]:
    """Encode one prompt into a ``(video_context, audio_context)`` pair.

    The prompt is passed to the module-level ``prompt_encoder`` as a batch of
    one. When the encoder yields no audio encoding, an empty tensor on the
    same device and with the same dtype as the video encoding stands in for
    it, so callers always receive two tensors.

    Per the pipeline's ``.pt`` interface, only the video context is consumed
    during HDR generation; the audio context is carried for shape
    compatibility. On ZeroGPU this MUST be called from inside a
    ``@spaces.GPU`` context.
    """
    encoded_batch = prompt_encoder([prompt])
    # Exactly one result is expected for a one-element batch.
    (encoded,) = encoded_batch

    video_context = encoded.video_encoding
    if encoded.audio_encoding is None:
        audio_context = torch.zeros(
            0, device=video_context.device, dtype=video_context.dtype
        )
    else:
        audio_context = encoded.audio_encoding
    return video_context, audio_context


# HDRICLoraPipeline.__init__ requires a .pt it can torch.load, but it only
# stores the tensors — __call__ reads `self.text_embeddings` which we overwrite
# on every generate run. So write a placeholder .pt at module-load (CPU, no
# Gemma run — Gemma can only touch GPU inside a @spaces.GPU function on ZeroGPU).
# The file lives at a fixed name in the system temp directory and is
# rewritten on every start.
_bootstrap_emb_path = Path(tempfile.gettempdir()) / "ltx_hdr_bootstrap_emb.pt"
# All-zero stand-in used for both contexts.
# NOTE(review): 4096 presumably matches the expected embedding width — confirm.
_placeholder = torch.zeros(1, 1, 4096, dtype=_DTYPE)
torch.save({"video_context": _placeholder, "audio_context": _placeholder}, _bootstrap_emb_path)


# ─────────────────────────────────────────────────────────────────────────────
# Initialize pipeline
# ─────────────────────────────────────────────────────────────────────────────
# HDRICLoraPipeline is video-only (no audio path). HDR transform (LogC3) and
# reference_downscale_factor are auto-detected from the LoRA metadata.
# A single module-level pipeline instance is shared by all requests;
# generate_video() swaps its `text_embeddings` per call. It is constructed
# with the placeholder embeddings file, fp8-cast quantization and no offload.
pipeline = HDRICLoraPipeline(
    distilled_checkpoint_path=checkpoint_path,
    spatial_upsampler_path=spatial_upsampler_path,
    hdr_lora=hdr_lora_path,
    text_embeddings_path=str(_bootstrap_emb_path),
    quantization=QuantizationPolicy.fp8_cast(),
    offload_mode=OffloadMode.NONE,
)
# Log the settings the pipeline exposes after construction.
print(f"HDRICLoraPipeline ready. HDR transform: {pipeline.hdr_transform}, "
      f"ref_downscale={pipeline.reference_downscale_factor}")
print("=" * 80)


# ─────────────────────────────────────────────────────────────────────────────
# UI helpers
# ─────────────────────────────────────────────────────────────────────────────
def detect_aspect_ratio(video_path) -> str:
    """Return the supported aspect-ratio label closest to the video's own.

    The width/height ratio read from the video stream metadata is compared
    with each supported ratio by absolute difference; the nearest label wins
    (the first listed label wins a tie). ``"16:9"`` is returned when no video
    is given or when its metadata cannot be read.
    """
    fallback = "16:9"
    if video_path is None:
        return fallback

    try:
        stream = get_videostream_metadata(str(video_path))
        measured = stream.width / stream.height
    except Exception:
        # Unreadable file, missing stream, zero height, ...: use the default.
        return fallback

    named_ratios = (
        ("16:9", 16 / 9),
        ("9:16", 9 / 16),
        ("1:1", 1.0),
        ("4:3", 4 / 3),
        ("3:4", 3 / 4),
        ("21:9", 21 / 9),
    )
    nearest_label, _ = min(named_ratios, key=lambda entry: abs(measured - entry[1]))
    return nearest_label


def get_target_resolution(target_aspect: str, high_res: bool) -> tuple[int, int]:
    """Look up the ``(width, height)`` canvas for an aspect label and tier.

    ``high_res`` selects the ``"high"`` table of ``RESOLUTIONS``, otherwise
    ``"low"`` is used. An aspect label missing from the table falls back to
    that tier's ``"16:9"`` entry.
    """
    sizes = RESOLUTIONS["high"] if high_res else RESOLUTIONS["low"]
    try:
        return sizes[target_aspect]
    except KeyError:
        return sizes["16:9"]


def on_video_upload(video):
    """Pre-fill duration, aspect ratio and FPS controls from an uploaded video.

    Args:
        video: Path of the uploaded video, or ``None`` when the input was
            cleared.

    Returns:
        Three ``gr.update`` objects, in order: duration (seconds), aspect
        ratio label, frame rate. When ``video`` is ``None`` all three are
        no-op updates. When the metadata cannot be read, duration and FPS
        fall back to 3.0 s and 24.0 fps.
    """
    # Bounds of the "Duration (s)" slider in the UI; keep in sync with it.
    min_duration, max_duration = 1.0, 6.7

    if video is None:
        return gr.update(), gr.update(), gr.update()
    try:
        meta = get_videostream_metadata(str(video))
        fps = float(meta.fps)
        # Clamp on both sides: the original only capped the upper bound, so a
        # clip shorter than the slider minimum pushed an out-of-range value
        # into the slider.
        seconds = meta.frames / fps
        dur = round(min(max(seconds, min_duration), max_duration), 1)
    except Exception:
        # Unreadable metadata, zero FPS, ...: use neutral defaults.
        dur, fps = 3.0, 24.0
    return gr.update(value=dur), gr.update(value=detect_aspect_ratio(video)), gr.update(value=fps)


# ─────────────────────────────────────────────────────────────────────────────
# Generation
# ─────────────────────────────────────────────────────────────────────────────
@spaces.GPU(duration=300)
@torch.inference_mode()
def generate_video(
    input_video,
    prompt: str,
    duration: float,
    frame_rate: float,
    target_aspect: str,
    high_res: bool,
    seed: int,
    randomize_seed: bool,
    high_quality_hdr: bool,
    export_exr: bool,
    progress=gr.Progress(track_tqdm=True),
):
    """Regenerate the uploaded clip as linear HDR and package the results.

    Steps: encode the prompt and swap it into the shared ``pipeline``, run the
    pipeline conditioned on the source video, crop away aspect-mismatch
    reflection padding, write per-frame EXR files, tonemap them into a preview
    mp4 and optionally zip the EXR frames. The intermediate EXR directory is
    removed once the preview (and zip) have been written.

    Args:
        input_video: Path of the uploaded source video, or ``None``.
        prompt: Scene description; ``None``/empty is encoded as ``""``.
        duration: Requested clip length in seconds.
        frame_rate: Output frames per second.
        target_aspect: Aspect label used to pick the canvas size.
        high_res: Use the high-resolution tier and a larger spatial tile.
        seed: Seed used when ``randomize_seed`` is false.
        randomize_seed: Draw a fresh seed in ``[0, MAX_SEED]`` instead.
        high_quality_hdr: Forwarded to the pipeline unchanged.
        export_exr: Also return a zip archive of the EXR frames.
        progress: Gradio progress tracker. Not used directly; declared so
            Gradio tracks tqdm progress (``track_tqdm=True``).

    Returns:
        ``(preview_mp4_path, exr_zip_path_or_None, seed_used)``. On any
        failure the error and traceback are printed and
        ``(None, None, seed_used)`` is returned instead of raising.
    """
    import shutil
    import traceback

    current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
    work_dir = None  # set once the temp dir exists, so failures can clean it up
    try:
        # Guarded so the CPU fallback selected at module load does not fail
        # here before doing any work.
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()

        if input_video is None:
            raise ValueError("Please upload a source video.")
        # The FPS field can be cleared or set to 0 in the UI; fail with a
        # clear message instead of a TypeError / a one-frame run.
        if not duration or duration <= 0 or not frame_rate or frame_rate <= 0:
            raise ValueError(
                f"Duration and FPS must be positive (got duration={duration!r}, fps={frame_rate!r})."
            )

        video_path = str(input_video)

        # Frame count must satisfy (n-1) % 8 == 0: round (n-1) up to the next
        # multiple of 8.
        num_frames = int(duration * frame_rate) + 1
        num_frames = ((num_frames - 1 + 7) // 8) * 8 + 1

        target_w, target_h = get_target_resolution(target_aspect, high_res)

        print(f"[HDR] {target_h}x{target_w}, frames={num_frames}, fps={frame_rate}, "
              f"seed={current_seed}, aspect={target_aspect}, hq_hdr={high_quality_hdr}")

        # Encode prompt -> (video_context, audio_context) and swap into the
        # pipeline. Gemma is loaded, used, and freed inside prompt_encoder.
        print(f"[HDR] Encoding prompt: {prompt!r}")
        video_context, audio_context = encode_prompt_to_contexts(prompt or "")
        pipeline.text_embeddings = (video_context, audio_context)

        # Tiling config: smaller spatial tile on lower-VRAM targets
        tiling_config = _make_tiling_config(spatial_tile=768 if not high_res else 1280)

        hdr_video = pipeline(
            seed=current_seed,
            height=int(target_h),
            width=int(target_w),
            num_frames=num_frames,
            frame_rate=float(frame_rate),
            video_conditioning=[(video_path, 1.0)],
            tiling_config=tiling_config,
            high_quality_hdr=high_quality_hdr,
        )

        # The pipeline's internal resize_and_reflect_pad adds bottom/right
        # reflection when source aspect != target aspect. Its built-in crop
        # (_decode_video's `out[:, :crop_size[1], :crop_size[0], :]`) only
        # undoes the 64-divisor alignment padding — it leaves aspect-mismatch
        # reflection bands in the decoded output. Apply the same top-left
        # slice here with the un-reflected content region.
        # Best effort: if the source metadata cannot be read, the uncropped
        # result is kept.
        try:
            src_meta = get_videostream_metadata(video_path)
            src_aspect = src_meta.width / src_meta.height
            tgt_aspect = target_w / target_h
            if src_aspect > tgt_aspect:
                # Source is wider than the canvas: padding sits at the bottom.
                content_h = int(round(target_w / src_aspect))
                content_h -= content_h % 2  # libx264 yuv420p needs even dims
                if 0 < content_h < hdr_video.shape[1]:
                    print(f"[HDR] Cropping reflected bottom: {hdr_video.shape[1]} -> {content_h}")
                    hdr_video = hdr_video[:, :content_h, :, :]
            elif src_aspect < tgt_aspect:
                # Source is taller than the canvas: padding sits on the right.
                content_w = int(round(target_h * src_aspect))
                content_w -= content_w % 2  # libx264 yuv420p needs even dims
                if 0 < content_w < hdr_video.shape[2]:
                    print(f"[HDR] Cropping reflected right: {hdr_video.shape[2]} -> {content_w}")
                    hdr_video = hdr_video[:, :, :content_w, :]
        except Exception as e:
            print(f"[HDR] Post-crop skipped: {type(e).__name__}: {e}")

        # hdr_video is [f, h, w, c] linear HDR float. Write EXR frames, then
        # tonemap to a libx264 mp4 for in-browser preview.
        work_dir = Path(tempfile.mkdtemp(prefix="ltx-hdr-"))
        exr_dir = work_dir / "exr"
        exr_dir.mkdir(parents=True, exist_ok=True)

        for j in range(hdr_video.shape[0]):
            save_exr_tensor(
                hdr_video[j].detach().cpu(),
                str(exr_dir / f"frame_{j:05d}.exr"),
                half=False,
            )
        # Drop the tensor before encoding to keep peak memory down.
        del hdr_video

        preview_mp4 = work_dir / "preview.mp4"
        encode_exr_sequence_to_mp4(exr_dir, preview_mp4, frame_rate=float(frame_rate))

        exr_zip_path = None
        if export_exr:
            exr_zip_path = work_dir / "exr.zip"
            # ZIP_STORED: frames are archived without compression.
            with zipfile.ZipFile(exr_zip_path, "w", compression=zipfile.ZIP_STORED) as zf:
                for exr in sorted(exr_dir.glob("frame_*.exr")):
                    zf.write(exr, arcname=exr.name)

        # The loose float32 EXR frames are no longer needed: the preview is
        # encoded and, if requested, the zip holds a copy of every frame.
        # Leaving them behind fills the temp disk on every run.
        shutil.rmtree(exr_dir, ignore_errors=True)

        return str(preview_mp4), (str(exr_zip_path) if exr_zip_path else None), current_seed

    except Exception as e:
        print(f"Error: {e}\n{traceback.format_exc()}")
        # Nothing from a failed run is returned, so remove its partial output.
        if work_dir is not None:
            shutil.rmtree(work_dir, ignore_errors=True)
        return None, None, current_seed


# ─────────────────────────────────────────────────────────────────────────────
# Gradio UI — LTX 2.3 HDR
# ─────────────────────────────────────────────────────────────────────────────
# Custom CSS: taller generate button, hidden Gradio footer, and letterboxed
# (never cropped) video players.
css = """
.generate-btn { min-height: 52px !important; font-size: 1.1em !important; }
footer { display: none !important; }
video { object-fit: contain !important; }
"""

theme = gr.themes.Citrus(
    primary_hue=gr.themes.colors.purple,
    secondary_hue=gr.themes.colors.purple,
    neutral_hue=gr.themes.colors.gray,
)

with gr.Blocks(title="LTX 2.3 HDR", css=css, theme=theme) as demo:
    # The HDR IC-LoRA link points at the repository the app actually loads
    # the LoRA from (HDR_LORA_REPO). The trailing spaces inside the Markdown
    # text are kept as-is.
    gr.Markdown("""
# LTX 2.3 HDR ✨
Reconstruct the highlights and shadows clipped away by the camera with LTX-2.3 HDR:                  
Video-to-video HDR via LTX-2.3 + [HDR IC-LoRA](https://huggingface.co/Lightricks/LTX-2.3-22b-IC-LoRA-HDR).
Output is linear HDR (LogC3 inverse decoded). 
    """)

    with gr.Row():
        # Left column: inputs and settings.
        with gr.Column(scale=1):
            input_video = gr.Video(label="Source Video")

            prompt = gr.Textbox(
                label="Prompt",
                info="Describe the scene being regenerated in HDR",
                lines=2,
                placeholder="a cinematic sunset over mountains, high dynamic range, bright sky, deep shadows",
            )

            with gr.Row():
                # Choices mirror the aspect labels of RESOLUTIONS.
                target_aspect = gr.Dropdown(
                    label="Aspect Ratio",
                    choices=["16:9", "9:16", "1:1", "4:3", "3:4", "21:9"],
                    value="16:9",
                )
                # The 6.7 s maximum matches the cap applied in on_video_upload.
                duration = gr.Slider(
                    label="Duration (s)", minimum=1.0, maximum=6.7, value=3.0, step=0.1,
                )
                frame_rate = gr.Number(label="FPS", value=24.0, precision=2)

            generate_btn = gr.Button(
                "Generate HDR", variant="primary", size="lg", elem_classes=["generate-btn"],
            )

            with gr.Accordion("Advanced Settings", open=False):
                high_res = gr.Checkbox(label="High Resolution (2×)", value=False)
                high_quality_hdr = gr.Checkbox(
                    label="High-Quality HDR",
                    value=False,
                    info="Generates at 2× internal frame count, keeps every other frame. ~2× slower, smoother temporally.",
                )
                export_exr = gr.Checkbox(
                    label="Export EXR frames (zip)",
                    value=True,
                    info="Per-frame linear OpenEXR (float32). Uncheck if you only want the mp4 preview.",
                )
                seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, value=10, step=1)
                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)

        # Right column: results.
        with gr.Column(scale=1):
            output_video = gr.Video(label="Preview (sRGB tonemap)", autoplay=True, height=480)
            gr.Markdown("The preview mp4 is a fixed-EV sRGB tonemap; the EXR zip contains the full linear float frames for grading.")
            output_exr = gr.File(label="EXR frames (zip)")

    # Uploading (or clearing) a video pre-fills duration, aspect and FPS.
    input_video.change(
        fn=on_video_upload,
        inputs=[input_video],
        outputs=[duration, target_aspect, frame_rate],
    )

    # Input order must match generate_video's positional parameters; the
    # seed slider is also an output so the seed actually used is shown.
    generate_btn.click(
        fn=generate_video,
        inputs=[
            input_video, prompt, duration, frame_rate, target_aspect, high_res,
            seed, randomize_seed, high_quality_hdr, export_exr,
        ],
        outputs=[output_video, output_exr, seed],
    )


# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()