import gradio as gr
import torch

import numpy as np
import random
import os
import yaml
from pathlib import Path
import imageio
import tempfile
from PIL import Image
from huggingface_hub import hf_hub_download

from inference import (
    create_ltx_video_pipeline,
    create_latent_upsampler,
    load_image_to_tensor_with_resize_and_crop,
    seed_everething,
    calculate_padding,
    load_media_file
)
from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem, LTXMultiScalePipeline
from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy

# --- Đọc cấu hình và tải mô hình từ HuggingFace ---
CONFIG_YAML = "configs/ltxv-13b-0.9.7-distilled.yaml"
with open(CONFIG_YAML, "r") as f:
    CFG = yaml.safe_load(f)

HF_REPO = "LTTEAM/VideoAI"
MODELS_DIR = "downloaded_models"
Path(MODELS_DIR).mkdir(exist_ok=True)

print("Đang tải mô hình (nếu chưa có)…")
ckpt = hf_hub_download(repo_id=HF_REPO, filename=CFG["checkpoint_path"], local_dir=MODELS_DIR)
CFG["checkpoint_path"] = ckpt
upscaler = hf_hub_download(repo_id=HF_REPO, filename=CFG["spatial_upscaler_model_path"], local_dir=MODELS_DIR)
CFG["spatial_upscaler_model_path"] = upscaler

# --- Khởi tạo pipeline và upsampler trên CPU ---
print("Khởi tạo pipeline trên CPU…")
pipeline = create_ltx_video_pipeline(
    ckpt_path=CFG["checkpoint_path"],
    precision=CFG["precision"],
    text_encoder_model_name_or_path=CFG["text_encoder_model_name_or_path"],
    sampler=CFG["sampler"],
    device="cpu",
    enhance_prompt=False,
    prompt_enhancer_image_caption_model_name_or_path=CFG["prompt_enhancer_image_caption_model_name_or_path"],
    prompt_enhancer_llm_model_name_or_path=CFG["prompt_enhancer_llm_model_name_or_path"],
)
print("Pipeline sẵn sàng.")
print("Khởi tạo latent upsampler trên CPU…")
upsampler = create_latent_upsampler(CFG["spatial_upscaler_model_path"], device="cpu")
print("Upsampler sẵn sàng.")

# --- Thông số cố định ---
FPS = 30.0
MAX_FRAMES = 257
MIN_DIM = 256
FIXED_SIDE = 768
MAX_RES = CFG.get("max_resolution", 1280)

def calc_new_dims(w, h):
    if w==0 or h==0:
        return FIXED_SIDE, FIXED_SIDE
    if w>=h:
        nh = FIXED_SIDE
        nw = round((nh*w/h)/32)*32
    else:
        nw = FIXED_SIDE
        nh = round((nw*h/w)/32)*32
    return (
        int(max(MIN_DIM, min(nh, MAX_RES))),
        int(max(MIN_DIM, min(nw, MAX_RES)))
    )

def get_duration(*args, duration_ui=0, **kwargs):
    return 75 if duration_ui > 7 else 60


def generate(prompt, neg_prompt, img_path, vid_path,
             height, width, mode, duration_ui, frames_to_use,
             seed, rand_seed, cfg_scale, improve_tex, device_choice,
             progress=gr.Progress(track_tqdm=True)):

    # Chọn thiết bị inference
    dev = "cuda" if device_choice=="GPU" and torch.cuda.is_available() else "cpu"
    print(f"Sử dụng thiết bị: {dev}")
    pipeline.to(dev)
    upsampler.to(dev)

    # Seed
    if rand_seed:
        seed = random.randint(0, 2**32 - 1)
    seed_everething(int(seed))

    # Tính số frame
    tf = max(1, round(duration_ui * FPS))
    n8 = round((tf-1)/8)
    n_frames = max(9, min(n8*8+1, MAX_FRAMES))

    # Padding kích thước
    h, w = int(height), int(width)
    h_pad = ((h-1)//32+1)*32
    w_pad = ((w-1)//32+1)*32
    pad = calculate_padding(h, w, h_pad, w_pad)

    # Chuẩn bị kwargs chung
    kwargs = {
        "prompt": prompt,
        "negative_prompt": neg_prompt,
        "height": h_pad,
        "width": w_pad,
        "num_frames": n_frames,
        "frame_rate": int(FPS),
        "generator": torch.Generator(device=dev).manual_seed(int(seed)),
        "output_type": "pt",
        "decode_timestep": CFG["decode_timestep"],
        "decode_noise_scale": CFG["decode_noise_scale"],
        "stochastic_sampling": CFG["stochastic_sampling"],
        "is_video": True,
        "vae_per_channel_normalize": True,
        "mixed_precision": CFG["precision"]=="mixed_precision",
        "offload_to_cpu": False,
        "enhance_prompt": False,
    }
    # Skip-layer strategy
    mode_stg = CFG.get("stg_mode","attention_values").lower()
    stg_map = {
        "stg_av": SkipLayerStrategy.AttentionValues,
        "attention_values": SkipLayerStrategy.AttentionValues,
        "stg_as": SkipLayerStrategy.AttentionSkip,
        "attention_skip": SkipLayerStrategy.AttentionSkip,
        "stg_r": SkipLayerStrategy.Residual,
        "residual": SkipLayerStrategy.Residual,
        "stg_t": SkipLayerStrategy.TransformerBlock,
        "transformer_block": SkipLayerStrategy.TransformerBlock,
    }
    kwargs["skip_layer_strategy"] = stg_map.get(mode_stg, SkipLayerStrategy.AttentionValues)

    # Conditioning
    if mode=="image-to-video" and img_path:
        t = load_image_to_tensor_with_resize_and_crop(img_path, h, w)
        t = torch.nn.functional.pad(t, pad)
        kwargs["conditioning_items"] = [ConditioningItem(t.to(dev), 0, 1.0)]
    elif mode=="video-to-video" and vid_path:
        mi = load_media_file(vid_path, h, w, int(frames_to_use), pad).to(dev)
        kwargs["media_items"] = mi

    # Chọn multi-scale hay single-pass
    if improve_tex:
        pipe_ms = LTXMultiScalePipeline(pipeline, upsampler)
        fp = CFG.get("first_pass",{}).copy()
        fp["guidance_scale"] = float(cfg_scale)
        fp.pop("num_inference_steps", None)
        sp = CFG.get("second_pass",{}).copy()
        sp["guidance_scale"] = float(cfg_scale)
        sp.pop("num_inference_steps", None)
        kwargs.update({
            "downscale_factor": CFG["downscale_factor"],
            "first_pass": fp,
            "second_pass": sp
        })
        images = pipe_ms(**kwargs).images
    else:
        fp0 = CFG.get("first_pass",{})
        kwargs.update({
            "timesteps": fp0.get("timesteps"),
            "guidance_scale": float(cfg_scale),
            "stg_scale": fp0.get("stg_scale"),
            "rescaling_scale": fp0.get("rescaling_scale"),
            "skip_block_list": fp0.get("skip_block_list")
        })
        for k in ["first_pass","second_pass","downscale_factor","num_inference_steps"]:
            kwargs.pop(k, None)
        images = pipeline(**kwargs).images

    # Bỏ pad, lưu video
    l, r, t_, b = pad
    sh = None if b==0 else -b
    sw = None if r==0 else -r
    vid_t = images[0][:,:,:n_frames, t_:sh, l:sw]
    arr = vid_t.permute(1,2,3,0).cpu().numpy()
    arr = (np.clip(arr,0,1)*255).astype(np.uint8)

    out_dir = tempfile.mkdtemp()
    out_path = os.path.join(out_dir, f"output_{random.randint(0,99999)}.mp4")
    with imageio.get_writer(out_path, fps=int(FPS), macro_block_size=1) as writer:
        for i in range(arr.shape[0]):
            progress(i/arr.shape[0], desc="Lưu video")
            writer.append_data(arr[i])

    return out_path, seed

# --- Giao diện Gradio ---
css = """
#col-container { margin:0 auto; max-width:900px; }
"""
with gr.Blocks(css=css) as demo:
    gr.Markdown("## Ứng dụng LTX Video 0.9.7 Distilled")
    gr.Markdown(
        "[Mô hình trên HF](https://huggingface.co/LTTEAM/VideoAI) · "
        "[GitHub](https://github.com/Lightricks/LTX-Video)"
    )

    with gr.Row():
        with gr.Column():
            device = gr.Radio(["CPU", "GPU"], label="Chạy trên thiết bị", value="CPU")

            with gr.Tab("Ảnh→Video"):
                img_in = gr.Image(label="Ảnh đầu vào", type="filepath", sources=["upload","clipboard","webcam"])
                prompt1 = gr.Textbox(label="Mô tả", lines=2, value="Con sinh vật di chuyển")
                btn1 = gr.Button("Tạo từ ảnh")

            with gr.Tab("Văn bản→Video"):
                prompt2 = gr.Textbox(label="Mô tả", lines=2, value="Rồng bay trên lâu đài")
                btn2 = gr.Button("Tạo từ văn bản")

            with gr.Tab("Video→Video"):
                vid_in = gr.Video(label="Video đầu vào", sources=["upload","webcam"])
                frames = gr.Slider(label="Số frame dùng", minimum=9, maximum=MAX_FRAMES, step=8, value=9)
                prompt3 = gr.Textbox(label="Mô tả", lines=2, value="Chuyển phong cách anime")
                btn3 = gr.Button("Tạo từ video")

            duration = gr.Slider(label="Thời lượng (giây)", minimum=0.3, maximum=8.5, step=0.1, value=2)
            improve = gr.Checkbox(label="Cải thiện chi tiết", value=True)

        with gr.Column():
            out_vid = gr.Video(label="Kết quả", interactive=False)

    # Trạng thái ẩn
    mode_state = gr.State("image-to-video")
    seed_state = gr.State(42)
    neg_state = gr.State("worst quality, inconsistent motion, blurry, jittery, distorted")
    cfg_state = gr.State(CFG["first_pass"]["guidance_scale"])
    h_state = gr.State(512)
    w_state = gr.State(704)

    btn1.click(fn=generate,
               inputs=[prompt1, neg_state, img_in, gr.State(""), h_state, w_state,
                       mode_state, duration, frames, seed_state, gr.State(True),
                       cfg_state, improve, device],
               outputs=[out_vid, seed_state])
    btn2.click(fn=generate,
               inputs=[prompt2, neg_state, gr.State(""), gr.State(""), h_state, w_state,
                       mode_state, duration, frames, seed_state, gr.State(True),
                       cfg_state, improve, device],
               outputs=[out_vid, seed_state])
    btn3.click(fn=generate,
               inputs=[prompt3, neg_state, gr.State(""), vid_in, h_state, w_state,
                       mode_state, duration, frames, seed_state, gr.State(True),
                       cfg_state, improve, device],
               outputs=[out_vid, seed_state])

if __name__ == "__main__":
    demo.queue().launch(share=True)