import os
import gc
import tempfile
import random
import numpy as np
import torch
from PIL import Image

# os.system("pip install spaces-0.1.0-py3-none-any.whl moviepy==1.0.3 imageio[ffmpeg]")
import spaces
import gradio as gr
from diffusers import AutoencoderKLWan, WanPipeline, WanImageToVideoPipeline, UniPCMultistepScheduler
from diffusers.utils import export_to_video
from moviepy.editor import VideoFileClip, concatenate_videoclips

MODEL_ID = "FastVideo/FastWan2.2-TI2V-5B-FullAttn-Diffusers"

# The VAE stays in float32 for decode quality; both pipelines run in bfloat16.
vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
text_to_video_pipe = WanPipeline.from_pretrained(MODEL_ID, vae=vae, torch_dtype=torch.bfloat16)
image_to_video_pipe = WanImageToVideoPipeline.from_pretrained(MODEL_ID, vae=vae, torch_dtype=torch.bfloat16)

for pipe in [text_to_video_pipe, image_to_video_pipe]:
    pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=8.0)
    pipe.to("cuda")

MOD_VALUE = 32  # Wan requires spatial dimensions divisible by 32
DEFAULT_H = 896
DEFAULT_W = 896
MAX_SEED = np.iinfo(np.int32).max
FIXED_FPS = 24
MIN_FRAMES_MODEL = 25
MAX_FRAMES_MODEL = 193


def _clean_memory():
    # Plain housekeeping between generations; it does not need its own
    # @spaces.GPU allocation.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def generate_video_gpu(input_files, prompt, height, width, negative_prompt, target_frames,
                       guidance_scale, steps, seed, randomize_seed,
                       progress=gr.Progress(track_tqdm=True)):
    target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
    target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)

    # Clamp the requested length to the model's supported range and snap it to
    # 4k + 1 frames, matching Wan's 4x temporal VAE compression.
    num_frames = min(max(int(target_frames), MIN_FRAMES_MODEL), MAX_FRAMES_MODEL)
    num_frames = ((num_frames - 1) // 4) * 4 + 1

    master_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
    video_clips_paths = []
    pil_images = []

    if input_files is not None:
        files_list = input_files if isinstance(input_files, list) else [input_files]
        for f in files_list:
            try:
                path = f.name if hasattr(f, "name") else f
                img = Image.open(path).convert("RGB")
                pil_images.append(img)
            except Exception:
                continue

    if len(pil_images) > 0:
        # Image-to-video mode: one clip per input image, each with its own seed.
        for i, img in enumerate(pil_images):
            _clean_memory()
            local_seed = master_seed + i
            generator = torch.Generator(device="cuda").manual_seed(local_seed)
            resized_image = img.resize((target_w, target_h))
            try:
                with torch.inference_mode():
                    output_frames = image_to_video_pipe(
                        image=resized_image,
                        prompt=prompt,
                        negative_prompt=negative_prompt,
                        height=target_h,
                        width=target_w,
                        num_frames=num_frames,
                        guidance_scale=float(guidance_scale),
                        num_inference_steps=int(steps),
                        generator=generator
                    ).frames[0]
                with tempfile.NamedTemporaryFile(suffix=f"_img_{i}.mp4", delete=False) as tmp:
                    export_to_video(output_frames, tmp.name, fps=FIXED_FPS)
                    video_clips_paths.append(tmp.name)
                progress((i + 1) / len(pil_images))
            except Exception:
                continue
    else:
        # Text-to-video mode: generate a single clip with the requested frame count.
        _clean_memory()
        generator = torch.Generator(device="cuda").manual_seed(master_seed)
        with torch.inference_mode():
            output_frames = text_to_video_pipe(
                prompt=prompt,
                negative_prompt=negative_prompt,
                height=target_h,
                width=target_w,
                num_frames=num_frames,
                guidance_scale=float(guidance_scale),
                num_inference_steps=int(steps),
                generator=generator
            ).frames[0]
        with tempfile.NamedTemporaryFile(suffix="_txt2vid.mp4", delete=False) as tmp:
            export_to_video(output_frames, tmp.name, fps=FIXED_FPS)
            video_clips_paths.append(tmp.name)
        progress(1.0)

    _clean_memory()
    return video_clips_paths, master_seed


def stitch_videos(video_paths):
    # CPU-only MoviePy work; no GPU decorator needed here.
    if not video_paths:
        return None
    if len(video_paths) == 1:
        return video_paths[0]
    try:
        clips = [VideoFileClip(p) for p in video_paths]
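        # method="compose" pads clips of differing sizes onto a common canvas
        # instead of requiring identical resolutions. Every clip here is already
        # rendered at the same target_w x target_h, so method="chain" would also
        # work; "compose" is kept as the more forgiving default.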
        final_clip = concatenate_videoclips(clips, method="compose")
        with tempfile.NamedTemporaryFile(suffix="_final.mp4", delete=False) as final_tmp:
            final_path = final_tmp.name
        final_clip.write_videofile(final_path, codec="libx264", audio=False, fps=FIXED_FPS, logger=None)
        for c in clips:
            c.close()
        return final_path
    except Exception:
        # On any stitching failure, fall back to the first clip rather than failing the run.
        return video_paths[0]


# Only the top-level Gradio handler needs the ZeroGPU decorator; the helpers
# above run inside the GPU allocation it acquires.
@spaces.GPU()
def main_process(input_files, prompt, height, width, neg_prompt, frames, scale, steps,
                 seed, rand_seed, progress=gr.Progress(track_tqdm=True)):
    clips, used_seed = generate_video_gpu(input_files, prompt, height, width, neg_prompt,
                                          frames, scale, steps, seed, rand_seed, progress=progress)
    final_video = stitch_videos(clips)
    return final_video, used_seed


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Fast Wan 2.2 - Video Generator")
    with gr.Row():
        with gr.Column(scale=1):
            input_files = gr.File(
                label="Input Images",
                file_count="multiple",
                type="filepath",
                file_types=["image"]
            )
            prompt = gr.Textbox(label="Prompt", value="Cinematic view, realistic lighting, 4k, slow motion", lines=2)
            frames = gr.Slider(
                minimum=MIN_FRAMES_MODEL,
                maximum=MAX_FRAMES_MODEL,
                step=1,
                value=81,
                label="Duration (Frames)",
                info=f"Model maximum: {MAX_FRAMES_MODEL} frames"
            )
            with gr.Accordion("Advanced Settings", open=False):
                neg_prompt = gr.Textbox(label="Negative Prompt", value="low quality, distortion, text, watermark, blurry, ugly", lines=2)
                seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42)
                rand_seed = gr.Checkbox(label="Randomize Seed", value=True)
                with gr.Row():
                    height = gr.Slider(minimum=256, maximum=1024, step=32, value=832, label="Height")
                    width = gr.Slider(minimum=256, maximum=1024, step=32, value=832, label="Width")
                steps = gr.Slider(minimum=2, maximum=10, step=1, value=4, label="Steps")
                scale = gr.Slider(minimum=1.0, maximum=8.0, step=0.1, value=5.0, label="Guidance Scale")
            btn_gen = gr.Button("Generate", variant="primary", size="lg")
        with gr.Column(scale=2):
            output_video = gr.Video(label="Final Result", autoplay=True)
            output_seed = gr.Number(label="Seed Used")

    btn_gen.click(
        fn=main_process,
        inputs=[input_files, prompt, height, width, neg_prompt, frames, scale, steps, seed, rand_seed],
        outputs=[output_video, output_seed]
    )

if __name__ == "__main__":
    demo.queue().launch()
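# Minimal local smoke test (a sketch, not part of the Space itself): it assumes a
# CUDA GPU and that the weights above can be downloaded. Outside Hugging Face
# Spaces the spaces.GPU decorator is a no-op, so the helpers can be called directly:
#
#   clips, used_seed = generate_video_gpu(
#       None, "A red fox running through snow", 832, 832,
#       "low quality, blurry", MIN_FRAMES_MODEL, 5.0, 4, 42, False,
#   )
#   print(stitch_videos(clips), "seed:", used_seed)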