"""CPU-only text-to-video Gradio demo.

Generates a short 256x256 GIF from a text prompt using the ModelScope
text-to-video pipeline (damo-vilab/text-to-video-ms-1.7b) running entirely
on CPU in float32.
"""

import time
import warnings

import gradio as gr
import numpy as np
import torch
from diffusers import DiffusionPipeline
from PIL import Image

warnings.filterwarnings("ignore")

# Force CPU execution; float32 is required because CPU inference has no
# reliable half-precision support.
TORCH_DEVICE = "cpu"
TORCH_DTYPE = torch.float32

# Lazily-initialized pipeline singleton: the (slow) model load happens once,
# on the first generation request, not at import time.
_PIPE = None


def load_model():
    """Load and return the text-to-video pipeline configured for CPU use."""
    model_id = "damo-vilab/text-to-video-ms-1.7b"
    pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=TORCH_DTYPE)
    pipe = pipe.to(TORCH_DEVICE)
    # Trades a little speed for a smaller peak-memory footprint -- important
    # on modest CPU hosts.
    pipe.enable_attention_slicing()
    return pipe


def _to_pil_frames(video_frames):
    """Normalize pipeline output frames to a list of PIL images.

    Different diffusers versions return different shapes from
    ``pipeline_output.frames``:

    * ``np.ndarray`` of shape ``(frames, H, W, 3)`` or
      ``(batch, frames, H, W, 3)``;
    * a (possibly nested, per-batch) list of ``np.ndarray`` frames or
      ``PIL.Image`` frames.

    Raises:
        ValueError: if no frames could be extracted.
    """
    # Drop the batch dimension: (1, frames, H, W, 3) -> (frames, H, W, 3).
    if isinstance(video_frames, np.ndarray) and video_frames.ndim == 5:
        video_frames = video_frames[0]
    # Unwrap a per-batch nesting: [[frame, frame, ...]] -> [frame, ...].
    if (
        isinstance(video_frames, (list, tuple))
        and len(video_frames) == 1
        and isinstance(video_frames[0], (list, tuple))
    ):
        video_frames = video_frames[0]

    frames = []
    for frame in video_frames:
        if isinstance(frame, Image.Image):
            frames.append(frame)
            continue
        frame = np.asarray(frame)
        if frame.dtype != np.uint8:
            # Float frames are nominally in [0, 1]; clip before the cast so
            # slight numerical over/undershoot cannot wrap around in uint8.
            frame = (np.clip(frame, 0.0, 1.0) * 255).round().astype(np.uint8)
        frames.append(Image.fromarray(frame))

    if not frames:
        raise ValueError("Unexpected frame format: pipeline returned no frames")
    return frames


def generate_video(prompt, num_frames=8, num_inference_steps=20):
    """Generate an animated GIF from ``prompt`` and return its file path.

    Args:
        prompt: Text description of the desired clip.
        num_frames: Requested frame count (hard-capped at 8 for CPU speed).
        num_inference_steps: Requested denoising steps (hard-capped at 20).

    Returns:
        Path of the written GIF file ("output.gif").
    """
    global _PIPE
    start_time = time.time()
    if _PIPE is None:
        _PIPE = load_model()

    with torch.no_grad():
        output = _PIPE(
            prompt,
            # Caps keep CPU generation time within minutes, not hours.
            num_frames=min(num_frames, 8),
            num_inference_steps=min(num_inference_steps, 20),
            height=256,
            width=256,
        )

    frames = _to_pil_frames(output.frames)

    gif_path = "output.gif"
    frames[0].save(
        gif_path,
        save_all=True,
        append_images=frames[1:],
        duration=100,  # ms per frame -> 10 fps
        loop=0,  # loop forever
    )

    print(f"Generation took {time.time() - start_time:.2f} seconds")
    return gif_path


# Gradio interface -----------------------------------------------------------
with gr.Blocks(title="CPU Text-to-Video") as demo:
    gr.Markdown("# 🐢 CPU Text-to-Video Generator")
    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(label="Prompt")
            with gr.Accordion("Advanced Options", open=False):
                frames = gr.Slider(4, 12, value=8, step=4, label="Frames")
                steps = gr.Slider(10, 30, value=20, step=5, label="Steps")
            submit = gr.Button("Generate")
        with gr.Column():
            output = gr.Image(label="Result", format="gif")
            gr.Markdown("Note: CPU generation may take several minutes")

    submit.click(
        fn=generate_video,
        inputs=[prompt, frames, steps],
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()