import os
import tempfile
from typing import List

import gradio as gr
import torch
from PIL import Image

from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import export_to_video

MODEL_ID_DEFAULT = os.getenv("MODEL_ID", "stabilityai/stable-video-diffusion-img2vid")
DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

# Lazily loaded pipeline cache, keyed by the repo id it was loaded from
pipe = None
loaded_model_id = None


def load_pipeline(model_id: str = MODEL_ID_DEFAULT):
    global pipe, loaded_model_id
    # Reload when the user enters a different repo id in the UI
    if pipe is not None and loaded_model_id == model_id:
        return pipe

    kwargs = {
        "torch_dtype": DTYPE,
    }
    # fp16 variant helps on GPU Spaces (smaller download, less VRAM)
    if DTYPE == torch.float16:
        kwargs["variant"] = "fp16"

    pipe_local = StableVideoDiffusionPipeline.from_pretrained(
        model_id,
        **kwargs,
    )

    # memory & speed tweaks
    if torch.cuda.is_available():
        pipe_local.enable_model_cpu_offload()  # good default for Spaces GPUs
    # On CPU-only machines there is no accelerator to offload to, so the
    # pipeline simply stays on the CPU.

    # Not every Img2Vid pipeline exposes VAE slicing; guard so alternative
    # repo ids entered in the UI don't crash on load.
    if hasattr(pipe_local, "enable_vae_slicing"):
        pipe_local.enable_vae_slicing()
    pipe_local.enable_attention_slicing()

    pipe = pipe_local
    loaded_model_id = model_id
    return pipe


def _ensure_rgb(img: Image.Image) -> Image.Image:
    if img.mode != "RGB":
        return img.convert("RGB")
    return img


def generate(
    image: Image.Image,
    num_frames: int = 14,
    fps: int = 8,
    motion_bucket_id: int = 127,
    noise_aug_strength: float = 0.02,
    seed: int = 0,
    decode_chunk_size: int = 8,
    model_id: str = MODEL_ID_DEFAULT,
):
    if image is None:
        raise gr.Error("Please upload an image.")

    pipe = load_pipeline(model_id)

    # Determinism: the UI advertises "0 for random", so treat seed <= 0 as random
    generator = torch.Generator(device="cuda" if torch.cuda.is_available() else "cpu")
    if seed is None or int(seed) <= 0:
        seed = torch.seed() % (2**31)
    generator = generator.manual_seed(int(seed))

    image = _ensure_rgb(image)

    with torch.inference_mode():
        result = pipe(
            image=image,
            num_frames=int(num_frames),
            fps=int(fps),
            motion_bucket_id=int(motion_bucket_id),
            noise_aug_strength=float(noise_aug_strength),
            decode_chunk_size=int(decode_chunk_size),
            generator=generator,
        )
    frames: List[Image.Image] = result.frames[0]

    # Save to a temp .mp4
    tmpdir = tempfile.mkdtemp()
    out_path = os.path.join(tmpdir, "output.mp4")
    export_to_video(frames, out_path, fps=int(fps))
    return out_path
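
# --- Quick local check (a sketch; assumes a test image at "test.jpg", a
# hypothetical path not part of this app) ---
# This bypasses the UI and exercises `generate` directly, e.g. in a REPL:
#
#   from PIL import Image
#   clip_path = generate(Image.open("test.jpg"), num_frames=14, fps=8, seed=42)
#   print(clip_path)  # path to the exported .mp4
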
def build_demo():
    with gr.Blocks(theme=gr.themes.Soft(), fill_width=True) as demo:
        gr.Markdown(
            """
            # Image → Video (Stable Video Diffusion)
            Animate a still image with pretrained **Stable Video Diffusion (Img2Vid)** checkpoints from the Hugging Face Hub.
            - Default model: `stabilityai/stable-video-diffusion-img2vid`
            - Try alternative ids such as `stabilityai/stable-video-diffusion-img2vid-xt`
            """
        )
        with gr.Row():
            with gr.Column(scale=1):
                inp_img = gr.Image(type="pil", label="Input image", width=512)
                model_id = gr.Textbox(
                    value=MODEL_ID_DEFAULT,
                    label="Model repo id",
                    info="Any compatible Img2Vid pipeline on the Hub",
                )
                with gr.Accordion("Advanced", open=False):
                    num_frames = gr.Slider(8, 25, value=14, step=1, label="Frames")
                    fps = gr.Slider(4, 30, value=8, step=1, label="FPS")
                    motion_bucket_id = gr.Slider(1, 255, value=127, step=1, label="Motion bucket id")
                    noise_aug_strength = gr.Slider(0.0, 0.5, value=0.02, step=0.01, label="Noise aug strength")
                    decode_chunk_size = gr.Slider(1, 32, value=8, step=1, label="Decode chunk size")
                    seed = gr.Number(value=0, precision=0, label="Seed (0 for random)")
                run = gr.Button("Generate", variant="primary")
            with gr.Column(scale=1):
                out_vid = gr.Video(label="Output video (.mp4)")

        run.click(
            fn=generate,
            inputs=[
                inp_img,
                num_frames,
                fps,
                motion_bucket_id,
                noise_aug_strength,
                seed,
                decode_chunk_size,
                model_id,
            ],
            outputs=[out_vid],
            queue=True,
            api_name="predict",
        )

        gr.Examples(
            examples=[
                [
                    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/sketch-mountains-input.jpg",
                    14, 8, 127, 0.02, 0, 8, MODEL_ID_DEFAULT,
                ],
            ],
            inputs=[inp_img, num_frames, fps, motion_bucket_id, noise_aug_strength, seed, decode_chunk_size, model_id],
            label="Try an example (downloads on click)",
        )
    return demo


demo = build_demo()

if __name__ == "__main__":
    demo.queue().launch()
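
# --- Remote usage sketch (assumptions: the app is running on Gradio's default
# local port 7860 and the `gradio_client` package is installed) ---
# Because the click handler above registers `api_name="predict"`, `generate`
# is exposed at /predict; a client call mirrors the `inputs` list order:
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860/")
#   video = client.predict(
#       handle_file("input.jpg"),  # inp_img (hypothetical local file)
#       14,     # num_frames
#       8,      # fps
#       127,    # motion_bucket_id
#       0.02,   # noise_aug_strength
#       0,      # seed (0 -> random)
#       8,      # decode_chunk_size
#       "stabilityai/stable-video-diffusion-img2vid",  # model_id
#       api_name="/predict",
#   )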