"""Gradio Space: Wan2.1 image-to-video with LLM motion-prompt expansion.

Flow: the user uploads an image and a short motion description; a Mistral
LLM expands the description into a detailed motion prompt; the Wan2.1 I2V
diffusers pipeline then animates the image into a short 480P MP4.

Module-level side effects (intentional for a Space): the LLM client and the
full Wan pipeline are constructed at import time, and ``demo.launch()`` runs
at the bottom of the file.
"""

import os
import random
import tempfile

import gradio as gr
import spaces
import torch
from PIL import Image
from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
from diffusers.utils import export_to_video
from huggingface_hub import InferenceClient
from transformers import CLIPVisionModel

# ── Config ────────────────────────────────────────────────────────────────────
HF_TOKEN = os.environ.get("HF_TOKEN", None)  # optional; gated-model / API auth
MODEL_REPO = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"

# ── Prompt expansion LLM ──────────────────────────────────────────────────────
llm_client = InferenceClient(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    token=HF_TOKEN,
)

# System prompt for the expansion LLM. Content preserved from the original;
# line breaks reconstructed (the source file arrived with newlines collapsed).
VIDEO_SYSTEM = """You are an expert at writing motion prompts for AI video generation using Wan I2V.
Your job: take a short description of desired motion/animation and expand it into a detailed video motion prompt.
Rules:
- Focus on MOTION — what moves, how it moves, camera movement
- Be specific: "hair gently blowing in breeze", "camera slowly pulls back", "eyes blink naturally"
- Keep subjects consistent with what is already in the image
- Describe lighting changes if relevant e.g. "light flickers softly"
- Do NOT describe the static image content — only the motion
- Return ONLY the prompt, no explanation, no preamble
- Keep under 80 words"""


def expand_video_prompt(raw_prompt):
    """Expand a short motion description into a detailed Wan I2V motion prompt.

    Args:
        raw_prompt: free-form user text describing the desired motion.

    Returns:
        The LLM-expanded prompt (surrounding quotes stripped). Falls back to a
        generic motion prompt when the input is empty/whitespace, and to the
        raw input when the LLM call fails — prompt expansion is best-effort
        and must never block video generation.
    """
    if not raw_prompt.strip():
        return "subtle natural movement, gentle camera drift, cinematic atmosphere"
    try:
        response = llm_client.chat_completion(
            messages=[
                {"role": "system", "content": VIDEO_SYSTEM},
                {"role": "user", "content": f"Expand this motion description:\n{raw_prompt.strip()}"},
            ],
            max_tokens=150,
            temperature=0.6,
        )
        # LLMs sometimes wrap the answer in quotes despite instructions.
        return response.choices[0].message.content.strip().strip('"').strip("'")
    except Exception as e:
        # Deliberate broad catch: any remote/API failure degrades gracefully
        # to the user's raw text instead of surfacing an error.
        print(f"LLM expansion failed, using raw prompt: {e}")
        return raw_prompt.strip()


# ── Load pipeline ─────────────────────────────────────────────────────────────
print("Loading Wan2.1 I2V pipeline...")
# Image encoder and VAE stay in float32; the main transformer runs in bf16.
image_encoder = CLIPVisionModel.from_pretrained(
    MODEL_REPO,
    subfolder="image_encoder",
    torch_dtype=torch.float32,
)
vae = AutoencoderKLWan.from_pretrained(
    MODEL_REPO,
    subfolder="vae",
    torch_dtype=torch.float32,
)
pipe = WanImageToVideoPipeline.from_pretrained(
    MODEL_REPO,
    vae=vae,
    image_encoder=image_encoder,
    torch_dtype=torch.bfloat16,
)
# Offload idle submodules to CPU so the 14B model fits in ZeroGPU memory.
pipe.enable_model_cpu_offload()
print("Pipeline ready.")

# ── Negative prompt ───────────────────────────────────────────────────────────
VIDEO_NEG = (
    "static, no movement, blurry, low quality, worst quality, "
    "inconsistent motion, flickering, jitter, artifacts, "
    "watermark, text, deformed"
)


# ── Generation ────────────────────────────────────────────────────────────────
@spaces.GPU(duration=300)
def generate_video(input_image, motion_prompt, num_frames, guidance, seed, randomize):
    """Animate an uploaded image into a short video.

    Args:
        input_image: HxWxC numpy array from the gr.Image component (or None).
        motion_prompt: user's short motion description (expanded via LLM).
        num_frames: frame count from the slider (17–81, step 16 — these are
            all of the form 4k+1, which the Wan pipeline expects; TODO confirm
            against the diffusers docs if the slider is ever changed).
        guidance: classifier-free guidance scale.
        seed: RNG seed; ignored when ``randomize`` is set.
        randomize: when True, draw a fresh random 32-bit seed.

    Returns:
        (mp4_path, used_seed, markdown_summary_of_expanded_prompt)

    Raises:
        gr.Error: if no image was uploaded.
    """
    if input_image is None:
        raise gr.Error("Please upload an image first.")

    if randomize:
        seed = random.randint(0, 2**32 - 1)
    seed = int(seed)

    # Expand motion prompt via LLM (best-effort; see expand_video_prompt).
    expanded_motion = expand_video_prompt(motion_prompt)
    print(f"Expanded motion: {expanded_motion}")

    # Resize — Wan I2V works best at 832x480 (or 480x832 for portrait).
    # NOTE(review): this stretches to the target box rather than
    # crop/letterbox, so aspect ratio is not strictly preserved — kept as the
    # original behavior.
    img = Image.fromarray(input_image).convert("RGB")
    orig_w, orig_h = img.size
    aspect = orig_w / orig_h
    if aspect >= 1:
        new_w, new_h = 832, 480
    else:
        new_w, new_h = 480, 832
    # Image.Resampling.LANCZOS replaces the deprecated Image.LANCZOS alias.
    img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)

    # CPU generator keeps seeding reproducible regardless of offload device.
    generator = torch.Generator(device="cpu").manual_seed(seed)

    output = pipe(
        image=img,
        prompt=expanded_motion,
        negative_prompt=VIDEO_NEG,
        height=new_h,
        width=new_w,
        num_frames=int(num_frames),
        guidance_scale=float(guidance),
        num_inference_steps=30,
        generator=generator,
    )
    frames = output.frames[0]

    # Fix: close the temp-file handle before export_to_video writes to the
    # path — the original kept it open (fd leak; fails on Windows, where an
    # open NamedTemporaryFile cannot be reopened by another writer).
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        video_path = tmp.name
    export_to_video(frames, video_path, fps=16)

    return video_path, seed, f"**Motion prompt sent to model:**\n\n{expanded_motion}"


# ── CSS ───────────────────────────────────────────────────────────────────────
css = """
* { box-sizing: border-box; margin: 0; padding: 0; }
body, .gradio-container { background: #07070e !important; font-family: 'Inter', system-ui, sans-serif !important; max-width: 500px !important; margin: 0 auto !important; padding: 8px !important; }
.topbar { display: flex; align-items: center; justify-content: space-between; padding: 10px 2px 14px; }
.topbar-title { color: #e8e0ff; font-size: 0.95em; font-weight: 800; }
.gpu-pill { background: #1aff7a18; border: 1px solid #1aff7a44; color: #1aff7a; font-size: 0.6em; font-weight: 800; padding: 4px 12px; border-radius: 20px; letter-spacing: 1.5px; text-transform: uppercase; }
.upload-area { background: #0d0d1a; border: 2px dashed #1e1e35; border-radius: 18px; overflow: hidden; margin-bottom: 8px; min-height: 260px; display: flex; align-items: center; justify-content: center; }
.video-out { background: #0d0d1a; border: 1px solid #16162a; border-radius: 18px; overflow: hidden; margin-bottom: 8px; min-height: 260px; }
.card { background: #0d0d1a; border: 1px solid #16162a; border-radius: 14px; padding: 14px; margin-bottom: 8px; }
.card-label { color: #3d3060; font-size: 0.62em; font-weight: 800; text-transform: uppercase; letter-spacing: 2px; margin-bottom: 8px; }
textarea { background: transparent !important; border: none !important; color: #c8b8f0 !important; font-size: 15px !important; line-height: 1.6 !important; padding: 0 !important; resize: none !important; box-shadow: none !important; width: 100% !important; outline: none !important; }
textarea::placeholder { color: #252038 !important; }
textarea:focus { outline: none !important; box-shadow: none !important; border: none !important; }
.gradio-accordion { background: #0d0d1a !important; border: 1px solid #16162a !important; border-radius: 14px !important; margin-bottom: 8px !important; overflow: hidden !important; }
.gradio-accordion .label-wrap button { color: #4a3a6a !important; font-size: 0.72em !important; font-weight: 700 !important; text-transform: uppercase !important; letter-spacing: 1.5px !important; padding: 12px 16px !important; }
.gradio-slider { background: transparent !important; border: none !important; padding: 4px 0 10px !important; }
input[type=range] { accent-color: #3366bb !important; width: 100% !important; }
input[type=number] { background: #0a0a14 !important; border: 1px solid #18182a !important; border-radius: 10px !important; color: #7799cc !important; font-size: 13px !important; padding: 8px 10px !important; }
input[type=checkbox] { accent-color: #3366bb !important; }
.gradio-checkbox label span { color: #4a3a6a !important; font-size: 0.75em !important; font-weight: 600 !important; }
label > span:first-child { color: #3a2d55 !important; font-size: 0.7em !important; font-weight: 700 !important; text-transform: uppercase !important; letter-spacing: 1px !important; }
.seed-out input[type=number] { background: transparent !important; border: none !important; color: #2e2848 !important; font-size: 0.7em !important; text-align: center !important; padding: 2px !important; }
.hint-box { background: #0a0a14; border: 1px solid #111122; border-radius: 10px; padding: 10px 14px; color: #443366; font-size: 0.72em; line-height: 1.7; margin-bottom: 8px; word-break: break-word; }
.gen-btn button { background: linear-gradient(135deg, #1a3aaa 0%, #0e1e77 100%) !important; border: 1px solid #2255cc !important; border-radius: 14px !important; color: #fff !important; font-size: 0.88em !important; font-weight: 900 !important; padding: 17px !important; width: 100% !important; letter-spacing: 2px !important; text-transform: uppercase !important; box-shadow: 0 4px 24px #1a3aaa55 !important; transition: all 0.15s ease !important; margin-top: 6px !important; }
.gen-btn button:hover { box-shadow: 0 6px 32px #1a3aaa99 !important; transform: translateY(-1px) !important; }
.gen-btn button:active { transform: scale(0.98) !important; box-shadow: 0 2px 12px #1a3aaa33 !important; }
footer, .built-with { display: none !important; }
"""

# ── UI ────────────────────────────────────────────────────────────────────────
# NOTE(review): the original gr.HTML tag markup was stripped during file
# extraction; the <div> structure below is reconstructed from the surviving
# text content and the CSS class names defined above — confirm against the
# deployed Space if exact markup matters.
with gr.Blocks(css=css, title="VideoGen") as demo:
    gr.HTML(
        '<div class="topbar">'
        '<div class="topbar-title">🎬 Wan I2V — Image to Video</div>'
        '<div class="gpu-pill">⚡ ZeroGPU</div>'
        "</div>"
    )
    gr.HTML(
        '<div class="hint-box">'
        "Upload any image → describe the motion → get a ~3–5 second 480P video."
        "<br><br>"
        "Motion tips: describe what moves, not what's in the image.<br>"
        'e.g. "hair gently blowing, eyes blink, camera slowly pulls back"'
        "</div>"
    )

    input_image = gr.Image(
        label="Input Image",
        type="numpy",
        height=300,
        elem_classes="upload-area",
    )

    gr.HTML('<div class="card-label">✦ Motion — what should move?</div>')
    motion_prompt = gr.Textbox(
        show_label=False,
        placeholder="hair gently blowing, eyes blinking slowly, soft light shimmer...",
        lines=2,
    )
    # NOTE(review): this gr.HTML originally carried markup that was stripped
    # (likely a closing tag / spacer); left empty.
    gr.HTML("")

    generate_btn = gr.Button(
        "Generate Video ✦",
        variant="primary",
        size="lg",
        elem_classes="gen-btn",
    )

    output_video = gr.Video(
        label="Generated Video",
        elem_classes="video-out",
        height=300,
    )
    used_seed = gr.Number(
        label="seed",
        interactive=False,
        elem_classes="seed-out",
    )
    expanded_out = gr.Markdown(
        value="",
        elem_classes="hint-box",
    )

    with gr.Accordion("⚙️ Settings", open=False):
        # NOTE(review): stripped markup here as well; left empty.
        gr.HTML("")
        num_frames = gr.Slider(
            minimum=17,
            maximum=81,
            value=49,
            step=16,
            label="Frames — 17≈1s 49≈3s 81≈5s (at 16fps)",
        )
        guidance = gr.Slider(
            minimum=1.0,
            maximum=10.0,
            value=5.0,
            step=0.5,
            label="Guidance Scale",
        )
        with gr.Row():
            seed = gr.Number(
                label="Seed",
                value=42,
                precision=0,
                minimum=0,
                maximum=2**32 - 1,
                scale=3,
            )
            randomize = gr.Checkbox(
                label="Random seed",
                value=True,
                scale=1,
            )

    generate_btn.click(
        fn=generate_video,
        inputs=[
            input_image,
            motion_prompt,
            num_frames,
            guidance,
            seed,
            randomize,
        ],
        outputs=[output_video, used_seed, expanded_out],
    )

demo.launch()