VideoGen / app.py
tomiconic's picture
Update app.py
83f4a0d verified
import gradio as gr
import torch
import spaces
import os
import tempfile
import random
from PIL import Image
from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
from diffusers.utils import export_to_video
from transformers import CLIPVisionModel
from huggingface_hub import InferenceClient
# ── Config ────────────────────────────────────────────────────────────────────
# Hugging Face access token taken from the environment; None when unset.
HF_TOKEN = os.getenv("HF_TOKEN")
# Hub repository that the image-to-video pipeline weights are loaded from.
MODEL_REPO = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
# ── Prompt expansion LLM ──────────────────────────────────────────────────────
# Inference client for the instruction-tuned model that rewrites short motion
# descriptions; it authenticates with HF_TOKEN (which may be None).
llm_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HF_TOKEN)
# System prompt for the prompt-expansion LLM. It is sent verbatim as the
# "system" message, so the text must stay clean UTF-8: the mis-encoded em
# dashes that had crept into the rules have been restored.
VIDEO_SYSTEM = """You are an expert at writing motion prompts for AI video generation using Wan I2V.
Your job: take a short description of desired motion/animation and expand it into a detailed video motion prompt.
Rules:
- Focus on MOTION — what moves, how it moves, camera movement
- Be specific: "hair gently blowing in breeze", "camera slowly pulls back", "eyes blink naturally"
- Keep subjects consistent with what is already in the image
- Describe lighting changes if relevant e.g. "light flickers softly"
- Do NOT describe the static image content — only the motion
- Return ONLY the prompt, no explanation, no preamble
- Keep under 80 words"""
def expand_video_prompt(raw_prompt):
    """Expand a short motion description into a detailed video motion prompt.

    Args:
        raw_prompt: The user's motion description. ``None``, an empty string
            or a whitespace-only string are all treated as "no description".

    Returns:
        The LLM-expanded prompt with surrounding quotes removed. When no
        description was given, a generic default motion prompt is returned.
        When the LLM call fails or yields an empty reply, the stripped user
        prompt is returned instead.
    """
    # Normalise first so a None prompt cannot crash on .strip().
    cleaned = (raw_prompt or "").strip()
    if not cleaned:
        return "subtle natural movement, gentle camera drift, cinematic atmosphere"

    try:
        response = llm_client.chat_completion(
            messages=[
                {"role": "system", "content": VIDEO_SYSTEM},
                {"role": "user", "content": f"Expand this motion description:\n{cleaned}"},
            ],
            max_tokens=150,
            temperature=0.6,
        )
        content = response.choices[0].message.content or ""
        # Models often wrap the answer in quotes; drop them and any padding.
        expanded = content.strip().strip('"').strip("'").strip()
    except Exception as e:
        # Deliberate best-effort: expansion is optional, so any failure falls
        # back to the user's own text rather than aborting generation.
        print(f"LLM expansion failed, using raw prompt: {e}")
        return cleaned

    # An empty reply would give the video model nothing to work with.
    return expanded or cleaned
# ── Load pipeline ─────────────────────────────────────────────────────────────
print("Loading Wan2.1 I2V pipeline...")

# The image encoder and the VAE are loaded on their own in float32 and then
# handed to the pipeline, which is itself requested in bfloat16.
image_encoder = CLIPVisionModel.from_pretrained(
    MODEL_REPO, subfolder="image_encoder", torch_dtype=torch.float32
)
vae = AutoencoderKLWan.from_pretrained(
    MODEL_REPO, subfolder="vae", torch_dtype=torch.float32
)
pipe = WanImageToVideoPipeline.from_pretrained(
    MODEL_REPO, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16
)

# Enable diffusers' model CPU offload on the assembled pipeline.
pipe.enable_model_cpu_offload()

print("Pipeline ready.")
# ── Negative prompt ───────────────────────────────────────────────────────────
# Comma-separated list of traits passed as the negative prompt on every
# generation.
VIDEO_NEG = ", ".join(
    [
        "static",
        "no movement",
        "blurry",
        "low quality",
        "worst quality",
        "inconsistent motion",
        "flickering",
        "jitter",
        "artifacts",
        "watermark",
        "text",
        "deformed",
    ]
)
# ── Generation ────────────────────────────────────────────────────────────────
@spaces.GPU(duration=300)
def generate_video(input_image, motion_prompt, num_frames, guidance, seed, randomize):
    """Animate a still image into a short video clip.

    Args:
        input_image: Image as an array accepted by ``Image.fromarray``;
            ``None`` means nothing was uploaded.
        motion_prompt: Short description of the desired motion; it is expanded
            by ``expand_video_prompt`` before being sent to the model.
        num_frames: Number of frames to generate (cast to ``int``).
        guidance: Guidance scale (cast to ``float``).
        seed: Seed to use when ``randomize`` is false. ``None`` is treated as
            a request for a random seed.
        randomize: When true, ignore ``seed`` and draw a fresh random one.

    Returns:
        A ``(video_path, seed, markdown)`` tuple: the path of the exported
        ``.mp4``, the seed actually used, and a Markdown note showing the
        motion prompt that was sent to the model.

    Raises:
        gr.Error: If no input image was provided.
    """
    if input_image is None:
        raise gr.Error("Please upload an image first.")

    # A missing seed (presumably a cleared number field) would make int(seed)
    # raise, so fall back to a random seed in that case as well.
    if randomize or seed is None:
        seed = random.randint(0, 2**32 - 1)
    seed = int(seed)

    # Expand motion prompt via LLM
    expanded_motion = expand_video_prompt(motion_prompt)
    print(f"Expanded motion: {expanded_motion}")

    # Resize — Wan I2V works best at 832x480 (landscape) or 480x832 (portrait)
    img = Image.fromarray(input_image).convert("RGB")
    orig_w, orig_h = img.size
    new_w, new_h = (832, 480) if orig_w >= orig_h else (480, 832)
    img = img.resize((new_w, new_h), Image.LANCZOS)

    # Seeded generator so the same seed and inputs can be replayed.
    generator = torch.Generator(device="cpu").manual_seed(seed)
    output = pipe(
        image=img,
        prompt=expanded_motion,
        negative_prompt=VIDEO_NEG,
        height=new_h,
        width=new_w,
        num_frames=int(num_frames),
        guidance_scale=float(guidance),
        num_inference_steps=30,
        generator=generator,
    )
    frames = output.frames[0]

    # Only a unique path is needed here: close the handle right away so the
    # file descriptor is not leaked, then let export_to_video write the file.
    # delete=False keeps the file on disk after the handle is closed.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        video_path = tmp.name
    export_to_video(frames, video_path, fps=16)

    return video_path, seed, f"**Motion prompt sent to model:**\n\n{expanded_motion}"
# ── CSS ───────────────────────────────────────────────────────────────────────
# Custom stylesheet handed to gr.Blocks(css=...). It defines a dark, narrow
# (max-width 500px) layout and styles both the raw HTML snippets
# (.topbar, .gpu-pill, .card, .card-label, .hint-box) and the components that
# opt in through elem_classes (.upload-area, .video-out, .seed-out, .gen-btn).
# The last rule hides the page footer.
# NOTE: everything between the triple quotes is delivered to the browser
# verbatim, so notes about the rules belong here rather than inside the string.
css = """
* { box-sizing: border-box; margin: 0; padding: 0; }
body, .gradio-container {
background: #07070e !important;
font-family: 'Inter', system-ui, sans-serif !important;
max-width: 500px !important;
margin: 0 auto !important;
padding: 8px !important;
}
.topbar {
display: flex;
align-items: center;
justify-content: space-between;
padding: 10px 2px 14px;
}
.topbar-title {
color: #e8e0ff;
font-size: 0.95em;
font-weight: 800;
}
.gpu-pill {
background: #1aff7a18;
border: 1px solid #1aff7a44;
color: #1aff7a;
font-size: 0.6em;
font-weight: 800;
padding: 4px 12px;
border-radius: 20px;
letter-spacing: 1.5px;
text-transform: uppercase;
}
.upload-area {
background: #0d0d1a;
border: 2px dashed #1e1e35;
border-radius: 18px;
overflow: hidden;
margin-bottom: 8px;
min-height: 260px;
display: flex;
align-items: center;
justify-content: center;
}
.video-out {
background: #0d0d1a;
border: 1px solid #16162a;
border-radius: 18px;
overflow: hidden;
margin-bottom: 8px;
min-height: 260px;
}
.card {
background: #0d0d1a;
border: 1px solid #16162a;
border-radius: 14px;
padding: 14px;
margin-bottom: 8px;
}
.card-label {
color: #3d3060;
font-size: 0.62em;
font-weight: 800;
text-transform: uppercase;
letter-spacing: 2px;
margin-bottom: 8px;
}
textarea {
background: transparent !important;
border: none !important;
color: #c8b8f0 !important;
font-size: 15px !important;
line-height: 1.6 !important;
padding: 0 !important;
resize: none !important;
box-shadow: none !important;
width: 100% !important;
outline: none !important;
}
textarea::placeholder { color: #252038 !important; }
textarea:focus {
outline: none !important;
box-shadow: none !important;
border: none !important;
}
.gradio-accordion {
background: #0d0d1a !important;
border: 1px solid #16162a !important;
border-radius: 14px !important;
margin-bottom: 8px !important;
overflow: hidden !important;
}
.gradio-accordion .label-wrap button {
color: #4a3a6a !important;
font-size: 0.72em !important;
font-weight: 700 !important;
text-transform: uppercase !important;
letter-spacing: 1.5px !important;
padding: 12px 16px !important;
}
.gradio-slider {
background: transparent !important;
border: none !important;
padding: 4px 0 10px !important;
}
input[type=range] {
accent-color: #3366bb !important;
width: 100% !important;
}
input[type=number] {
background: #0a0a14 !important;
border: 1px solid #18182a !important;
border-radius: 10px !important;
color: #7799cc !important;
font-size: 13px !important;
padding: 8px 10px !important;
}
input[type=checkbox] { accent-color: #3366bb !important; }
.gradio-checkbox label span {
color: #4a3a6a !important;
font-size: 0.75em !important;
font-weight: 600 !important;
}
label > span:first-child {
color: #3a2d55 !important;
font-size: 0.7em !important;
font-weight: 700 !important;
text-transform: uppercase !important;
letter-spacing: 1px !important;
}
.seed-out input[type=number] {
background: transparent !important;
border: none !important;
color: #2e2848 !important;
font-size: 0.7em !important;
text-align: center !important;
padding: 2px !important;
}
.hint-box {
background: #0a0a14;
border: 1px solid #111122;
border-radius: 10px;
padding: 10px 14px;
color: #443366;
font-size: 0.72em;
line-height: 1.7;
margin-bottom: 8px;
word-break: break-word;
}
.gen-btn button {
background: linear-gradient(135deg, #1a3aaa 0%, #0e1e77 100%) !important;
border: 1px solid #2255cc !important;
border-radius: 14px !important;
color: #fff !important;
font-size: 0.88em !important;
font-weight: 900 !important;
padding: 17px !important;
width: 100% !important;
letter-spacing: 2px !important;
text-transform: uppercase !important;
box-shadow: 0 4px 24px #1a3aaa55 !important;
transition: all 0.15s ease !important;
margin-top: 6px !important;
}
.gen-btn button:hover {
box-shadow: 0 6px 32px #1a3aaa99 !important;
transform: translateY(-1px) !important;
}
.gen-btn button:active {
transform: scale(0.98) !important;
box-shadow: 0 2px 12px #1a3aaa33 !important;
}
footer, .built-with { display: none !important; }
"""
# ── UI ────────────────────────────────────────────────────────────────────────
# Single-column layout: header, usage hint, image upload, motion prompt,
# generate button, outputs, then a collapsed settings accordion.
with gr.Blocks(css=css, title="VideoGen") as demo:
    gr.HTML("""
    <div class="topbar">
    <span class="topbar-title">🎬 Wan I2V — Image to Video</span>
    <span class="gpu-pill">⚑ ZeroGPU</span>
    </div>
    """)
    gr.HTML("""
    <div class="hint-box">
    Upload any image → describe the motion → get a ~3–5 second 480P video.<br><br>
    <strong>Motion tips:</strong> describe what moves, not what's in the image.<br>
    e.g. <em>"hair gently blowing, eyes blink, camera slowly pulls back"</em>
    </div>
    """)

    # Delivered to generate_video as a numpy array (type="numpy").
    input_image = gr.Image(
        label="Input Image",
        type="numpy",
        height=300,
        elem_classes="upload-area",
    )

    # Every gr.HTML is rendered as its own component, so a <div> opened in one
    # and closed in another never wraps the widgets in between. A layout
    # container carrying the "card" class wraps the label and textbox instead.
    with gr.Column(elem_classes="card"):
        gr.HTML('<div class="card-label">✦ Motion — what should move?</div>')
        motion_prompt = gr.Textbox(
            show_label=False,
            placeholder="hair gently blowing, eyes blinking slowly, soft light shimmer...",
            lines=2,
        )

    generate_btn = gr.Button(
        "Generate Video ✦",
        variant="primary",
        size="lg",
        elem_classes="gen-btn",
    )

    output_video = gr.Video(
        label="Generated Video",
        elem_classes="video-out",
        height=300,
    )
    # Read-only echo of the seed that was actually used for the last run.
    used_seed = gr.Number(
        label="seed",
        interactive=False,
        elem_classes="seed-out",
    )
    # Shows the expanded motion prompt returned by generate_video.
    expanded_out = gr.Markdown(
        value="",
        elem_classes="hint-box",
    )

    with gr.Accordion("⚙️ Settings", open=False):
        gr.HTML('<div style="height:6px"></div>')
        # Slider stops are 17, 33, 49, 65 and 81 frames.
        num_frames = gr.Slider(
            minimum=17,
            maximum=81,
            value=49,
            step=16,
            label="Frames — 17≈1s 49≈3s 81≈5s (at 16fps)",
        )
        guidance = gr.Slider(
            minimum=1.0,
            maximum=10.0,
            value=5.0,
            step=0.5,
            label="Guidance Scale",
        )
        with gr.Row():
            seed = gr.Number(
                label="Seed",
                value=42,
                precision=0,
                minimum=0,
                maximum=2**32 - 1,
                scale=3,
            )
            randomize = gr.Checkbox(
                label="Random seed",
                value=True,
                scale=1,
            )

    # Input order must match generate_video's positional parameters, and the
    # outputs must match the order of its returned tuple.
    generate_btn.click(
        fn=generate_video,
        inputs=[
            input_image,
            motion_prompt,
            num_frames,
            guidance,
            seed,
            randomize,
        ],
        outputs=[output_video, used_seed, expanded_out],
    )

demo.launch()