import gradio as gr
import torch
import spaces
import os
import tempfile
import random
from PIL import Image
from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
from diffusers.utils import export_to_video
from transformers import CLIPVisionModel
from huggingface_hub import InferenceClient

# ── Config ────────────────────────────────────────────────────────────────────
HF_TOKEN   = os.environ.get("HF_TOKEN", None)
MODEL_REPO = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"

# ── Prompt expansion LLM ──────────────────────────────────────────────────────
llm_client = InferenceClient(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    token=HF_TOKEN,
)
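# Note: Mistral-7B-Instruct is gated on the Hub, so this call may need a valid
# HF_TOKEN; expand_video_prompt() below falls back to the raw prompt on any failure.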

VIDEO_SYSTEM = """You are an expert at writing motion prompts for AI video generation using Wan I2V.

Your job: take a short description of desired motion/animation and expand it into a detailed video motion prompt.

Rules:
- Focus on MOTION: what moves, how it moves, camera movement
- Be specific: "hair gently blowing in breeze", "camera slowly pulls back", "eyes blink naturally"
- Keep subjects consistent with what is already in the image
- Describe lighting changes if relevant e.g. "light flickers softly"
- Do NOT describe the static image content, only the motion
- Return ONLY the prompt, no explanation, no preamble
- Keep under 80 words"""

def expand_video_prompt(raw_prompt):
    if not raw_prompt.strip():
        return "subtle natural movement, gentle camera drift, cinematic atmosphere"
    try:
        response = llm_client.chat_completion(
            messages=[
                {"role": "system", "content": VIDEO_SYSTEM},
                {"role": "user",   "content": f"Expand this motion description:\n{raw_prompt.strip()}"},
            ],
            max_tokens=150,
            temperature=0.6,
        )
        return response.choices[0].message.content.strip().strip('"').strip("'")
    except Exception as e:
        print(f"LLM expansion failed, using raw prompt: {e}")
        return raw_prompt.strip()

# ── Load pipeline ─────────────────────────────────────────────────────────────
print("Loading Wan2.1 I2V pipeline...")

image_encoder = CLIPVisionModel.from_pretrained(
    MODEL_REPO,
    subfolder="image_encoder",
    torch_dtype=torch.float32,
)

vae = AutoencoderKLWan.from_pretrained(
    MODEL_REPO,
    subfolder="vae",
    torch_dtype=torch.float32,
)

pipe = WanImageToVideoPipeline.from_pretrained(
    MODEL_REPO,
    vae=vae,
    image_encoder=image_encoder,
    torch_dtype=torch.bfloat16,
)

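# Model CPU offload moves each pipeline component to the GPU only while it runs,
# which should keep the 14B model within a single ZeroGPU slice, at some speed cost.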
pipe.enable_model_cpu_offload()
print("Pipeline ready.")

# ── Negative prompt ───────────────────────────────────────────────────────────
VIDEO_NEG = (
    "static, no movement, blurry, low quality, worst quality, "
    "inconsistent motion, flickering, jitter, artifacts, "
    "watermark, text, deformed"
)

# ── Generation ────────────────────────────────────────────────────────────────
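# duration=300 asks ZeroGPU to allow up to ~5 minutes per call; long clips
# (81 frames at 30 steps) may need most of that budget.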
@spaces.GPU(duration=300)
def generate_video(input_image, motion_prompt, num_frames, guidance, seed, randomize):

    if input_image is None:
        raise gr.Error("Please upload an image first.")

    if randomize:
        seed = random.randint(0, 2**32 - 1)
    seed = int(seed)

    # Expand motion prompt via LLM
    expanded_motion = expand_video_prompt(motion_prompt)
    print(f"Expanded motion: {expanded_motion}")

    # Resize: Wan I2V 480P works best at 832x480 (landscape) or 480x832 (portrait)
    img = Image.fromarray(input_image).convert("RGB")
    orig_w, orig_h = img.size
    aspect = orig_w / orig_h
    if aspect >= 1:
        new_w, new_h = 832, 480
    else:
        new_w, new_h = 480, 832
    img = img.resize((new_w, new_h), Image.LANCZOS)

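    # A CPU generator keeps seeding reproducible regardless of where the offloaded
    # weights currently live; diffusers accepts it for the initial noise.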
    generator = torch.Generator(device="cpu").manual_seed(seed)

    output = pipe(
        image=img,
        prompt=expanded_motion,
        negative_prompt=VIDEO_NEG,
        height=new_h,
        width=new_w,
        num_frames=int(num_frames),
        guidance_scale=float(guidance),
        num_inference_steps=30,
        generator=generator,
    )

    frames = output.frames[0]

    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    tmp.close()  # export_to_video writes to the path, not the open handle
    export_to_video(frames, tmp.name, fps=16)

    return tmp.name, seed, f"**Motion prompt sent to model:**\n\n{expanded_motion}"

# ── CSS ───────────────────────────────────────────────────────────────────────
css = """
* { box-sizing: border-box; margin: 0; padding: 0; }

body, .gradio-container {
    background: #07070e !important;
    font-family: 'Inter', system-ui, sans-serif !important;
    max-width: 500px !important;
    margin: 0 auto !important;
    padding: 8px !important;
}

.topbar {
    display: flex;
    align-items: center;
    justify-content: space-between;
    padding: 10px 2px 14px;
}
.topbar-title {
    color: #e8e0ff;
    font-size: 0.95em;
    font-weight: 800;
}
.gpu-pill {
    background: #1aff7a18;
    border: 1px solid #1aff7a44;
    color: #1aff7a;
    font-size: 0.6em;
    font-weight: 800;
    padding: 4px 12px;
    border-radius: 20px;
    letter-spacing: 1.5px;
    text-transform: uppercase;
}

.upload-area {
    background: #0d0d1a;
    border: 2px dashed #1e1e35;
    border-radius: 18px;
    overflow: hidden;
    margin-bottom: 8px;
    min-height: 260px;
    display: flex;
    align-items: center;
    justify-content: center;
}

.video-out {
    background: #0d0d1a;
    border: 1px solid #16162a;
    border-radius: 18px;
    overflow: hidden;
    margin-bottom: 8px;
    min-height: 260px;
}

.card {
    background: #0d0d1a;
    border: 1px solid #16162a;
    border-radius: 14px;
    padding: 14px;
    margin-bottom: 8px;
}
.card-label {
    color: #3d3060;
    font-size: 0.62em;
    font-weight: 800;
    text-transform: uppercase;
    letter-spacing: 2px;
    margin-bottom: 8px;
}

textarea {
    background: transparent !important;
    border: none !important;
    color: #c8b8f0 !important;
    font-size: 15px !important;
    line-height: 1.6 !important;
    padding: 0 !important;
    resize: none !important;
    box-shadow: none !important;
    width: 100% !important;
    outline: none !important;
}
textarea::placeholder { color: #252038 !important; }
textarea:focus {
    outline: none !important;
    box-shadow: none !important;
    border: none !important;
}

.gradio-accordion {
    background: #0d0d1a !important;
    border: 1px solid #16162a !important;
    border-radius: 14px !important;
    margin-bottom: 8px !important;
    overflow: hidden !important;
}
.gradio-accordion .label-wrap button {
    color: #4a3a6a !important;
    font-size: 0.72em !important;
    font-weight: 700 !important;
    text-transform: uppercase !important;
    letter-spacing: 1.5px !important;
    padding: 12px 16px !important;
}

.gradio-slider {
    background: transparent !important;
    border: none !important;
    padding: 4px 0 10px !important;
}
input[type=range] {
    accent-color: #3366bb !important;
    width: 100% !important;
}

input[type=number] {
    background: #0a0a14 !important;
    border: 1px solid #18182a !important;
    border-radius: 10px !important;
    color: #7799cc !important;
    font-size: 13px !important;
    padding: 8px 10px !important;
}

input[type=checkbox] { accent-color: #3366bb !important; }
.gradio-checkbox label span {
    color: #4a3a6a !important;
    font-size: 0.75em !important;
    font-weight: 600 !important;
}

label > span:first-child {
    color: #3a2d55 !important;
    font-size: 0.7em !important;
    font-weight: 700 !important;
    text-transform: uppercase !important;
    letter-spacing: 1px !important;
}

.seed-out input[type=number] {
    background: transparent !important;
    border: none !important;
    color: #2e2848 !important;
    font-size: 0.7em !important;
    text-align: center !important;
    padding: 2px !important;
}

.hint-box {
    background: #0a0a14;
    border: 1px solid #111122;
    border-radius: 10px;
    padding: 10px 14px;
    color: #443366;
    font-size: 0.72em;
    line-height: 1.7;
    margin-bottom: 8px;
    word-break: break-word;
}

.gen-btn button {
    background: linear-gradient(135deg, #1a3aaa 0%, #0e1e77 100%) !important;
    border: 1px solid #2255cc !important;
    border-radius: 14px !important;
    color: #fff !important;
    font-size: 0.88em !important;
    font-weight: 900 !important;
    padding: 17px !important;
    width: 100% !important;
    letter-spacing: 2px !important;
    text-transform: uppercase !important;
    box-shadow: 0 4px 24px #1a3aaa55 !important;
    transition: all 0.15s ease !important;
    margin-top: 6px !important;
}
.gen-btn button:hover {
    box-shadow: 0 6px 32px #1a3aaa99 !important;
    transform: translateY(-1px) !important;
}
.gen-btn button:active {
    transform: scale(0.98) !important;
    box-shadow: 0 2px 12px #1a3aaa33 !important;
}

footer, .built-with { display: none !important; }
"""

# ── UI ────────────────────────────────────────────────────────────────────────
with gr.Blocks(css=css, title="VideoGen") as demo:

    gr.HTML("""
    <div class="topbar">
        <span class="topbar-title">🎬 Wan I2V β€” Image to Video</span>
        <span class="gpu-pill">⚑ ZeroGPU</span>
    </div>
    """)

    gr.HTML("""
    <div class="hint-box">
        Upload any image → describe the motion → get a ~3–5 second 480P video.<br><br>
        <strong>Motion tips:</strong> describe what moves, not what's in the image.<br>
        e.g. <em>"hair gently blowing, eyes blink, camera slowly pulls back"</em>
    </div>
    """)

    input_image = gr.Image(
        label="Input Image",
        type="numpy",
        height=300,
        elem_classes="upload-area",
    )

    gr.HTML('<div class="card"><div class="card-label">✦ Motion β€” what should move?</div>')
    motion_prompt = gr.Textbox(
        show_label=False,
        placeholder="hair gently blowing, eyes blinking slowly, soft light shimmer...",
        lines=2,
    )
    gr.HTML('</div>')

    generate_btn = gr.Button(
        "Generate Video ✦", variant="primary",
        size="lg", elem_classes="gen-btn",
    )

    output_video = gr.Video(
        label="Generated Video",
        elem_classes="video-out",
        height=300,
    )

    used_seed = gr.Number(
        label="seed", interactive=False,
        elem_classes="seed-out",
    )

    expanded_out = gr.Markdown(
        value="",
        elem_classes="hint-box",
    )

    with gr.Accordion("⚙️ Settings", open=False):
        gr.HTML('<div style="height:6px"></div>')

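        # Stepping by 16 from a minimum of 17 keeps frame counts at 4k+1
        # (17, 33, 49, 65, 81), the lengths Wan's temporal VAE expects.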
        num_frames = gr.Slider(
            minimum=17,
            maximum=81,
            value=49,
            step=16,
            label="Frames β€” 17β‰ˆ1s  49β‰ˆ3s  81β‰ˆ5s  (at 16fps)",
        )
        guidance = gr.Slider(
            minimum=1.0,
            maximum=10.0,
            value=5.0,
            step=0.5,
            label="Guidance Scale",
        )
        with gr.Row():
            seed = gr.Number(
                label="Seed", value=42, precision=0,
                minimum=0, maximum=2**32-1, scale=3,
            )
            randomize = gr.Checkbox(
                label="Random seed", value=True, scale=1,
            )

    generate_btn.click(
        fn=generate_video,
        inputs=[
            input_image, motion_prompt, num_frames,
            guidance, seed, randomize,
        ],
        outputs=[output_video, used_seed, expanded_out],
    )

demo.launch()