tomiconic committed on
Commit
5765f32
Β·
verified Β·
1 Parent(s): fea8014

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +391 -0
app.py ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import random
import tempfile

import numpy as np
import torch
from PIL import Image

import gradio as gr
import spaces
from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
from diffusers.utils import export_to_video
from huggingface_hub import InferenceClient
from transformers import CLIPVisionModel

13
+ # ── Config ────────────────────────────────────────────────────────────────────
14
+ HF_TOKEN = os.environ.get("HF_TOKEN", None)
15
+ MODEL_REPO = "Wan-AI/Wan2.1-I2V-14B-480P"
16
+
17
+ # ── Prompt expansion LLM ──────────────────────────────────────────────────────
18
+ llm_client = InferenceClient(
19
+ model="mistralai/Mistral-7B-Instruct-v0.3",
20
+ token=HF_TOKEN,
21
+ )
22
+
23
+ VIDEO_SYSTEM = """You are an expert at writing motion prompts for AI video generation using Wan I2V.
24
+
25
+ Your job: take a short description of desired motion/animation and expand it into a detailed video motion prompt.
26
+
27
+ Rules:
28
+ - Focus on MOTION β€” what moves, how it moves, camera movement
29
+ - Be specific: "hair gently blowing in breeze", "camera slowly pulls back", "eyes blink naturally"
30
+ - Keep subjects consistent with what's already in the image
31
+ - Describe lighting changes if relevant (e.g. "light flickers softly")
32
+ - Do NOT describe the static image content β€” only the motion
33
+ - Return ONLY the prompt, no explanation, no preamble
34
+ - Keep under 80 words"""
35
+
def expand_video_prompt(raw_prompt):
    """Expand a terse motion description into a detailed Wan I2V motion prompt.

    Best-effort by design: an empty (or None) input yields a generic motion
    prompt, and any LLM/network failure falls back to the user's own text, so
    prompt expansion can never block generation.

    Args:
        raw_prompt: Short free-form motion description; may be None or empty
            (a cleared Gradio textbox can produce either).

    Returns:
        A motion-prompt string to feed to the video pipeline.
    """
    # Guard against None as well as ""/whitespace — the original crashed with
    # AttributeError on None input.
    if not raw_prompt or not raw_prompt.strip():
        return "subtle natural movement, gentle camera drift, cinematic atmosphere"

    raw_prompt = raw_prompt.strip()
    try:
        response = llm_client.chat_completion(
            messages=[
                {"role": "system", "content": VIDEO_SYSTEM},
                {"role": "user", "content": f"Expand this motion description:\n{raw_prompt}"},
            ],
            max_tokens=150,
            temperature=0.6,
        )
        # Some chat models wrap their answer in quotes; strip them off.
        return response.choices[0].message.content.strip().strip('"').strip("'")
    except Exception as e:
        # Expansion is optional polish — log and fall back to the raw prompt.
        print(f"LLM failed: {e}")
        return raw_prompt
53
+ # ── Load pipeline ─────────────────────────────────────────────────────────────
54
+ print("Loading Wan2.1 I2V pipeline...")
55
+
56
+ vae = AutoencoderKLWan.from_pretrained(
57
+ MODEL_REPO,
58
+ subfolder="vae",
59
+ torch_dtype=torch.float32,
60
+ )
61
+
62
+ pipe = WanImageToVideoPipeline.from_pretrained(
63
+ MODEL_REPO,
64
+ vae=vae,
65
+ torch_dtype=torch.bfloat16,
66
+ )
67
+
68
+ # CPU offload keeps VRAM usage manageable on ZeroGPU
69
+ pipe.enable_model_cpu_offload()
70
+ print("Pipeline ready.")
71
+
72
+ # ── Negative prompt for video ─────────────────────────────────────────────────
73
+ VIDEO_NEG = (
74
+ "static, no movement, blurry, low quality, worst quality, "
75
+ "inconsistent motion, flickering, jitter, artifacts, "
76
+ "watermark, text, deformed"
77
+ )
78
+
# ── Generation ────────────────────────────────────────────────────────────────
@spaces.GPU(duration=300)
def generate_video(input_image, motion_prompt, num_frames, guidance, seed, randomize):
    """Generate a short ~480P video from a still image plus a motion description.

    Args:
        input_image: Input image as a numpy array (gr.Image with type="numpy").
        motion_prompt: Short user description of the desired motion; expanded
            via the LLM before being sent to the pipeline.
        num_frames: Number of frames to generate.
        guidance: Classifier-free guidance scale.
        seed: RNG seed; ignored when `randomize` is True.
        randomize: When True, draw a fresh random seed instead of `seed`.

    Returns:
        Tuple of (mp4 path, seed actually used, markdown summary of the
        expanded prompt) matching the three UI outputs.

    Raises:
        gr.Error: If no image was uploaded.
    """
    if input_image is None:
        raise gr.Error("Please upload an image first.")

    if randomize:
        seed = random.randint(0, 2**32 - 1)
    seed = int(seed)

    # Expand motion prompt via the LLM (best-effort; see expand_video_prompt).
    expanded_motion = expand_video_prompt(motion_prompt)
    print(f"Motion prompt: {expanded_motion}")

    # Resize to an 832x480-area canvas, choosing landscape vs portrait from the
    # input's aspect ratio; convert("RGB") also normalizes RGBA/grayscale input.
    img = Image.fromarray(input_image).convert("RGB")
    orig_w, orig_h = img.size
    aspect = orig_w / orig_h
    if aspect >= 1:
        new_w, new_h = 832, 480
    else:
        new_w, new_h = 480, 832
    img = img.resize((new_w, new_h), Image.LANCZOS)

    # CPU generator keeps seeding deterministic regardless of where model
    # layers currently live under CPU offload.
    generator = torch.Generator(device="cpu").manual_seed(seed)

    output = pipe(
        image=img,
        prompt=expanded_motion,
        negative_prompt=VIDEO_NEG,
        height=new_h,
        width=new_w,
        num_frames=int(num_frames),
        guidance_scale=float(guidance),
        num_inference_steps=30,
        generator=generator,
    )

    frames = output.frames[0]

    # Export to mp4. delete=False keeps the file for Gradio to serve; close the
    # handle immediately since export_to_video writes by *path* — the original
    # leaked the open fd (and an open handle would also break on Windows).
    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    tmp.close()
    export_to_video(frames, tmp.name, fps=16)

    return tmp.name, seed, f"**Motion prompt sent to model:**\n\n{expanded_motion}"
# ── CSS ───────────────────────────────────────────────────────────────────────
# NOTE: `import random` previously sat here, *after* generate_video() already
# referenced the name; it only worked because the module finishes importing
# before any callback runs. The import now lives at the top of the file.
129
+ css = """
130
+ * { box-sizing: border-box; margin: 0; padding: 0; }
131
+
132
+ body, .gradio-container {
133
+ background: #07070e !important;
134
+ font-family: 'Inter', system-ui, sans-serif !important;
135
+ max-width: 500px !important;
136
+ margin: 0 auto !important;
137
+ padding: 8px !important;
138
+ }
139
+
140
+ .topbar {
141
+ display: flex;
142
+ align-items: center;
143
+ justify-content: space-between;
144
+ padding: 10px 2px 14px;
145
+ }
146
+ .topbar-title {
147
+ color: #e8e0ff;
148
+ font-size: 0.95em;
149
+ font-weight: 800;
150
+ }
151
+ .gpu-pill {
152
+ background: #1aff7a18;
153
+ border: 1px solid #1aff7a44;
154
+ color: #1aff7a;
155
+ font-size: 0.6em;
156
+ font-weight: 800;
157
+ padding: 4px 12px;
158
+ border-radius: 20px;
159
+ letter-spacing: 1.5px;
160
+ text-transform: uppercase;
161
+ }
162
+
163
+ .upload-area {
164
+ background: #0d0d1a;
165
+ border: 2px dashed #1e1e35;
166
+ border-radius: 18px;
167
+ overflow: hidden;
168
+ margin-bottom: 8px;
169
+ min-height: 260px;
170
+ display: flex;
171
+ align-items: center;
172
+ justify-content: center;
173
+ }
174
+ .upload-area img { width: 100% !important; border-radius: 16px; }
175
+
176
+ .video-out {
177
+ background: #0d0d1a;
178
+ border: 1px solid #16162a;
179
+ border-radius: 18px;
180
+ overflow: hidden;
181
+ margin-bottom: 8px;
182
+ min-height: 260px;
183
+ }
184
+
185
+ .card {
186
+ background: #0d0d1a;
187
+ border: 1px solid #16162a;
188
+ border-radius: 14px;
189
+ padding: 14px;
190
+ margin-bottom: 8px;
191
+ }
192
+ .card-label {
193
+ color: #3d3060;
194
+ font-size: 0.62em;
195
+ font-weight: 800;
196
+ text-transform: uppercase;
197
+ letter-spacing: 2px;
198
+ margin-bottom: 8px;
199
+ }
200
+
201
+ textarea {
202
+ background: transparent !important;
203
+ border: none !important;
204
+ color: #c8b8f0 !important;
205
+ font-size: 15px !important;
206
+ line-height: 1.6 !important;
207
+ padding: 0 !important;
208
+ resize: none !important;
209
+ box-shadow: none !important;
210
+ width: 100% !important;
211
+ outline: none !important;
212
+ }
213
+ textarea::placeholder { color: #252038 !important; }
214
+ textarea:focus { outline: none !important; box-shadow: none !important; }
215
+
216
+ .gradio-accordion {
217
+ background: #0d0d1a !important;
218
+ border: 1px solid #16162a !important;
219
+ border-radius: 14px !important;
220
+ margin-bottom: 8px !important;
221
+ overflow: hidden !important;
222
+ }
223
+ .gradio-accordion .label-wrap button {
224
+ color: #4a3a6a !important;
225
+ font-size: 0.72em !important;
226
+ font-weight: 700 !important;
227
+ text-transform: uppercase !important;
228
+ letter-spacing: 1.5px !important;
229
+ padding: 12px 16px !important;
230
+ }
231
+
232
+ .gradio-slider {
233
+ background: transparent !important;
234
+ border: none !important;
235
+ padding: 4px 0 10px !important;
236
+ }
237
+ input[type=range] { accent-color: #6633bb !important; width: 100% !important; }
238
+
239
+ input[type=number] {
240
+ background: #0a0a14 !important;
241
+ border: 1px solid #18182a !important;
242
+ border-radius: 10px !important;
243
+ color: #9977cc !important;
244
+ font-size: 13px !important;
245
+ padding: 8px 10px !important;
246
+ }
247
+
248
+ input[type=checkbox] { accent-color: #6633bb !important; }
249
+ .gradio-checkbox label span {
250
+ color: #4a3a6a !important;
251
+ font-size: 0.75em !important;
252
+ font-weight: 600 !important;
253
+ }
254
+
255
+ label > span:first-child {
256
+ color: #3a2d55 !important;
257
+ font-size: 0.7em !important;
258
+ font-weight: 700 !important;
259
+ text-transform: uppercase !important;
260
+ letter-spacing: 1px !important;
261
+ }
262
+
263
+ .seed-out input[type=number] {
264
+ background: transparent !important;
265
+ border: none !important;
266
+ color: #2e2848 !important;
267
+ font-size: 0.7em !important;
268
+ text-align: center !important;
269
+ }
270
+
271
+ .hint-box {
272
+ background: #0a0a14;
273
+ border: 1px solid #111122;
274
+ border-radius: 10px;
275
+ padding: 10px 14px;
276
+ color: #443366;
277
+ font-size: 0.72em;
278
+ line-height: 1.7;
279
+ margin-bottom: 8px;
280
+ }
281
+
282
+ .gen-btn button {
283
+ background: linear-gradient(135deg, #1a4aaa 0%, #0e2d77 100%) !important;
284
+ border: 1px solid #3366cc !important;
285
+ border-radius: 14px !important;
286
+ color: #fff !important;
287
+ font-size: 0.88em !important;
288
+ font-weight: 900 !important;
289
+ padding: 17px !important;
290
+ width: 100% !important;
291
+ letter-spacing: 2px !important;
292
+ text-transform: uppercase !important;
293
+ box-shadow: 0 4px 24px #1a4aaa55 !important;
294
+ transition: all 0.15s ease !important;
295
+ margin-top: 6px !important;
296
+ }
297
+ .gen-btn button:hover {
298
+ box-shadow: 0 6px 32px #1a4aaa99 !important;
299
+ transform: translateY(-1px) !important;
300
+ }
301
+ .gen-btn button:active { transform: scale(0.98) !important; }
302
+
303
+ footer, .built-with { display: none !important; }
304
+ """
305
+
306
+ # ── UI ────────────────────────────────────────────────────────────────────────
307
+ with gr.Blocks(css=css, title="VideoGen") as demo:
308
+
309
+ gr.HTML("""
310
+ <div class="topbar">
311
+ <span class="topbar-title">🎬 Wan I2V β€” Image to Video</span>
312
+ <span class="gpu-pill">⚑ ZeroGPU</span>
313
+ </div>
314
+ """)
315
+
316
+ gr.HTML("""
317
+ <div class="hint-box">
318
+ Upload any image β†’ describe the motion you want β†’ generate a ~5 second 480P video.<br>
319
+ <strong>Tips:</strong> describe motion, not the image itself. "hair blowing in wind", "camera slowly zooms out", "candle flame flickers".
320
+ </div>
321
+ """)
322
+
323
+ # Input image
324
+ input_image = gr.Image(
325
+ label="Input Image",
326
+ type="numpy",
327
+ height=300,
328
+ elem_classes="upload-area",
329
+ )
330
+
331
+ # Motion prompt
332
+ gr.HTML('<div class="card"><div class="card-label">✦ Motion β€” describe what should move</div>')
333
+ motion_prompt = gr.Textbox(
334
+ show_label=False,
335
+ placeholder="hair gently blowing, eyes blinking slowly, soft light shimmer...",
336
+ lines=2,
337
+ )
338
+ gr.HTML('</div>')
339
+
340
+ # Generate button
341
+ generate_btn = gr.Button(
342
+ "Generate Video ✦", variant="primary",
343
+ size="lg", elem_classes="gen-btn",
344
+ )
345
+
346
+ # Output video
347
+ output_video = gr.Video(
348
+ label="Generated Video",
349
+ elem_classes="video-out",
350
+ height=300,
351
+ )
352
+
353
+ used_seed = gr.Number(
354
+ label="seed", interactive=False,
355
+ elem_classes="seed-out",
356
+ )
357
+
358
+ expanded_out = gr.Markdown(elem_classes="hint-box")
359
+
360
+ # Advanced settings
361
+ with gr.Accordion("βš™οΈ Settings", open=False):
362
+ gr.HTML('<div style="height:6px"></div>')
363
+
364
+ num_frames = gr.Slider(
365
+ minimum=17,
366
+ maximum=81,
367
+ value=49,
368
+ step=16,
369
+ label="Frames (17=~1s, 49=~3s, 81=~5s at 16fps)",
370
+ )
371
+ guidance = gr.Slider(
372
+ minimum=1.0,
373
+ maximum=10.0,
374
+ value=5.0,
375
+ step=0.5,
376
+ label="Guidance Scale",
377
+ )
378
+ with gr.Row():
379
+ seed = gr.Number(
380
+ label="Seed", value=42, precision=0,
381
+ minimum=0, maximum=2**32-1, scale=3,
382
+ )
383
+ randomize = gr.Checkbox(label="Random seed", value=True, scale=1)
384
+
385
+ generate_btn.click(
386
+ fn=generate_video,
387
+ inputs=[input_image, motion_prompt, num_frames, guidance, seed, randomize],
388
+ outputs=[output_video, used_seed, expanded_out],
389
+ )
390
+
391
+ demo.launch()