File size: 10,113 Bytes
1775cd6
16ca8db
 
1775cd6
 
16ca8db
9f13b69
1775cd6
5d3bb27
9f13b69
5d3bb27
f629787
 
 
dacc5a0
f629787
 
 
5d3bb27
 
 
f629787
8ddf1ab
 
 
 
 
 
 
 
5d3bb27
1775cd6
ee824ed
1775cd6
 
 
7b20338
1775cd6
 
14237e0
1775cd6
 
9f13b69
1775cd6
 
 
 
7b20338
 
 
 
 
 
 
 
14237e0
34757ae
14237e0
1775cd6
53123ce
cb99a0b
1775cd6
 
9f13b69
1775cd6
ee824ed
1775cd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ea2322
9f13b69
9ea2322
 
 
 
 
 
 
 
 
1775cd6
 
 
b3d4063
1775cd6
 
 
 
 
 
a24b191
1775cd6
 
 
 
 
 
 
 
 
 
 
9f13b69
 
1775cd6
 
 
d792322
441905a
 
 
 
 
 
 
 
 
d792322
1775cd6
 
 
34c9450
 
1775cd6
441905a
9f13b69
1775cd6
 
9f13b69
1775cd6
 
 
 
 
 
 
9f13b69
1775cd6
 
9f13b69
1775cd6
 
 
 
 
 
 
441905a
 
 
 
 
 
 
 
 
1775cd6
 
 
441905a
 
 
 
 
 
 
 
1775cd6
 
 
441905a
 
 
 
 
 
 
1775cd6
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
import os
import subprocess
import sys
import time
import tempfile
import zipfile
import torch

# ---------------------------------------------------------------------------
# Install private diffusers fork
# ---------------------------------------------------------------------------
# The Helios pipeline lives in a diffusers fork shipped alongside this app as
# a zip archive. At startup we extract it, try a real pip install, and fall
# back to a sys.path insertion if pip fails.
_APP_DIR = os.path.dirname(os.path.abspath(__file__))
ZIP_PATH = os.path.join(_APP_DIR, "helios_diffusers.zip")
EXTRACT_DIR = os.path.join(_APP_DIR, "_helios_diffusers")
# Top-level directory inside the archive (GitHub-style "<repo>-<branch>" name).
_PKG_ROOT = os.path.join(EXTRACT_DIR, "diffusers-new-model-addition-helios-helios")

# Extract only once; the extracted tree acts as the "already done" marker.
if not os.path.isdir(_PKG_ROOT):
    print(f"[setup] Extracting {ZIP_PATH}")
    with zipfile.ZipFile(ZIP_PATH, "r") as zf:
        zf.extractall(EXTRACT_DIR)

print(f"[setup] Installing diffusers from {_PKG_ROOT}")
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", _PKG_ROOT])
except subprocess.CalledProcessError as e:
    # Best-effort: a failed install is logged, not fatal — the sys.path
    # fallback below still makes the package importable.
    print(f"[setup] pip install failed (exit {e.returncode}), falling back to sys.path")

# NOTE(review): this path insertion runs even when pip succeeded, so the
# in-tree sources shadow the installed wheel — presumably intentional
# (guarantees the fork's code wins); confirm before changing.
_SRC_DIR = os.path.join(_PKG_ROOT, "src")
if os.path.isdir(_SRC_DIR):
    sys.path.insert(0, _SRC_DIR)

import gradio as gr
import spaces
from diffusers import (
    AutoencoderKLWan,
    HeliosPyramidPipeline,
    HeliosDMDScheduler
)
from diffusers.utils import export_to_video, load_image, load_video
from aoti import aoti_load_

# ---------------------------------------------------------------------------
# Pre-load model
# ---------------------------------------------------------------------------
# Model components are loaded at import time so the first request is fast.
MODEL_ID = "BestWishYsh/Helios-Distilled"

# VAE is kept in float32 while the rest of the pipeline runs in bfloat16
# (mixed precision chosen per-component here).
vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
scheduler = HeliosDMDScheduler.from_pretrained(MODEL_ID, subfolder="scheduler")
pipe = HeliosPyramidPipeline.from_pretrained(
    MODEL_ID, 
    vae=vae, 
    scheduler=scheduler,
    torch_dtype=torch.bfloat16,
    is_distilled=True
)

# Optional ahead-of-time compiled transformer; disabled for now.
# aoti_load_(pipe.transformer, "multimodalart/helios-distilled-transformer", "helios_distilled_transformer.pt2")

pipe.to("cuda")

# Use the Flash Attention 3 backend fetched from the Hub for the transformer.
pipe.transformer.set_attention_backend("_flash_3_hub")

# ---------------------------------------------------------------------------
# Generation
# ---------------------------------------------------------------------------
@spaces.GPU(duration=300)
def generate_video(
    mode: str,
    prompt: str,
    image_input,
    video_input,
    height: int,
    width: int,
    num_frames: int,
    num_inference_steps: int,
    seed: int,
    is_amplify_first_chunk: bool,
    progress=gr.Progress(track_tqdm=True),
):
    """Run the Helios pipeline and return ``(video_path, info_string)``.

    Args:
        mode: One of "Text-to-Video", "Image-to-Video", "Video-to-Video".
        prompt: Text prompt (required for every mode).
        image_input: Image filepath for I2V mode, else None.
        video_input: Video filepath for V2V mode, else None.
        height / width: Output resolution in pixels.
        num_frames: Number of frames to generate.
        num_inference_steps: Steps per pyramid stage (applied to all 3 stages).
        seed: RNG seed for the CUDA generator.
        is_amplify_first_chunk: Pipeline flag forwarded as-is.
        progress: Gradio progress tracker (tqdm-backed).

    Raises:
        gr.Error: If the prompt is empty, or the conditioning input required
            by the selected mode is missing.
    """
    if not prompt:
        raise gr.Error("Please provide a prompt.")
    # Fail loudly instead of silently falling back to text-to-video when the
    # conditioning input for the selected mode is missing (the original code
    # just skipped the kwarg, which is confusing for users).
    if mode == "Image-to-Video" and image_input is None:
        raise gr.Error("Please provide an image for Image-to-Video mode.")
    if mode == "Video-to-Video" and video_input is None:
        raise gr.Error("Please provide a video for Video-to-Video mode.")

    generator = torch.Generator(device="cuda").manual_seed(int(seed))

    kwargs = {
        "prompt": prompt,
        "height": int(height),
        "width": int(width),
        "num_frames": int(num_frames),
        "guidance_scale": 1.0,
        "generator": generator,
        "output_type": "np",
        # Same step count for each of the three pyramid stages.
        "pyramid_num_inference_steps_list": [
            int(num_inference_steps),
            int(num_inference_steps),
            int(num_inference_steps),
        ],
        "is_amplify_first_chunk": is_amplify_first_chunk,
    }

    if mode == "Image-to-Video":
        # Resize the conditioning image to the target output resolution.
        img = load_image(image_input).resize((int(width), int(height)))
        kwargs["image"] = img
    elif mode == "Video-to-Video":
        kwargs["video"] = load_video(video_input)

    t0 = time.time()
    output = pipe(**kwargs).frames[0]
    elapsed = time.time() - t0

    # delete=False so Gradio can serve the file after this handler returns.
    # Close the handle immediately: we only need the path, and leaving it
    # open leaks a file descriptor (and blocks the write on Windows).
    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    tmp.close()
    export_to_video(output, tmp.name, fps=24)
    info = f"Generated in {elapsed:.1f}s · {num_frames} frames · {height}×{width}"
    return tmp.name, info

# ---------------------------------------------------------------------------
# UI Setup
# ---------------------------------------------------------------------------
def update_conditional_visibility(mode):
    """Return visibility updates for (image_input, video_input) given *mode*.

    The image picker is shown only in Image-to-Video mode and the video
    picker only in Video-to-Video mode; any other mode hides both.
    """
    show_image = mode == "Image-to-Video"
    show_video = mode == "Video-to-Video"
    return gr.update(visible=show_image), gr.update(visible=show_video)

# Minimal page styling: centered header and a capped content width.
CSS = """
#header { text-align: center; margin-bottom: 0.5em; }
#header h1 { font-size: 2.2em; margin-bottom: 0; }
.contain { max-width: 1350px; margin: 0 auto !important; }
"""

# Gradio UI: left column = inputs/settings, right column = output video.
with gr.Blocks(css=CSS, title="Helios Video Generation", theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <div id="header">
            <h1>🎬 Helios 14B distilled</h1>
        </div>
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            mode = gr.Radio(
                choices=["Text-to-Video", "Image-to-Video", "Video-to-Video"],
                value="Text-to-Video",
                label="Generation Mode",
            )
            # Conditioning inputs start hidden; mode.change below toggles them.
            image_input = gr.Image(label="Image (for I2V)", type="filepath", visible=False)
            video_input = gr.Video(label="Video (for V2V)", visible=False)
            prompt = gr.Textbox(
                label="Prompt",
                lines=4,
                value=(
                    "A vibrant tropical fish swimming gracefully among colorful coral reefs in "
                    "a clear, turquoise ocean. The fish has bright blue and yellow scales with a "
                    "small, distinctive orange spot on its side, its fins moving fluidly. The coral "
                    "reefs are alive with a variety of marine life, including small schools of "
                    "colorful fish and sea turtles gliding by. The water is crystal clear, allowing "
                    "for a view of the sandy ocean floor below. The reef itself is adorned with a mix "
                    "of hard and soft corals in shades of red, orange, and green. The photo captures "
                    "the fish from a slightly elevated angle, emphasizing its lively movements and the "
                    "vivid colors of its surroundings. A close-up shot with dynamic movement."
                )
            )
            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
                    # Resolution is fixed (non-interactive) — the model targets 384x640.
                    height = gr.Number(value=384, label="Height", precision=0, interactive=False)
                    width = gr.Number(value=640, label="Width", precision=0, interactive=False)
                with gr.Row():
                    # Frame count steps in chunks of 33 (one pyramid chunk per step).
                    num_frames = gr.Slider(33, 231, value=231, step=33, label="Num Frames")
                    num_inference_steps = gr.Slider(1, 10, value=2, step=1, label="Steps per stage")
                with gr.Row():
                    seed = gr.Number(value=42, label="Seed", precision=0)
                    is_amplify_first_chunk = gr.Checkbox(label="Amplify First Chunk", value=True)

            generate_btn = gr.Button("🚀 Generate Video", variant="primary", size="lg")

        with gr.Column(scale=1):
            video_output = gr.Video(label="Generated Video", autoplay=True)
            info_output = gr.Textbox(label="Info", interactive=False)

    # Event wiring: mode selection toggles conditioning inputs; the button
    # runs the full generation and fills the output video + info textbox.
    mode.change(fn=update_conditional_visibility, inputs=[mode], outputs=[image_input, video_input])
    generate_btn.click(
        fn=generate_video,
        inputs=[mode, prompt, image_input, video_input, height, width, num_frames, num_inference_steps, seed, is_amplify_first_chunk],
        outputs=[video_output, info_output],
    )

    # Click-to-fill example prompts (text-to-video only; they set mode + prompt).
    gr.Examples(
        examples=[
            [
                "Text-to-Video",
                "A vibrant tropical fish swimming gracefully among colorful coral reefs in "
                "a clear, turquoise ocean. The fish has bright blue and yellow scales with a "
                "small, distinctive orange spot on its side, its fins moving fluidly. The coral "
                "reefs are alive with a variety of marine life, including small schools of "
                "colorful fish and sea turtles gliding by. The water is crystal clear, allowing "
                "for a view of the sandy ocean floor below. The reef itself is adorned with a mix "
                "of hard and soft corals in shades of red, orange, and green. The photo captures "
                "the fish from a slightly elevated angle, emphasizing its lively movements and the "
                "vivid colors of its surroundings. A close-up shot with dynamic movement.",
            ],
            [
                "Text-to-Video",
                "An extreme close-up of an gray-haired man with a beard in his 60s, he is deep in "
                "thought pondering the history of the universe as he sits at a cafe in Paris, his eyes "
                "focus on people offscreen as they walk as he sits mostly motionless, he is dressed in "
                "a wool coat suit coat with a button-down shirt , he wears a brown beret and glasses "
                "and has a very professorial appearance, and the end he offers a subtle closed-mouth "
                "smile as if he found the answer to the mystery of life, the lighting is very cinematic "
                "with the golden light and the Parisian streets and city in the background, depth of "
                "field, cinematic 35mm film.",
            ],
            [
                "Text-to-Video",
                "A drone camera circles around a beautiful historic church built on a rocky outcropping "
                "along the Amalfi Coast, the view showcases historic and magnificent architectural "
                "details and tiered pathways and patios, waves are seen crashing against the rocks "
                "below as the view overlooks the horizon of the coastal waters and hilly landscapes "
                "of the Amalfi Coast Italy, several distant people are seen walking and enjoying vistas "
                "on patios of the dramatic ocean views, the warm glow of the afternoon sun creates a "
                "magical and romantic feeling to the scene, the view is stunning captured with beautiful photography.",
            ],
        ],
        inputs=[mode, prompt],
        label="Example Prompts",
    )

if __name__ == "__main__":
    demo.launch()