import os
import subprocess
import sys
import time
import tempfile
import zipfile
import torch
# ---------------------------------------------------------------------------
# Install private diffusers fork
# ---------------------------------------------------------------------------
# The app ships a zipped diffusers fork (adds the Helios pipeline classes).
# Extract it once, then try a proper pip install; if pip fails, fall back to
# importing straight from the fork's src/ tree via sys.path.
_APP_DIR = os.path.dirname(os.path.abspath(__file__))
ZIP_PATH = os.path.join(_APP_DIR, "helios_diffusers.zip")
EXTRACT_DIR = os.path.join(_APP_DIR, "_helios_diffusers")
_PKG_ROOT = os.path.join(EXTRACT_DIR, "diffusers-new-model-addition-helios-helios")

if not os.path.isdir(_PKG_ROOT):
    # Fail with a clear message instead of zipfile's cryptic FileNotFoundError
    # when the bundled archive is missing from the Space.
    if not os.path.isfile(ZIP_PATH):
        raise FileNotFoundError(f"[setup] Bundled diffusers fork not found at {ZIP_PATH}")
    print(f"[setup] Extracting {ZIP_PATH}")
    with zipfile.ZipFile(ZIP_PATH, "r") as zf:
        zf.extractall(EXTRACT_DIR)

print(f"[setup] Installing diffusers from {_PKG_ROOT}")
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", _PKG_ROOT])
except subprocess.CalledProcessError as e:
    print(f"[setup] pip install failed (exit {e.returncode}), falling back to sys.path")
    _SRC_DIR = os.path.join(_PKG_ROOT, "src")
    if os.path.isdir(_SRC_DIR):
        sys.path.insert(0, _SRC_DIR)
    else:
        # Previously this failure mode was silent; the later `from diffusers
        # import ...` would then fail with no hint why.
        print(f"[setup] WARNING: fallback source dir {_SRC_DIR} not found; imports may fail")
import gradio as gr
import spaces
from diffusers import (
AutoencoderKLWan,
HeliosPyramidPipeline,
HeliosDMDScheduler
)
from diffusers.utils import export_to_video, load_image, load_video
from aoti import aoti_load_
# ---------------------------------------------------------------------------
# Pre-load model
# ---------------------------------------------------------------------------
# Everything is loaded once at module import so the first request does not pay
# the download/initialization cost.
MODEL_ID = "BestWishYsh/Helios-Distilled"
# VAE is kept in float32 while the rest of the pipeline runs in bfloat16 below
# — presumably for decode quality/stability; TODO confirm against the model card.
vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
scheduler = HeliosDMDScheduler.from_pretrained(MODEL_ID, subfolder="scheduler")
pipe = HeliosPyramidPipeline.from_pretrained(
    MODEL_ID,
    vae=vae,
    scheduler=scheduler,
    torch_dtype=torch.bfloat16,
    is_distilled=True  # distilled checkpoint; generate_video runs it with guidance_scale=1.0
)
# Optional AOT-compiled transformer, currently disabled:
# aoti_load_(pipe.transformer, "multimodalart/helios-distilled-transformer", "helios_distilled_transformer.pt2")
pipe.to("cuda")
# NOTE(review): backend name suggests Flash-Attention 3 fetched from the HF Hub
# — confirm against the diffusers fork's attention-backend registry.
pipe.transformer.set_attention_backend("_flash_3_hub")
# ---------------------------------------------------------------------------
# Generation
# ---------------------------------------------------------------------------
@spaces.GPU(duration=300)
def generate_video(
    mode: str,
    prompt: str,
    image_input,
    video_input,
    height: int,
    width: int,
    num_frames: int,
    num_inference_steps: int,
    seed: int,
    is_amplify_first_chunk: bool,
    progress=gr.Progress(track_tqdm=True),
):
    """Run the Helios pipeline and return ``(video_path, info_string)``.

    Args:
        mode: "Text-to-Video", "Image-to-Video" or "Video-to-Video".
        prompt: Text prompt (required, must be non-blank).
        image_input: Filepath of the conditioning image (I2V mode only).
        video_input: Filepath of the conditioning video (V2V mode only).
        height: Output height in pixels.
        width: Output width in pixels.
        num_frames: Number of frames to generate.
        num_inference_steps: Denoising steps applied to each pyramid stage.
        seed: RNG seed for reproducible generation.
        is_amplify_first_chunk: Pipeline-specific flag, passed through as-is.
        progress: Gradio progress tracker mirroring the pipeline's tqdm bars.

    Raises:
        gr.Error: If the prompt is blank, or the selected mode requires a
            conditioning input that was not provided.
    """
    # Reject whitespace-only prompts too, not just the empty string.
    if not prompt or not prompt.strip():
        raise gr.Error("Please provide a prompt.")
    generator = torch.Generator(device="cuda").manual_seed(int(seed))
    kwargs = {
        "prompt": prompt,
        "height": int(height),
        "width": int(width),
        "num_frames": int(num_frames),
        "guidance_scale": 1.0,
        "generator": generator,
        "output_type": "np",
        # The pyramid pipeline runs three stages; use the same step count for each.
        "pyramid_num_inference_steps_list": [int(num_inference_steps)] * 3,
        "is_amplify_first_chunk": is_amplify_first_chunk,
    }
    # Fail loudly when the selected mode's conditioning input is missing —
    # previously this silently fell back to plain text-to-video.
    if mode == "Image-to-Video":
        if image_input is None:
            raise gr.Error("Image-to-Video mode requires an input image.")
        # Resize so the conditioning image matches the requested output size.
        kwargs["image"] = load_image(image_input).resize((int(width), int(height)))
    elif mode == "Video-to-Video":
        if video_input is None:
            raise gr.Error("Video-to-Video mode requires an input video.")
        kwargs["video"] = load_video(video_input)
    t0 = time.time()
    output = pipe(**kwargs).frames[0]
    elapsed = time.time() - t0
    # delete=False so Gradio can serve the file after we return. Close the
    # handle immediately: export_to_video reopens by path, and leaving it open
    # leaks a file descriptor (and would block the write on Windows).
    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    tmp.close()
    export_to_video(output, tmp.name, fps=24)
    info = f"Generated in {elapsed:.1f}s · {num_frames} frames · {height}×{width}"
    return tmp.name, info
# ---------------------------------------------------------------------------
# UI Setup
# ---------------------------------------------------------------------------
def update_conditional_visibility(mode):
    """Show the image or video input that matches the selected mode.

    Returns a pair of ``gr.update`` objects for ``(image_input, video_input)``;
    at most one of the two is made visible.
    """
    show_image = mode == "Image-to-Video"
    show_video = mode == "Video-to-Video"
    return gr.update(visible=show_image), gr.update(visible=show_video)
# Page-level styling: centered header block and a capped content width.
CSS = """
#header { text-align: center; margin-bottom: 0.5em; }
#header h1 { font-size: 2.2em; margin-bottom: 0; }
.contain { max-width: 1350px; margin: 0 auto !important; }
"""

with gr.Blocks(css=CSS, title="Helios Video Generation", theme=gr.themes.Soft()) as demo:
    # Header slot — currently an empty HTML placeholder.
    gr.HTML(
        """
"""
    )
    with gr.Row():
        # Left column: all generation controls.
        with gr.Column(scale=1):
            mode = gr.Radio(
                choices=["Text-to-Video", "Image-to-Video", "Video-to-Video"],
                value="Text-to-Video",
                label="Generation Mode",
            )
            # Conditioning inputs start hidden; toggled by mode.change below.
            image_input = gr.Image(label="Image (for I2V)", type="filepath", visible=False)
            video_input = gr.Video(label="Video (for V2V)", visible=False)
            prompt = gr.Textbox(
                label="Prompt",
                lines=4,
                value=(
                    "A vibrant tropical fish swimming gracefully among colorful coral reefs in "
                    "a clear, turquoise ocean. The fish has bright blue and yellow scales with a "
                    "small, distinctive orange spot on its side, its fins moving fluidly. The coral "
                    "reefs are alive with a variety of marine life, including small schools of "
                    "colorful fish and sea turtles gliding by. The water is crystal clear, allowing "
                    "for a view of the sandy ocean floor below. The reef itself is adorned with a mix "
                    "of hard and soft corals in shades of red, orange, and green. The photo captures "
                    "the fish from a slightly elevated angle, emphasizing its lively movements and the "
                    "vivid colors of its surroundings. A close-up shot with dynamic movement."
                )
            )
            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
                    # Resolution is fixed (non-interactive) — presumably the only
                    # size this checkpoint supports; confirm before enabling edits.
                    height = gr.Number(value=384, label="Height", precision=0, interactive=False)
                    width = gr.Number(value=640, label="Width", precision=0, interactive=False)
                with gr.Row():
                    # Frame count steps by 33 — looks like the pipeline's chunk
                    # size; TODO confirm against the Helios pipeline.
                    num_frames = gr.Slider(33, 231, value=231, step=33, label="Num Frames")
                    num_inference_steps = gr.Slider(1, 10, value=2, step=1, label="Steps per stage")
                with gr.Row():
                    seed = gr.Number(value=42, label="Seed", precision=0)
                    is_amplify_first_chunk = gr.Checkbox(label="Amplify First Chunk", value=True)
            generate_btn = gr.Button("🚀 Generate Video", variant="primary", size="lg")
        # Right column: generated video and timing info.
        with gr.Column(scale=1):
            video_output = gr.Video(label="Generated Video", autoplay=True)
            info_output = gr.Textbox(label="Info", interactive=False)
    # Event wiring: swap conditioning inputs when the mode changes, and run
    # generation on button click.
    mode.change(fn=update_conditional_visibility, inputs=[mode], outputs=[image_input, video_input])
    generate_btn.click(
        fn=generate_video,
        inputs=[mode, prompt, image_input, video_input, height, width, num_frames, num_inference_steps, seed, is_amplify_first_chunk],
        outputs=[video_output, info_output],
    )
    # Clickable example prompts (text-to-video only); they fill mode + prompt.
    gr.Examples(
        examples=[
            [
                "Text-to-Video",
                "A vibrant tropical fish swimming gracefully among colorful coral reefs in "
                "a clear, turquoise ocean. The fish has bright blue and yellow scales with a "
                "small, distinctive orange spot on its side, its fins moving fluidly. The coral "
                "reefs are alive with a variety of marine life, including small schools of "
                "colorful fish and sea turtles gliding by. The water is crystal clear, allowing "
                "for a view of the sandy ocean floor below. The reef itself is adorned with a mix "
                "of hard and soft corals in shades of red, orange, and green. The photo captures "
                "the fish from a slightly elevated angle, emphasizing its lively movements and the "
                "vivid colors of its surroundings. A close-up shot with dynamic movement.",
            ],
            [
                "Text-to-Video",
                "An extreme close-up of an gray-haired man with a beard in his 60s, he is deep in "
                "thought pondering the history of the universe as he sits at a cafe in Paris, his eyes "
                "focus on people offscreen as they walk as he sits mostly motionless, he is dressed in "
                "a wool coat suit coat with a button-down shirt , he wears a brown beret and glasses "
                "and has a very professorial appearance, and the end he offers a subtle closed-mouth "
                "smile as if he found the answer to the mystery of life, the lighting is very cinematic "
                "with the golden light and the Parisian streets and city in the background, depth of "
                "field, cinematic 35mm film.",
            ],
            [
                "Text-to-Video",
                "A drone camera circles around a beautiful historic church built on a rocky outcropping "
                "along the Amalfi Coast, the view showcases historic and magnificent architectural "
                "details and tiered pathways and patios, waves are seen crashing against the rocks "
                "below as the view overlooks the horizon of the coastal waters and hilly landscapes "
                "of the Amalfi Coast Italy, several distant people are seen walking and enjoying vistas "
                "on patios of the dramatic ocean views, the warm glow of the afternoon sun creates a "
                "magical and romantic feeling to the scene, the view is stunning captured with beautiful photography.",
            ],
        ],
        inputs=[mode, prompt],
        label="Example Prompts",
    )

if __name__ == "__main__":
    demo.launch()