import os
import subprocess
import sys
import time
import tempfile
import zipfile

import torch

# ---------------------------------------------------------------------------
# Install private diffusers fork
# ---------------------------------------------------------------------------
# The Helios pipeline lives in a private diffusers fork shipped alongside this
# app as a zip. We extract it once, then try a real pip install; if that fails
# we fall back to putting its src/ layout directly on sys.path.
_APP_DIR = os.path.dirname(os.path.abspath(__file__))
ZIP_PATH = os.path.join(_APP_DIR, "helios_diffusers.zip")
EXTRACT_DIR = os.path.join(_APP_DIR, "_helios_diffusers")
_PKG_ROOT = os.path.join(EXTRACT_DIR, "diffusers-new-model-addition-helios-helios")

# Extract only when the package root is not already present (idempotent across
# restarts of the Space).
if not os.path.isdir(_PKG_ROOT):
    print(f"[setup] Extracting {ZIP_PATH}")
    with zipfile.ZipFile(ZIP_PATH, "r") as zf:
        zf.extractall(EXTRACT_DIR)

print(f"[setup] Installing diffusers from {_PKG_ROOT}")
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", _PKG_ROOT])
except subprocess.CalledProcessError as e:
    print(f"[setup] pip install failed (exit {e.returncode}), falling back to sys.path")
    _SRC_DIR = os.path.join(_PKG_ROOT, "src")
    if os.path.isdir(_SRC_DIR):
        sys.path.insert(0, _SRC_DIR)
    else:
        # FIX: previously this fell through silently and the diffusers import
        # below failed with an unrelated-looking ImportError. Warn explicitly.
        print(f"[setup] WARNING: fallback source dir not found: {_SRC_DIR}; "
              "the diffusers fork import below will likely fail")

import gradio as gr
import spaces
from diffusers import (
    AutoencoderKLWan,
    HeliosPyramidPipeline,
    HeliosDMDScheduler,
)
from diffusers.utils import export_to_video, load_image, load_video

from aoti import aoti_load_

# ---------------------------------------------------------------------------
# Pre-load model
# ---------------------------------------------------------------------------
MODEL_ID = "BestWishYsh/Helios-Distilled"

# VAE is kept in float32 while the transformer runs in bfloat16 (distilled).
vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
scheduler = HeliosDMDScheduler.from_pretrained(MODEL_ID, subfolder="scheduler")
pipe = HeliosPyramidPipeline.from_pretrained(
    MODEL_ID,
    vae=vae,
    scheduler=scheduler,
    torch_dtype=torch.bfloat16,
    is_distilled=True,
)
# aoti_load_(pipe.transformer, "multimodalart/helios-distilled-transformer", "helios_distilled_transformer.pt2")
pipe.to("cuda")
pipe.transformer.set_attention_backend("_flash_3_hub")
--------------------------------------------------------------------------- # Generation # --------------------------------------------------------------------------- @spaces.GPU(duration=300) def generate_video( mode: str, prompt: str, image_input, video_input, height: int, width: int, num_frames: int, num_inference_steps: int, seed: int, is_amplify_first_chunk: bool, progress=gr.Progress(track_tqdm=True), ): if not prompt: raise gr.Error("Please provide a prompt.") generator = torch.Generator(device="cuda").manual_seed(int(seed)) kwargs = { "prompt": prompt, "height": int(height), "width": int(width), "num_frames": int(num_frames), "guidance_scale": 1.0, "generator": generator, "output_type": "np", "pyramid_num_inference_steps_list": [ int(num_inference_steps), int(num_inference_steps), int(num_inference_steps), ], "is_amplify_first_chunk": is_amplify_first_chunk, } if mode == "Image-to-Video" and image_input is not None: img = load_image(image_input).resize((int(width), int(height))) kwargs["image"] = img elif mode == "Video-to-Video" and video_input is not None: kwargs["video"] = load_video(video_input) t0 = time.time() output = pipe(**kwargs).frames[0] elapsed = time.time() - t0 tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) export_to_video(output, tmp.name, fps=24) info = f"Generated in {elapsed:.1f}s · {num_frames} frames · {height}×{width}" return tmp.name, info # --------------------------------------------------------------------------- # UI Setup # --------------------------------------------------------------------------- def update_conditional_visibility(mode): if mode == "Image-to-Video": return gr.update(visible=True), gr.update(visible=False) elif mode == "Video-to-Video": return gr.update(visible=False), gr.update(visible=True) else: return gr.update(visible=False), gr.update(visible=False) CSS = """ #header { text-align: center; margin-bottom: 0.5em; } #header h1 { font-size: 2.2em; margin-bottom: 0; } .contain { max-width: 1350px; 
margin: 0 auto !important; } """ with gr.Blocks(css=CSS, title="Helios Video Generation", theme=gr.themes.Soft()) as demo: gr.HTML( """ """ ) with gr.Row(): with gr.Column(scale=1): mode = gr.Radio( choices=["Text-to-Video", "Image-to-Video", "Video-to-Video"], value="Text-to-Video", label="Generation Mode", ) image_input = gr.Image(label="Image (for I2V)", type="filepath", visible=False) video_input = gr.Video(label="Video (for V2V)", visible=False) prompt = gr.Textbox( label="Prompt", lines=4, value=( "A vibrant tropical fish swimming gracefully among colorful coral reefs in " "a clear, turquoise ocean. The fish has bright blue and yellow scales with a " "small, distinctive orange spot on its side, its fins moving fluidly. The coral " "reefs are alive with a variety of marine life, including small schools of " "colorful fish and sea turtles gliding by. The water is crystal clear, allowing " "for a view of the sandy ocean floor below. The reef itself is adorned with a mix " "of hard and soft corals in shades of red, orange, and green. The photo captures " "the fish from a slightly elevated angle, emphasizing its lively movements and the " "vivid colors of its surroundings. A close-up shot with dynamic movement." 
) ) with gr.Accordion("Advanced Settings", open=False): with gr.Row(): height = gr.Number(value=384, label="Height", precision=0, interactive=False) width = gr.Number(value=640, label="Width", precision=0, interactive=False) with gr.Row(): num_frames = gr.Slider(33, 231, value=231, step=33, label="Num Frames") num_inference_steps = gr.Slider(1, 10, value=2, step=1, label="Steps per stage") with gr.Row(): seed = gr.Number(value=42, label="Seed", precision=0) is_amplify_first_chunk = gr.Checkbox(label="Amplify First Chunk", value=True) generate_btn = gr.Button("🚀 Generate Video", variant="primary", size="lg") with gr.Column(scale=1): video_output = gr.Video(label="Generated Video", autoplay=True) info_output = gr.Textbox(label="Info", interactive=False) mode.change(fn=update_conditional_visibility, inputs=[mode], outputs=[image_input, video_input]) generate_btn.click( fn=generate_video, inputs=[mode, prompt, image_input, video_input, height, width, num_frames, num_inference_steps, seed, is_amplify_first_chunk], outputs=[video_output, info_output], ) gr.Examples( examples=[ [ "Text-to-Video", "A vibrant tropical fish swimming gracefully among colorful coral reefs in " "a clear, turquoise ocean. The fish has bright blue and yellow scales with a " "small, distinctive orange spot on its side, its fins moving fluidly. The coral " "reefs are alive with a variety of marine life, including small schools of " "colorful fish and sea turtles gliding by. The water is crystal clear, allowing " "for a view of the sandy ocean floor below. The reef itself is adorned with a mix " "of hard and soft corals in shades of red, orange, and green. The photo captures " "the fish from a slightly elevated angle, emphasizing its lively movements and the " "vivid colors of its surroundings. 
A close-up shot with dynamic movement.", ], [ "Text-to-Video", "An extreme close-up of an gray-haired man with a beard in his 60s, he is deep in " "thought pondering the history of the universe as he sits at a cafe in Paris, his eyes " "focus on people offscreen as they walk as he sits mostly motionless, he is dressed in " "a wool coat suit coat with a button-down shirt , he wears a brown beret and glasses " "and has a very professorial appearance, and the end he offers a subtle closed-mouth " "smile as if he found the answer to the mystery of life, the lighting is very cinematic " "with the golden light and the Parisian streets and city in the background, depth of " "field, cinematic 35mm film.", ], [ "Text-to-Video", "A drone camera circles around a beautiful historic church built on a rocky outcropping " "along the Amalfi Coast, the view showcases historic and magnificent architectural " "details and tiered pathways and patios, waves are seen crashing against the rocks " "below as the view overlooks the horizon of the coastal waters and hilly landscapes " "of the Amalfi Coast Italy, several distant people are seen walking and enjoying vistas " "on patios of the dramatic ocean views, the warm glow of the afternoon sun creates a " "magical and romantic feeling to the scene, the view is stunning captured with beautiful photography.", ], ], inputs=[mode, prompt], label="Example Prompts", ) if __name__ == "__main__": demo.launch()