Spaces: Running on Zero
| import os | |
| import subprocess | |
| import sys | |
| import time | |
| import tempfile | |
| import zipfile | |
| import torch | |
# ---------------------------------------------------------------------------
# Install private diffusers fork
#
# The Space ships a zipped snapshot of a diffusers branch that adds the
# Helios pipeline classes. On startup we unpack it next to the app and
# pip-install it; if pip fails, we fall back to putting the package's src/
# directory on sys.path so the `from diffusers import ...` below still works.
# ---------------------------------------------------------------------------
_APP_DIR = os.path.dirname(os.path.abspath(__file__))
ZIP_PATH = os.path.join(_APP_DIR, "helios_diffusers.zip")
EXTRACT_DIR = os.path.join(_APP_DIR, "_helios_diffusers")
# Top-level folder inside the archive (GitHub-style "<repo>-<branch>" layout).
_PKG_ROOT = os.path.join(EXTRACT_DIR, "diffusers-new-model-addition-helios-helios")

if not os.path.isdir(_PKG_ROOT):
    if not os.path.isfile(ZIP_PATH):
        # Fail early with a clear message instead of an opaque ImportError later.
        raise FileNotFoundError(f"Bundled diffusers archive not found: {ZIP_PATH}")
    print(f"[setup] Extracting {ZIP_PATH}")
    with zipfile.ZipFile(ZIP_PATH, "r") as zf:
        zf.extractall(EXTRACT_DIR)

# Install unconditionally: extraction may survive a restart while the pip
# install (or the sys.path fallback) does not.
print(f"[setup] Installing diffusers from {_PKG_ROOT}")
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", _PKG_ROOT])
except subprocess.CalledProcessError as e:
    print(f"[setup] pip install failed (exit {e.returncode}), falling back to sys.path")
    _SRC_DIR = os.path.join(_PKG_ROOT, "src")
    if os.path.isdir(_SRC_DIR):
        sys.path.insert(0, _SRC_DIR)
    else:
        # Neither install path worked; make the impending import failure traceable.
        print(f"[setup] WARNING: fallback source dir not found: {_SRC_DIR}")
| import gradio as gr | |
| import spaces | |
| from diffusers import ( | |
| AutoencoderKLWan, | |
| HeliosPyramidPipeline, | |
| HeliosDMDScheduler | |
| ) | |
| from diffusers.utils import export_to_video, load_image, load_video | |
| from aoti import aoti_load_ | |
# ---------------------------------------------------------------------------
# Pre-load model
#
# Loaded once at import time so the first user request doesn't pay the
# multi-GB download / weight-loading cost.
# ---------------------------------------------------------------------------
MODEL_ID = "BestWishYsh/Helios-Distilled"

# VAE is kept in float32 while the rest of the pipeline runs bfloat16 —
# presumably for decode numerical stability; TODO confirm against model card.
vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
scheduler = HeliosDMDScheduler.from_pretrained(MODEL_ID, subfolder="scheduler")
pipe = HeliosPyramidPipeline.from_pretrained(
    MODEL_ID,
    vae=vae,
    scheduler=scheduler,
    torch_dtype=torch.bfloat16,
    is_distilled=True  # distilled checkpoint: pipeline skips CFG-style branching
)
# aoti_load_(pipe.transformer, "multimodalart/helios-distilled-transformer", "helios_distilled_transformer.pt2")
pipe.to("cuda")
# Use the FlashAttention-3 kernel fetched from the Hub for the transformer.
pipe.transformer.set_attention_backend("_flash_3_hub")
| # --------------------------------------------------------------------------- | |
| # Generation | |
| # --------------------------------------------------------------------------- | |
@spaces.GPU  # required on ZeroGPU: CUDA is only available inside decorated calls
def generate_video(
    mode: str,
    prompt: str,
    image_input,
    video_input,
    height: int,
    width: int,
    num_frames: int,
    num_inference_steps: int,
    seed: int,
    is_amplify_first_chunk: bool,
    progress=gr.Progress(track_tqdm=True),
):
    """Run the Helios pipeline and return ``(video_path, info_string)``.

    Args:
        mode: "Text-to-Video", "Image-to-Video" or "Video-to-Video".
        prompt: Text prompt (required in every mode).
        image_input: Filepath of the conditioning image (I2V mode only).
        video_input: Filepath of the conditioning video (V2V mode only).
        height: Output height in pixels.
        width: Output width in pixels.
        num_frames: Number of frames to generate.
        num_inference_steps: Denoising steps applied per pyramid stage.
        seed: RNG seed for reproducible sampling.
        is_amplify_first_chunk: Pipeline flag, forwarded as-is.
        progress: Gradio progress tracker mirroring the pipeline's tqdm bars.

    Returns:
        Tuple of (path to the exported .mp4, human-readable info line).

    Raises:
        gr.Error: If the prompt, or the selected mode's conditioning input,
            is missing.
    """
    if not prompt:
        raise gr.Error("Please provide a prompt.")

    generator = torch.Generator(device="cuda").manual_seed(int(seed))
    kwargs = {
        "prompt": prompt,
        "height": int(height),
        "width": int(width),
        "num_frames": int(num_frames),
        # Distilled model: run without classifier-free guidance.
        "guidance_scale": 1.0,
        "generator": generator,
        "output_type": "np",
        # One entry per pyramid stage; same step count for all three stages.
        "pyramid_num_inference_steps_list": [
            int(num_inference_steps),
            int(num_inference_steps),
            int(num_inference_steps),
        ],
        "is_amplify_first_chunk": is_amplify_first_chunk,
    }

    if mode == "Image-to-Video":
        # Fail loudly instead of silently falling back to text-to-video.
        if image_input is None:
            raise gr.Error("Please provide an image for Image-to-Video mode.")
        # Resize the conditioning image to the target output resolution.
        kwargs["image"] = load_image(image_input).resize((int(width), int(height)))
    elif mode == "Video-to-Video":
        if video_input is None:
            raise gr.Error("Please provide a video for Video-to-Video mode.")
        kwargs["video"] = load_video(video_input)

    t0 = time.time()
    output = pipe(**kwargs).frames[0]
    elapsed = time.time() - t0

    # delete=False: Gradio serves the file after we return, so it must persist.
    # Close the handle immediately so export_to_video owns the path (no fd leak).
    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    tmp.close()
    export_to_video(output, tmp.name, fps=24)

    info = f"Generated in {elapsed:.1f}s · {num_frames} frames · {height}×{width}"
    return tmp.name, info
| # --------------------------------------------------------------------------- | |
| # UI Setup | |
| # --------------------------------------------------------------------------- | |
def update_conditional_visibility(mode):
    """Show the image input only in I2V mode and the video input only in V2V mode."""
    show_image = mode == "Image-to-Video"
    show_video = mode == "Video-to-Video"
    return gr.update(visible=show_image), gr.update(visible=show_video)
# Minimal page styling: centered header and a capped overall content width.
CSS = """
#header { text-align: center; margin-bottom: 0.5em; }
#header h1 { font-size: 2.2em; margin-bottom: 0; }
.contain { max-width: 1350px; margin: 0 auto !important; }
"""
# Build the UI: left column holds all inputs, right column holds the output.
with gr.Blocks(css=CSS, title="Helios Video Generation", theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <div id="header">
            <h1>🎬 Helios 14B distilled</h1>
        </div>
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            # Generation mode drives which conditioning input is visible.
            mode = gr.Radio(
                choices=["Text-to-Video", "Image-to-Video", "Video-to-Video"],
                value="Text-to-Video",
                label="Generation Mode",
            )
            # Hidden by default (default mode is Text-to-Video); toggled by mode.change.
            image_input = gr.Image(label="Image (for I2V)", type="filepath", visible=False)
            video_input = gr.Video(label="Video (for V2V)", visible=False)
            prompt = gr.Textbox(
                label="Prompt",
                lines=4,
                value=(
                    "A vibrant tropical fish swimming gracefully among colorful coral reefs in "
                    "a clear, turquoise ocean. The fish has bright blue and yellow scales with a "
                    "small, distinctive orange spot on its side, its fins moving fluidly. The coral "
                    "reefs are alive with a variety of marine life, including small schools of "
                    "colorful fish and sea turtles gliding by. The water is crystal clear, allowing "
                    "for a view of the sandy ocean floor below. The reef itself is adorned with a mix "
                    "of hard and soft corals in shades of red, orange, and green. The photo captures "
                    "the fish from a slightly elevated angle, emphasizing its lively movements and the "
                    "vivid colors of its surroundings. A close-up shot with dynamic movement."
                )
            )
            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
                    # Resolution is fixed (interactive=False) — the model expects 384x640.
                    height = gr.Number(value=384, label="Height", precision=0, interactive=False)
                    width = gr.Number(value=640, label="Width", precision=0, interactive=False)
                with gr.Row():
                    # Frame count steps in increments of 33 — presumably the model's
                    # per-chunk frame count; TODO confirm against the pipeline docs.
                    num_frames = gr.Slider(33, 231, value=231, step=33, label="Num Frames")
                    num_inference_steps = gr.Slider(1, 10, value=2, step=1, label="Steps per stage")
                with gr.Row():
                    seed = gr.Number(value=42, label="Seed", precision=0)
                    is_amplify_first_chunk = gr.Checkbox(label="Amplify First Chunk", value=True)
            generate_btn = gr.Button("🚀 Generate Video", variant="primary", size="lg")
        with gr.Column(scale=1):
            video_output = gr.Video(label="Generated Video", autoplay=True)
            info_output = gr.Textbox(label="Info", interactive=False)

    # Swap conditioning inputs whenever the mode radio changes.
    mode.change(fn=update_conditional_visibility, inputs=[mode], outputs=[image_input, video_input])
    generate_btn.click(
        fn=generate_video,
        inputs=[mode, prompt, image_input, video_input, height, width, num_frames, num_inference_steps, seed, is_amplify_first_chunk],
        outputs=[video_output, info_output],
    )
    # Clicking an example fills only the mode and prompt fields.
    gr.Examples(
        examples=[
            [
                "Text-to-Video",
                "A vibrant tropical fish swimming gracefully among colorful coral reefs in "
                "a clear, turquoise ocean. The fish has bright blue and yellow scales with a "
                "small, distinctive orange spot on its side, its fins moving fluidly. The coral "
                "reefs are alive with a variety of marine life, including small schools of "
                "colorful fish and sea turtles gliding by. The water is crystal clear, allowing "
                "for a view of the sandy ocean floor below. The reef itself is adorned with a mix "
                "of hard and soft corals in shades of red, orange, and green. The photo captures "
                "the fish from a slightly elevated angle, emphasizing its lively movements and the "
                "vivid colors of its surroundings. A close-up shot with dynamic movement.",
            ],
            [
                "Text-to-Video",
                "An extreme close-up of an gray-haired man with a beard in his 60s, he is deep in "
                "thought pondering the history of the universe as he sits at a cafe in Paris, his eyes "
                "focus on people offscreen as they walk as he sits mostly motionless, he is dressed in "
                "a wool coat suit coat with a button-down shirt , he wears a brown beret and glasses "
                "and has a very professorial appearance, and the end he offers a subtle closed-mouth "
                "smile as if he found the answer to the mystery of life, the lighting is very cinematic "
                "with the golden light and the Parisian streets and city in the background, depth of "
                "field, cinematic 35mm film.",
            ],
            [
                "Text-to-Video",
                "A drone camera circles around a beautiful historic church built on a rocky outcropping "
                "along the Amalfi Coast, the view showcases historic and magnificent architectural "
                "details and tiered pathways and patios, waves are seen crashing against the rocks "
                "below as the view overlooks the horizon of the coastal waters and hilly landscapes "
                "of the Amalfi Coast Italy, several distant people are seen walking and enjoying vistas "
                "on patios of the dramatic ocean views, the warm glow of the afternoon sun creates a "
                "magical and romantic feeling to the scene, the view is stunning captured with beautiful photography.",
            ],
        ],
        inputs=[mode, prompt],
        label="Example Prompts",
    )
# Launch the Gradio server only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()