Spaces: Running on Zero
| import os | |
| import subprocess | |
| import sys | |
| import time | |
| import tempfile | |
| import zipfile | |
| import torch | |
# ---------------------------------------------------------------------------
# Install private diffusers fork
#
# The Space ships a zipped snapshot of a diffusers branch that adds the
# Helios pipeline classes. On startup we unpack it next to the app and
# pip-install it; if pip fails, we fall back to putting the package's src/
# directory on sys.path so the `from diffusers import ...` below still works.
# ---------------------------------------------------------------------------
_APP_DIR = os.path.dirname(os.path.abspath(__file__))
ZIP_PATH = os.path.join(_APP_DIR, "helios_diffusers.zip")
EXTRACT_DIR = os.path.join(_APP_DIR, "_helios_diffusers")
# Top-level folder inside the archive (GitHub-style "<repo>-<branch>" layout).
_PKG_ROOT = os.path.join(EXTRACT_DIR, "diffusers-new-model-addition-helios-helios")

if not os.path.isdir(_PKG_ROOT):
    if not os.path.isfile(ZIP_PATH):
        # Fail early with a clear message instead of an opaque ImportError later.
        raise FileNotFoundError(f"Bundled diffusers archive not found: {ZIP_PATH}")
    print(f"[setup] Extracting {ZIP_PATH}")
    with zipfile.ZipFile(ZIP_PATH, "r") as zf:
        zf.extractall(EXTRACT_DIR)

# Install unconditionally: extraction may survive a restart while the pip
# install (or the sys.path fallback) does not.
print(f"[setup] Installing diffusers from {_PKG_ROOT}")
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", _PKG_ROOT])
except subprocess.CalledProcessError as e:
    print(f"[setup] pip install failed (exit {e.returncode}), falling back to sys.path")
    _SRC_DIR = os.path.join(_PKG_ROOT, "src")
    if os.path.isdir(_SRC_DIR):
        sys.path.insert(0, _SRC_DIR)
    else:
        # Neither install path worked; make the impending import failure traceable.
        print(f"[setup] WARNING: fallback source dir not found: {_SRC_DIR}")
| import gradio as gr | |
| import spaces | |
| from diffusers import ( | |
| AutoencoderKLWan, | |
| HeliosPyramidPipeline, | |
| HeliosDMDScheduler | |
| ) | |
| from diffusers.utils import export_to_video, load_image, load_video | |
| from aoti import aoti_load_ | |
# ---------------------------------------------------------------------------
# Pre-load model
#
# Loaded once at import time so the first user request doesn't pay the
# multi-GB download / weight-loading cost.
# ---------------------------------------------------------------------------
MODEL_ID = "BestWishYsh/Helios-Distilled"

# VAE is kept in float32 while the rest of the pipeline runs bfloat16 —
# presumably for decode numerical stability; TODO confirm against model card.
vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
scheduler = HeliosDMDScheduler.from_pretrained(MODEL_ID, subfolder="scheduler")
pipe = HeliosPyramidPipeline.from_pretrained(
    MODEL_ID,
    vae=vae,
    scheduler=scheduler,
    torch_dtype=torch.bfloat16,
    is_distilled=True  # distilled checkpoint: pipeline skips CFG-style branching
)
# aoti_load_(pipe.transformer, "multimodalart/helios-distilled-transformer", "helios_distilled_transformer.pt2")
pipe.to("cuda")
# Use the FlashAttention-3 kernel fetched from the Hub for the transformer.
pipe.transformer.set_attention_backend("_flash_3_hub")
| # --------------------------------------------------------------------------- | |
| # Generation | |
| # --------------------------------------------------------------------------- | |
@spaces.GPU  # required on ZeroGPU: CUDA is only available inside decorated calls
def generate_video(
    mode: str,
    prompt: str,
    image_input,
    video_input,
    height: int,
    width: int,
    num_frames: int,
    num_inference_steps: int,
    seed: int,
    is_amplify_first_chunk: bool,
    progress=gr.Progress(track_tqdm=True),
):
    """Run the Helios pipeline and return ``(video_path, info_string)``.

    Args:
        mode: "Text-to-Video", "Image-to-Video" or "Video-to-Video".
        prompt: Text prompt (required in every mode).
        image_input: Filepath of the conditioning image (I2V mode only).
        video_input: Filepath of the conditioning video (V2V mode only).
        height: Output height in pixels.
        width: Output width in pixels.
        num_frames: Number of frames to generate.
        num_inference_steps: Denoising steps applied per pyramid stage.
        seed: RNG seed for reproducible sampling.
        is_amplify_first_chunk: Pipeline flag, forwarded as-is.
        progress: Gradio progress tracker mirroring the pipeline's tqdm bars.

    Returns:
        Tuple of (path to the exported .mp4, human-readable info line).

    Raises:
        gr.Error: If the prompt, or the selected mode's conditioning input,
            is missing.
    """
    if not prompt:
        raise gr.Error("Please provide a prompt.")

    generator = torch.Generator(device="cuda").manual_seed(int(seed))
    kwargs = {
        "prompt": prompt,
        "height": int(height),
        "width": int(width),
        "num_frames": int(num_frames),
        # Distilled model: run without classifier-free guidance.
        "guidance_scale": 1.0,
        "generator": generator,
        "output_type": "np",
        # One entry per pyramid stage; same step count for all three stages.
        "pyramid_num_inference_steps_list": [
            int(num_inference_steps),
            int(num_inference_steps),
            int(num_inference_steps),
        ],
        "is_amplify_first_chunk": is_amplify_first_chunk,
    }

    if mode == "Image-to-Video":
        # Fail loudly instead of silently falling back to text-to-video.
        if image_input is None:
            raise gr.Error("Please provide an image for Image-to-Video mode.")
        # Resize the conditioning image to the target output resolution.
        kwargs["image"] = load_image(image_input).resize((int(width), int(height)))
    elif mode == "Video-to-Video":
        if video_input is None:
            raise gr.Error("Please provide a video for Video-to-Video mode.")
        kwargs["video"] = load_video(video_input)

    t0 = time.time()
    output = pipe(**kwargs).frames[0]
    elapsed = time.time() - t0

    # delete=False: Gradio serves the file after we return, so it must persist.
    # Close the handle immediately so export_to_video owns the path (no fd leak).
    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    tmp.close()
    export_to_video(output, tmp.name, fps=24)

    info = f"Generated in {elapsed:.1f}s · {num_frames} frames · {height}×{width}"
    return tmp.name, info
| # --------------------------------------------------------------------------- | |
| # UI Setup | |
| # --------------------------------------------------------------------------- | |
def update_conditional_visibility(mode):
    """Show the image input only in I2V mode and the video input only in V2V mode."""
    show_image = mode == "Image-to-Video"
    show_video = mode == "Video-to-Video"
    return gr.update(visible=show_image), gr.update(visible=show_video)
# Minimal page styling: centered header and a capped overall content width.
CSS = """
#header { text-align: center; margin-bottom: 0.5em; }
#header h1 { font-size: 2.2em; margin-bottom: 0; }
.contain { max-width: 1350px; margin: 0 auto !important; }
"""
# Build the UI: left column holds all inputs, right column holds the output.
with gr.Blocks(css=CSS, title="Helios Video Generation", theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <div id="header">
            <h1>🎬 Helios 14B distilled</h1>
        </div>
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            # Generation mode drives which conditioning input is visible.
            mode = gr.Radio(
                choices=["Text-to-Video", "Image-to-Video", "Video-to-Video"],
                value="Text-to-Video",
                label="Generation Mode",
            )
            # Hidden by default (default mode is Text-to-Video); toggled by mode.change.
            image_input = gr.Image(label="Image (for I2V)", type="filepath", visible=False)
            video_input = gr.Video(label="Video (for V2V)", visible=False)
            prompt = gr.Textbox(
                label="Prompt",
                lines=4,
                value=(
                    "A vibrant tropical fish swimming gracefully among colorful coral reefs in "
                    "a clear, turquoise ocean. The fish has bright blue and yellow scales with a "
                    "small, distinctive orange spot on its side, its fins moving fluidly. The coral "
                    "reefs are alive with a variety of marine life, including small schools of "
                    "colorful fish and sea turtles gliding by. The water is crystal clear, allowing "
                    "for a view of the sandy ocean floor below. The reef itself is adorned with a mix "
                    "of hard and soft corals in shades of red, orange, and green. The photo captures "
                    "the fish from a slightly elevated angle, emphasizing its lively movements and the "
                    "vivid colors of its surroundings. A close-up shot with dynamic movement."
                )
            )
            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
                    # Resolution is fixed (interactive=False) — the model expects 384x640.
                    height = gr.Number(value=384, label="Height", precision=0, interactive=False)
                    width = gr.Number(value=640, label="Width", precision=0, interactive=False)
                with gr.Row():
                    # Frame count steps in increments of 33 — presumably the model's
                    # per-chunk frame count; TODO confirm against the pipeline docs.
                    num_frames = gr.Slider(33, 231, value=231, step=33, label="Num Frames")
                    num_inference_steps = gr.Slider(1, 10, value=2, step=1, label="Steps per stage")
                with gr.Row():
                    seed = gr.Number(value=42, label="Seed", precision=0)
                    is_amplify_first_chunk = gr.Checkbox(label="Amplify First Chunk", value=True)
            generate_btn = gr.Button("🚀 Generate Video", variant="primary", size="lg")
        with gr.Column(scale=1):
            video_output = gr.Video(label="Generated Video", autoplay=True)
            info_output = gr.Textbox(label="Info", interactive=False)

    # Swap conditioning inputs whenever the mode radio changes.
    mode.change(fn=update_conditional_visibility, inputs=[mode], outputs=[image_input, video_input])
    generate_btn.click(
        fn=generate_video,
        inputs=[mode, prompt, image_input, video_input, height, width, num_frames, num_inference_steps, seed, is_amplify_first_chunk],
        outputs=[video_output, info_output],
    )
    # Clicking an example fills only the mode and prompt fields.
    gr.Examples(
        examples=[
            [
                "Text-to-Video",
                "A vibrant tropical fish swimming gracefully among colorful coral reefs in "
                "a clear, turquoise ocean. The fish has bright blue and yellow scales with a "
                "small, distinctive orange spot on its side, its fins moving fluidly. The coral "
                "reefs are alive with a variety of marine life, including small schools of "
                "colorful fish and sea turtles gliding by. The water is crystal clear, allowing "
                "for a view of the sandy ocean floor below. The reef itself is adorned with a mix "
                "of hard and soft corals in shades of red, orange, and green. The photo captures "
                "the fish from a slightly elevated angle, emphasizing its lively movements and the "
                "vivid colors of its surroundings. A close-up shot with dynamic movement.",
            ],
            [
                "Text-to-Video",
                "An extreme close-up of an gray-haired man with a beard in his 60s, he is deep in "
                "thought pondering the history of the universe as he sits at a cafe in Paris, his eyes "
                "focus on people offscreen as they walk as he sits mostly motionless, he is dressed in "
                "a wool coat suit coat with a button-down shirt , he wears a brown beret and glasses "
                "and has a very professorial appearance, and the end he offers a subtle closed-mouth "
                "smile as if he found the answer to the mystery of life, the lighting is very cinematic "
                "with the golden light and the Parisian streets and city in the background, depth of "
                "field, cinematic 35mm film.",
            ],
            [
                "Text-to-Video",
                "A drone camera circles around a beautiful historic church built on a rocky outcropping "
                "along the Amalfi Coast, the view showcases historic and magnificent architectural "
                "details and tiered pathways and patios, waves are seen crashing against the rocks "
                "below as the view overlooks the horizon of the coastal waters and hilly landscapes "
                "of the Amalfi Coast Italy, several distant people are seen walking and enjoying vistas "
                "on patios of the dramatic ocean views, the warm glow of the afternoon sun creates a "
                "magical and romantic feeling to the scene, the view is stunning captured with beautiful photography.",
            ],
        ],
        inputs=[mode, prompt],
        label="Example Prompts",
    )
# Launch the Gradio server only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()