import tempfile
import time
import gradio as gr
import spaces
import torch
from diffusers import AutoencoderKLWan, HeliosDMDScheduler, HeliosPyramidPipeline
from diffusers.utils import export_to_video, load_image, load_video
# ---------------------------------------------------------------------------
# Pre-load model
# ---------------------------------------------------------------------------
MODEL_ID = "BestWishYsh/Helios-Distilled"
# VAE is loaded in float32 while the rest of the pipeline runs in bfloat16
# (presumably for decode quality — the pipeline is passed this VAE explicitly).
vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
scheduler = HeliosDMDScheduler.from_pretrained(MODEL_ID, subfolder="scheduler")
pipe = HeliosPyramidPipeline.from_pretrained(
    MODEL_ID, vae=vae, scheduler=scheduler, torch_dtype=torch.bfloat16, is_distilled=True
)
pipe.to("cuda")
# Prefer the FlashAttention-3 hub kernel; fall back to the generic flash hub
# backend when the FA3 kernel is unavailable on this GPU/driver.
try:
    pipe.transformer.set_attention_backend("_flash_3_hub")
except Exception:
    pipe.transformer.set_attention_backend("flash_hub")
# ----------------------------- JIT -----------------------------
# pipe.transformer.compile(mode="max-autotune-no-cudagraphs", dynamic=False)
# pipe.transformer.compile_repeated_blocks(fullgraph=True)
# ----------------------------- AoTI -----------------------------
# def make_exported_contiguous(exported):
# for key, val in exported.constants.items():
# if not val.is_contiguous():
# exported.constants[key] = val.contiguous()
# return exported
# @spaces.GPU(duration=1500) # maximum duration allowed during startup
# def compile():
# pipe("prompt", width=640, height=384, pyramid_num_inference_steps_list=[1, 1, 1])
# with spaces.aoti_capture(pipe.transformer) as call_low:
# pipe("prompt", width=160, height=96)
# exported_low = torch.export.export(pipe.transformer, args=call_low.args, kwargs=call_low.kwargs)
# exported_low = make_exported_contiguous(exported_low)
# compiled_low = spaces.aoti_compile(exported_low)
# with spaces.aoti_capture(pipe.transformer) as call_mid:
# pipe("prompt", width=320, height=192)
# exported_mid = torch.export.export(pipe.transformer, args=call_mid.args, kwargs=call_mid.kwargs)
# exported_mid = make_exported_contiguous(exported_mid)
# compiled_mid = spaces.aoti_compile(exported_mid)
# with spaces.aoti_capture(pipe.transformer) as call_high:
# pipe("prompt", width=640, height=384)
# exported_high = torch.export.export(pipe.transformer, args=call_high.args, kwargs=call_high.kwargs)
# exported_high = make_exported_contiguous(exported_high)
# compiled_high = spaces.aoti_compile(exported_high)
# # push_to_hub(compiled_low, "BestWishYsh/HeliosBench-Weights", "transformer_low.pt")
# # push_to_hub(compiled_mid, "BestWishYsh/HeliosBench-Weights", "transformer_mid.pt")
# # push_to_hub(compiled_high, "BestWishYsh/HeliosBench-Weights", "transformer_high.pt")
# compiled_mid.weights = compiled_low.weights
# compiled_high.weights = compiled_low.weights
# return compiled_low, compiled_mid, compiled_high
# compiled_low, compiled_mid, compiled_high = compile()
# def combined(*args, **kwargs):
# hidden_states = kwargs['hidden_states']
# if hidden_states.shape[-1] == 20:
# return compiled_low(*args, **kwargs)
# elif hidden_states.shape[-1] == 40:
# return compiled_mid(*args, **kwargs)
# else:
# return compiled_high(*args, **kwargs)
# spaces.aoti_apply(combined, pipe.transformer)
# ---------------------------------------------------------------------------
# Generation
# ---------------------------------------------------------------------------
@spaces.GPU(duration=120)
def generate_video(
    mode: str,
    prompt: str,
    image_input,
    video_input,
    height: int,
    width: int,
    num_frames: int,
    num_inference_steps: int,
    seed: int,
    is_amplify_first_chunk: bool,
    progress=gr.Progress(track_tqdm=True),
):
    """Run the Helios pipeline in the selected mode and export the result as MP4.

    Returns:
        (video_path, info): path to the exported .mp4 and a human-readable
        summary string for the two Gradio outputs.

    Raises:
        gr.Error: when the prompt is empty, or the selected mode requires an
        image/video that was not provided.
    """
    if not prompt:
        raise gr.Error("Please provide a prompt.")
    # Fail loudly instead of silently falling back to text-to-video when the
    # conditioning input for the chosen mode is missing.
    if mode == "Image-to-Video" and image_input is None:
        raise gr.Error("Please provide an image for Image-to-Video mode.")
    if mode == "Video-to-Video" and video_input is None:
        raise gr.Error("Please provide a video for Video-to-Video mode.")

    generator = torch.Generator(device="cuda").manual_seed(int(seed))
    steps = int(num_inference_steps)
    kwargs = {
        "prompt": prompt,
        "height": int(height),
        "width": int(width),
        "num_frames": int(num_frames),
        "guidance_scale": 1.0,  # distilled checkpoint: CFG effectively disabled
        "generator": generator,
        "output_type": "np",
        # One entry per pyramid stage; the same step count is used for all three.
        "pyramid_num_inference_steps_list": [steps, steps, steps],
        "is_amplify_first_chunk": is_amplify_first_chunk,
    }
    if mode == "Image-to-Video":
        # Conditioning image must match the generation resolution.
        kwargs["image"] = load_image(image_input).resize((int(width), int(height)))
    elif mode == "Video-to-Video":
        kwargs["video"] = load_video(video_input)

    t0 = time.time()
    output = pipe(**kwargs).frames[0]
    elapsed = time.time() - t0

    # Close the handle before the exporter writes to the path (avoids leaking
    # the descriptor and lets export_to_video own the file on all platforms);
    # delete=False keeps the file alive for Gradio to serve.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        video_path = tmp.name
    export_to_video(output, video_path, fps=24)
    info = f"Generated in {elapsed:.1f}s · {num_frames} frames · {height}×{width}"
    return video_path, info
# ---------------------------------------------------------------------------
# UI Setup
# ---------------------------------------------------------------------------
def update_conditional_visibility(mode):
    """Toggle the conditional inputs so only the one matching *mode* is shown.

    Returns a pair of gr.update objects for (image_input, video_input).
    """
    show_image = mode == "Image-to-Video"
    show_video = mode == "Video-to-Video"
    return gr.update(visible=show_image), gr.update(visible=show_video)
CSS = """
#header { text-align: center; margin-bottom: 1.5em; }
#header h1 { font-size: 2.2em; margin-bottom: 0.2em; }
.logo { max-height: 100px; margin: 0 auto 10px auto; display: block; }
.link-buttons { display: flex; justify-content: center; gap: 15px; margin-top: 10px; }
.link-buttons a {
background-color: #2b3137;
color: #ffffff !important;
padding: 8px 20px;
border-radius: 6px;
text-decoration: none;
font-weight: 600;
font-size: 1em;
transition: all 0.2s ease-in-out;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.link-buttons a:hover { background-color: #4a535c; transform: translateY(-1px); }
.contain { max-width: 1350px; margin: 0 auto !important; }
"""
# ---------------------------------------------------------------------------
# UI Setup
# ---------------------------------------------------------------------------
# NOTE: css= and theme= are gr.Blocks constructor arguments — Blocks.launch()
# does not accept them and raises TypeError at startup if they are passed there.
with gr.Blocks(title="Helios Video Generation", css=CSS, theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <div style='display: flex; align-items: center; justify-content: center; width: 100%;'>
            <img src="https://raw.githubusercontent.com/SHYuanBest/shyuanbest_media/main/Helios/logo_white.png" style='width: 400px; height: auto;' />
        </div>
        <div id="header">
            <h1>🎬 Helios 14B Distilled: Real Real-Time Long Video Generation Model</h1>
            <p style="font-size: 1.1em; color: #666; margin-top: 0.5em; margin-bottom: 1em;">
                If you like our project, please give us a star ⭐ on GitHub for the latest update.
            </p>
            <div class="link-buttons">
                <a href="https://github.com/PKU-YuanGroup/Helios" target="_blank">💻 Code</a>
                <a href="https://pku-yuangroup.github.io/Helios-Page" target="_blank">📄 Page</a>
                <a href="https://www.youtube.com/watch?v=vd_AgHtOUFQ" target="_blank">🎥 Main Feature</a>
                <a href="https://www.youtube.com/watch?v=1GeIU2Dn7UY" target="_blank">⚡ Inference Speed</a>
            </div>
        </div>
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            mode = gr.Radio(
                choices=["Text-to-Video", "Image-to-Video", "Video-to-Video"],
                value="Text-to-Video",
                label="Generation Mode",
            )
            # Conditional inputs; visibility is driven by the mode radio (see
            # mode.change below).
            image_input = gr.Image(label="Image (for I2V)", type="filepath", visible=False)
            video_input = gr.Video(label="Video (for V2V)", visible=False)
            prompt = gr.Textbox(
                label="Prompt",
                lines=4,
                value=(
                    "A vibrant tropical fish swimming gracefully among colorful coral reefs in "
                    "a clear, turquoise ocean. The fish has bright blue and yellow scales with a "
                    "small, distinctive orange spot on its side, its fins moving fluidly. The coral "
                    "reefs are alive with a variety of marine life, including small schools of "
                    "colorful fish and sea turtles gliding by. The water is crystal clear, allowing "
                    "for a view of the sandy ocean floor below. The reef itself is adorned with a mix "
                    "of hard and soft corals in shades of red, orange, and green. The photo captures "
                    "the fish from a slightly elevated angle, emphasizing its lively movements and the "
                    "vivid colors of its surroundings. A close-up shot with dynamic movement."
                ),
            )
            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
                    # Resolution is fixed for this checkpoint, hence interactive=False.
                    height = gr.Number(value=384, label="Height", precision=0, interactive=False)
                    width = gr.Number(value=640, label="Width", precision=0, interactive=False)
                with gr.Row():
                    num_frames = gr.Slider(33, 231, value=231, step=33, label="Num Frames")
                    num_inference_steps = gr.Slider(1, 10, value=2, step=1, label="Steps per stage")
                with gr.Row():
                    seed = gr.Number(value=42, label="Seed", precision=0)
                    is_amplify_first_chunk = gr.Checkbox(label="Amplify First Chunk", value=True)
            generate_btn = gr.Button("🚀 Generate Video", variant="primary", size="lg")
        with gr.Column(scale=1):
            video_output = gr.Video(label="Generated Video", autoplay=True)
            info_output = gr.Textbox(label="Info", interactive=False)

    mode.change(fn=update_conditional_visibility, inputs=[mode], outputs=[image_input, video_input])
    generate_btn.click(
        fn=generate_video,
        inputs=[
            mode,
            prompt,
            image_input,
            video_input,
            height,
            width,
            num_frames,
            num_inference_steps,
            seed,
            is_amplify_first_chunk,
        ],
        outputs=[video_output, info_output],
    )
    gr.Examples(
        examples=[
            [
                "Text-to-Video",
                "A vibrant tropical fish swimming gracefully among colorful coral reefs in "
                "a clear, turquoise ocean. The fish has bright blue and yellow scales with a "
                "small, distinctive orange spot on its side, its fins moving fluidly. The coral "
                "reefs are alive with a variety of marine life, including small schools of "
                "colorful fish and sea turtles gliding by. The water is crystal clear, allowing "
                "for a view of the sandy ocean floor below. The reef itself is adorned with a mix "
                "of hard and soft corals in shades of red, orange, and green. The photo captures "
                "the fish from a slightly elevated angle, emphasizing its lively movements and the "
                "vivid colors of its surroundings. A close-up shot with dynamic movement.",
                None,
                None,
            ],
            [
                "Text-to-Video",
                "An extreme close-up of an gray-haired man with a beard in his 60s, he is deep in "
                "thought pondering the history of the universe as he sits at a cafe in Paris, his eyes "
                "focus on people offscreen as they walk as he sits mostly motionless, he is dressed in "
                "a wool coat suit coat with a button-down shirt , he wears a brown beret and glasses "
                "and has a very professorial appearance, and the end he offers a subtle closed-mouth "
                "smile as if he found the answer to the mystery of life, the lighting is very cinematic "
                "with the golden light and the Parisian streets and city in the background, depth of "
                "field, cinematic 35mm film.",
                None,
                None,
            ],
            [
                "Text-to-Video",
                "A drone camera circles around a beautiful historic church built on a rocky outcropping "
                "along the Amalfi Coast, the view showcases historic and magnificent architectural "
                "details and tiered pathways and patios, waves are seen crashing against the rocks "
                "below as the view overlooks the horizon of the coastal waters and hilly landscapes "
                "of the Amalfi Coast Italy, several distant people are seen walking and enjoying vistas "
                "on patios of the dramatic ocean views, the warm glow of the afternoon sun creates a "
                "magical and romantic feeling to the scene, the view is stunning captured with beautiful photography.",
                None,
                None,
            ],
            [
                "Image-to-Video",
                "A towering emerald wave surges forward, its crest curling with raw power and energy. Sunlight glints off the translucent water, illuminating the intricate textures and deep green hues within the wave’s body. A thick spray erupts from the breaking crest, casting a misty veil that dances above the churning surface. As the perspective widens, the immense scale of the wave becomes apparent, revealing the restless expanse of the ocean stretching beyond. The scene captures the ocean’s untamed beauty and relentless force, with every droplet and ripple shimmering in the light. The dynamic motion and vivid colors evoke both awe and respect for nature’s might.",
                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/helios/wave.jpg",
                None,
            ],
            [
                "Video-to-Video",
                "A bright yellow Lamborghini Huracn Tecnica speeds along a curving mountain road, surrounded by lush green trees under a partly cloudy sky. The car's sleek design and vibrant color stand out against the natural backdrop, emphasizing its dynamic movement. The road curves gently, with a guardrail visible on one side, adding depth to the scene. The motion blur captures the sense of speed and energy, creating a thrilling and exhilarating atmosphere. A front-facing shot from a slightly elevated angle, highlighting the car's aggressive stance and the surrounding greenery.",
                None,
                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/helios/car.mp4",
            ],
        ],
        inputs=[mode, prompt, image_input, video_input],
        label="Example Prompts",
    )

if __name__ == "__main__":
    # Styling is configured on the gr.Blocks constructor above, so launch()
    # receives only arguments it actually supports.
    demo.launch(share=True)