File size: 10,113 Bytes
1775cd6
16ca8db
 
1775cd6
 
16ca8db
9f13b69
1775cd6
5d3bb27
9f13b69
5d3bb27
f629787
 
 
dacc5a0
f629787
 
 
5d3bb27
 
 
f629787
8ddf1ab
 
 
 
 
 
 
 
5d3bb27
1775cd6
ee824ed
1775cd6
 
 
7b20338
1775cd6
 
14237e0
1775cd6
 
9f13b69
1775cd6
 
 
 
7b20338
 
 
 
 
 
 
 
14237e0
34757ae
14237e0
1775cd6
53123ce
cb99a0b
1775cd6
 
9f13b69
1775cd6
ee824ed
1775cd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ea2322
9f13b69
9ea2322
 
 
 
 
 
 
 
 
1775cd6
 
 
b3d4063
1775cd6
 
 
 
 
 
a24b191
1775cd6
 
 
 
 
 
 
 
 
 
 
9f13b69
 
1775cd6
 
 
d792322
441905a
 
 
 
 
 
 
 
 
d792322
1775cd6
 
 
34c9450
 
1775cd6
441905a
9f13b69
1775cd6
 
9f13b69
1775cd6
 
 
 
 
 
 
9f13b69
1775cd6
 
9f13b69
1775cd6
 
 
 
 
 
 
441905a
 
 
 
 
 
 
 
 
1775cd6
 
 
441905a
 
 
 
 
 
 
 
1775cd6
 
 
441905a
 
 
 
 
 
 
1775cd6
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
import os
import subprocess
import sys
import time
import tempfile
import zipfile
import torch

# ---------------------------------------------------------------------------
# Install private diffusers fork
# ---------------------------------------------------------------------------
# The Helios pipeline lives in a diffusers fork shipped alongside this app as
# a zip archive. At startup we extract it, try a real pip install, and fall
# back to a sys.path insertion if pip fails.
_APP_DIR = os.path.dirname(os.path.abspath(__file__))
ZIP_PATH = os.path.join(_APP_DIR, "helios_diffusers.zip")
EXTRACT_DIR = os.path.join(_APP_DIR, "_helios_diffusers")
# Top-level directory inside the archive (GitHub-style "<repo>-<branch>" name).
_PKG_ROOT = os.path.join(EXTRACT_DIR, "diffusers-new-model-addition-helios-helios")

# Extract only once; the extracted tree acts as the "already done" marker.
if not os.path.isdir(_PKG_ROOT):
    print(f"[setup] Extracting {ZIP_PATH}")
    with zipfile.ZipFile(ZIP_PATH, "r") as zf:
        zf.extractall(EXTRACT_DIR)

print(f"[setup] Installing diffusers from {_PKG_ROOT}")
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", _PKG_ROOT])
except subprocess.CalledProcessError as e:
    # Best-effort: a failed install is logged, not fatal — the sys.path
    # fallback below still makes the package importable.
    print(f"[setup] pip install failed (exit {e.returncode}), falling back to sys.path")

# NOTE(review): this path insertion runs even when pip succeeded, so the
# in-tree sources shadow the installed wheel — presumably intentional
# (guarantees the fork's code wins); confirm before changing.
_SRC_DIR = os.path.join(_PKG_ROOT, "src")
if os.path.isdir(_SRC_DIR):
    sys.path.insert(0, _SRC_DIR)

import gradio as gr
import spaces
from diffusers import (
    AutoencoderKLWan,
    HeliosPyramidPipeline,
    HeliosDMDScheduler
)
from diffusers.utils import export_to_video, load_image, load_video
from aoti import aoti_load_

# ---------------------------------------------------------------------------
# Pre-load model
# ---------------------------------------------------------------------------
# Model components are loaded at import time so the first request is fast.
MODEL_ID = "BestWishYsh/Helios-Distilled"

# VAE is kept in float32 while the rest of the pipeline runs in bfloat16
# (mixed precision chosen per-component here).
vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
scheduler = HeliosDMDScheduler.from_pretrained(MODEL_ID, subfolder="scheduler")
pipe = HeliosPyramidPipeline.from_pretrained(
    MODEL_ID, 
    vae=vae, 
    scheduler=scheduler,
    torch_dtype=torch.bfloat16,
    is_distilled=True
)

# Optional ahead-of-time compiled transformer; disabled for now.
# aoti_load_(pipe.transformer, "multimodalart/helios-distilled-transformer", "helios_distilled_transformer.pt2")

pipe.to("cuda")

# Use the Flash Attention 3 backend fetched from the Hub for the transformer.
pipe.transformer.set_attention_backend("_flash_3_hub")

# ---------------------------------------------------------------------------
# Generation
# ---------------------------------------------------------------------------
@spaces.GPU(duration=300)
def generate_video(
    mode: str,
    prompt: str,
    image_input,
    video_input,
    height: int,
    width: int,
    num_frames: int,
    num_inference_steps: int,
    seed: int,
    is_amplify_first_chunk: bool,
    progress=gr.Progress(track_tqdm=True),
):
    """Run the Helios pipeline and return ``(video_path, info_string)``.

    Args:
        mode: One of "Text-to-Video", "Image-to-Video", "Video-to-Video".
        prompt: Text prompt (required for every mode).
        image_input: Image filepath for I2V mode, else None.
        video_input: Video filepath for V2V mode, else None.
        height / width: Output resolution in pixels.
        num_frames: Number of frames to generate.
        num_inference_steps: Steps per pyramid stage (applied to all 3 stages).
        seed: RNG seed for the CUDA generator.
        is_amplify_first_chunk: Pipeline flag forwarded as-is.
        progress: Gradio progress tracker (tqdm-backed).

    Raises:
        gr.Error: If the prompt is empty, or the conditioning input required
            by the selected mode is missing.
    """
    if not prompt:
        raise gr.Error("Please provide a prompt.")
    # Fail loudly instead of silently falling back to text-to-video when the
    # conditioning input for the selected mode is missing (the original code
    # just skipped the kwarg, which is confusing for users).
    if mode == "Image-to-Video" and image_input is None:
        raise gr.Error("Please provide an image for Image-to-Video mode.")
    if mode == "Video-to-Video" and video_input is None:
        raise gr.Error("Please provide a video for Video-to-Video mode.")

    generator = torch.Generator(device="cuda").manual_seed(int(seed))

    kwargs = {
        "prompt": prompt,
        "height": int(height),
        "width": int(width),
        "num_frames": int(num_frames),
        "guidance_scale": 1.0,
        "generator": generator,
        "output_type": "np",
        # Same step count for each of the three pyramid stages.
        "pyramid_num_inference_steps_list": [
            int(num_inference_steps),
            int(num_inference_steps),
            int(num_inference_steps),
        ],
        "is_amplify_first_chunk": is_amplify_first_chunk,
    }

    if mode == "Image-to-Video":
        # Resize the conditioning image to the target output resolution.
        img = load_image(image_input).resize((int(width), int(height)))
        kwargs["image"] = img
    elif mode == "Video-to-Video":
        kwargs["video"] = load_video(video_input)

    t0 = time.time()
    output = pipe(**kwargs).frames[0]
    elapsed = time.time() - t0

    # delete=False so Gradio can serve the file after this handler returns.
    # Close the handle immediately: we only need the path, and leaving it
    # open leaks a file descriptor (and blocks the write on Windows).
    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    tmp.close()
    export_to_video(output, tmp.name, fps=24)
    info = f"Generated in {elapsed:.1f}s · {num_frames} frames · {height}×{width}"
    return tmp.name, info

# ---------------------------------------------------------------------------
# UI Setup
# ---------------------------------------------------------------------------
def update_conditional_visibility(mode):
    """Return visibility updates for (image_input, video_input) given *mode*.

    The image picker is shown only in Image-to-Video mode and the video
    picker only in Video-to-Video mode; any other mode hides both.
    """
    show_image = mode == "Image-to-Video"
    show_video = mode == "Video-to-Video"
    return gr.update(visible=show_image), gr.update(visible=show_video)

# Minimal page styling: centered header and a capped content width.
CSS = """
#header { text-align: center; margin-bottom: 0.5em; }
#header h1 { font-size: 2.2em; margin-bottom: 0; }
.contain { max-width: 1350px; margin: 0 auto !important; }
"""

# Gradio UI: left column = inputs/settings, right column = output video.
with gr.Blocks(css=CSS, title="Helios Video Generation", theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <div id="header">
            <h1>🎬 Helios 14B distilled</h1>
        </div>
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            mode = gr.Radio(
                choices=["Text-to-Video", "Image-to-Video", "Video-to-Video"],
                value="Text-to-Video",
                label="Generation Mode",
            )
            # Conditioning inputs start hidden; mode.change below toggles them.
            image_input = gr.Image(label="Image (for I2V)", type="filepath", visible=False)
            video_input = gr.Video(label="Video (for V2V)", visible=False)
            prompt = gr.Textbox(
                label="Prompt",
                lines=4,
                value=(
                    "A vibrant tropical fish swimming gracefully among colorful coral reefs in "
                    "a clear, turquoise ocean. The fish has bright blue and yellow scales with a "
                    "small, distinctive orange spot on its side, its fins moving fluidly. The coral "
                    "reefs are alive with a variety of marine life, including small schools of "
                    "colorful fish and sea turtles gliding by. The water is crystal clear, allowing "
                    "for a view of the sandy ocean floor below. The reef itself is adorned with a mix "
                    "of hard and soft corals in shades of red, orange, and green. The photo captures "
                    "the fish from a slightly elevated angle, emphasizing its lively movements and the "
                    "vivid colors of its surroundings. A close-up shot with dynamic movement."
                )
            )
            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
                    # Resolution is fixed (non-interactive) — the model targets 384x640.
                    height = gr.Number(value=384, label="Height", precision=0, interactive=False)
                    width = gr.Number(value=640, label="Width", precision=0, interactive=False)
                with gr.Row():
                    # Frame count steps in chunks of 33 (one pyramid chunk per step).
                    num_frames = gr.Slider(33, 231, value=231, step=33, label="Num Frames")
                    num_inference_steps = gr.Slider(1, 10, value=2, step=1, label="Steps per stage")
                with gr.Row():
                    seed = gr.Number(value=42, label="Seed", precision=0)
                    is_amplify_first_chunk = gr.Checkbox(label="Amplify First Chunk", value=True)

            generate_btn = gr.Button("🚀 Generate Video", variant="primary", size="lg")

        with gr.Column(scale=1):
            video_output = gr.Video(label="Generated Video", autoplay=True)
            info_output = gr.Textbox(label="Info", interactive=False)

    # Event wiring: mode selection toggles conditioning inputs; the button
    # runs the full generation and fills the output video + info textbox.
    mode.change(fn=update_conditional_visibility, inputs=[mode], outputs=[image_input, video_input])
    generate_btn.click(
        fn=generate_video,
        inputs=[mode, prompt, image_input, video_input, height, width, num_frames, num_inference_steps, seed, is_amplify_first_chunk],
        outputs=[video_output, info_output],
    )

    # Click-to-fill example prompts (text-to-video only; they set mode + prompt).
    gr.Examples(
        examples=[
            [
                "Text-to-Video",
                "A vibrant tropical fish swimming gracefully among colorful coral reefs in "
                "a clear, turquoise ocean. The fish has bright blue and yellow scales with a "
                "small, distinctive orange spot on its side, its fins moving fluidly. The coral "
                "reefs are alive with a variety of marine life, including small schools of "
                "colorful fish and sea turtles gliding by. The water is crystal clear, allowing "
                "for a view of the sandy ocean floor below. The reef itself is adorned with a mix "
                "of hard and soft corals in shades of red, orange, and green. The photo captures "
                "the fish from a slightly elevated angle, emphasizing its lively movements and the "
                "vivid colors of its surroundings. A close-up shot with dynamic movement.",
            ],
            [
                "Text-to-Video",
                "An extreme close-up of an gray-haired man with a beard in his 60s, he is deep in "
                "thought pondering the history of the universe as he sits at a cafe in Paris, his eyes "
                "focus on people offscreen as they walk as he sits mostly motionless, he is dressed in "
                "a wool coat suit coat with a button-down shirt , he wears a brown beret and glasses "
                "and has a very professorial appearance, and the end he offers a subtle closed-mouth "
                "smile as if he found the answer to the mystery of life, the lighting is very cinematic "
                "with the golden light and the Parisian streets and city in the background, depth of "
                "field, cinematic 35mm film.",
            ],
            [
                "Text-to-Video",
                "A drone camera circles around a beautiful historic church built on a rocky outcropping "
                "along the Amalfi Coast, the view showcases historic and magnificent architectural "
                "details and tiered pathways and patios, waves are seen crashing against the rocks "
                "below as the view overlooks the horizon of the coastal waters and hilly landscapes "
                "of the Amalfi Coast Italy, several distant people are seen walking and enjoying vistas "
                "on patios of the dramatic ocean views, the warm glow of the afternoon sun creates a "
                "magical and romantic feeling to the scene, the view is stunning captured with beautiful photography.",
            ],
        ],
        inputs=[mode, prompt],
        label="Example Prompts",
    )

if __name__ == "__main__":
    demo.launch()