alexnasa committed on
Commit
abb49c0
·
verified ·
1 Parent(s): fa5da10

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +265 -230
app.py CHANGED
@@ -1,230 +1,265 @@
1
- import spaces
2
- import gradio as gr
3
- import torch
4
- import argparse
5
- from ovi.ovi_fusion_engine import OviFusionEngine, DEFAULT_CONFIG
6
- from diffusers import FluxPipeline
7
- import tempfile
8
- from ovi.utils.io_utils import save_video
9
- from ovi.utils.processing_utils import clean_text, scale_hw_to_area_divisible
10
- from huggingface_hub import snapshot_download
11
- import os
12
-
13
- # ----------------------------
14
- # Parse CLI Args
15
- # ----------------------------
16
- parser = argparse.ArgumentParser(description="Ovi Joint Video + Audio Gradio Demo")
17
- parser.add_argument(
18
- "--use_image_gen",
19
- action="store_true",
20
- help="Enable image generation UI with FluxPipeline"
21
- )
22
- parser.add_argument(
23
- "--cpu_offload",
24
- action="store_true",
25
- help="Enable CPU offload for both OviFusionEngine and FluxPipeline"
26
- )
27
- args = parser.parse_args()
28
-
29
- ckpt_dir = "./ckpts"
30
-
31
- # Wan2.2
32
- wan_dir = os.path.join(ckpt_dir, "Wan2.2-TI2V-5B")
33
- snapshot_download(
34
- repo_id="Wan-AI/Wan2.2-TI2V-5B",
35
- local_dir=wan_dir,
36
- allow_patterns=[
37
- "google/*",
38
- "models_t5_umt5-xxl-enc-bf16.pth",
39
- "Wan2.2_VAE.pth"
40
- ]
41
- )
42
-
43
- # MMAudio
44
- mm_audio_dir = os.path.join(ckpt_dir, "MMAudio")
45
- snapshot_download(
46
- repo_id="hkchengrex/MMAudio",
47
- local_dir=mm_audio_dir,
48
- allow_patterns=[
49
- "ext_weights/best_netG.pt",
50
- "ext_weights/v1-16.pth"
51
- ]
52
- )
53
-
54
- ovi_dir = os.path.join(ckpt_dir, "Ovi")
55
- snapshot_download(
56
- repo_id="chetwinlow1/Ovi",
57
- local_dir=ovi_dir,
58
- allow_patterns=[
59
- "model.safetensors"
60
- ]
61
- )
62
-
63
- # Initialize OviFusionEngine
64
- enable_cpu_offload = args.cpu_offload or args.use_image_gen
65
- use_image_gen = args.use_image_gen
66
- print(f"loading model... {enable_cpu_offload=}, {use_image_gen=} for gradio demo")
67
- DEFAULT_CONFIG['cpu_offload'] = enable_cpu_offload # always use cpu offload if image generation is enabled
68
- DEFAULT_CONFIG['mode'] = "t2v" # hardcoded since it is always cpu offloaded
69
- ovi_engine = OviFusionEngine()
70
- flux_model = None
71
- if use_image_gen:
72
- flux_model = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-Krea-dev", torch_dtype=torch.bfloat16)
73
- flux_model.enable_model_cpu_offload() #save some VRAM by offloading the model to CPU. Remove this if you have enough GPU VRAM
74
- print("loaded model")
75
-
76
-
77
- @spaces.GPU()
78
- def generate_video(
79
- text_prompt,
80
- image,
81
- video_frame_height,
82
- video_frame_width,
83
- video_seed,
84
- solver_name,
85
- sample_steps,
86
- shift,
87
- video_guidance_scale,
88
- audio_guidance_scale,
89
- slg_layer,
90
- video_negative_prompt,
91
- audio_negative_prompt,
92
- ):
93
- try:
94
- image_path = None
95
- if image is not None:
96
- image_path = image
97
-
98
- generated_video, generated_audio, _ = ovi_engine.generate(
99
- text_prompt=text_prompt,
100
- image_path=image_path,
101
- video_frame_height_width=[video_frame_height, video_frame_width],
102
- seed=video_seed,
103
- solver_name=solver_name,
104
- sample_steps=sample_steps,
105
- shift=shift,
106
- video_guidance_scale=video_guidance_scale,
107
- audio_guidance_scale=audio_guidance_scale,
108
- slg_layer=slg_layer,
109
- video_negative_prompt=video_negative_prompt,
110
- audio_negative_prompt=audio_negative_prompt,
111
- )
112
-
113
- tmpfile = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
114
- output_path = tmpfile.name
115
- save_video(output_path, generated_video, generated_audio, fps=24, sample_rate=16000)
116
-
117
- return output_path
118
- except Exception as e:
119
- print(f"Error during video generation: {e}")
120
- return None
121
-
122
-
123
- def generate_image(text_prompt, image_seed, image_height, image_width):
124
- if flux_model is None:
125
- return None
126
- text_prompt = clean_text(text_prompt)
127
- print(f"Generating image with prompt='{text_prompt}', seed={image_seed}, size=({image_height},{image_width})")
128
-
129
- image_h, image_w = scale_hw_to_area_divisible(image_height, image_width, area=1024 * 1024)
130
- image = flux_model(
131
- text_prompt,
132
- height=image_h,
133
- width=image_w,
134
- guidance_scale=4.5,
135
- generator=torch.Generator().manual_seed(int(image_seed))
136
- ).images[0]
137
-
138
- tmpfile = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
139
- image.save(tmpfile.name)
140
- return tmpfile.name
141
-
142
-
143
- # Build UI
144
- with gr.Blocks() as demo:
145
- gr.Markdown("# 🎥 Ovi Joint Video + Audio Generation Demo")
146
- gr.Markdown(
147
- """
148
- ## 📘 Instructions
149
-
150
- Follow the steps in order:
151
-
152
- 1️⃣ **Enter a Text Prompt** — describe your video. (This text prompt will be shared for image generation if enabled.)
153
- 2️⃣ **Upload or Generate an Image** — Upload an image or generate one if image generation is enabled. (If you do not see the image generation options, make sure to run the script with `--use_image_gen`.)
154
- 3️⃣ **Configure Video Options** — set resolution, seed, solver, and other parameters. (It will automatically use the uploaded/generated image as the first frame, whichever is rendered on your screen at the time of video generation.)
155
- 4️⃣ **Generate Video** — click the button to produce your final video with audio.
156
- 5️⃣ **View the Result** — your generated video will appear below.
157
-
158
- ---
159
-
160
- ### 💡 Tips
161
- 1. For best results, use detailed and specific text prompts.
162
- 2. Ensure text prompt format is correct, i.e speech to be said should be wrapped with `<S>...<E>`. Can provide optional audio description at the end, wrapping them in `<AUDCAP> ... <ENDAUDCAP>`, refer to examples
163
- 3. Do not be discouraged by bad or weird results, check prompt format and try different seeds, cfg values and slg layers.
164
- """
165
- )
166
-
167
-
168
- with gr.Row():
169
- with gr.Column():
170
- # Image section
171
- image = gr.Image(type="filepath", label="First Frame Image (upload or generate)")
172
-
173
- if args.use_image_gen:
174
- with gr.Accordion("🖼️ Image Generation Options", visible=True):
175
- image_text_prompt = gr.Textbox(label="Image Prompt", placeholder="Describe the image you want to generate...")
176
- image_seed = gr.Number(minimum=0, maximum=100000, value=42, label="Image Seed")
177
- image_height = gr.Number(minimum=128, maximum=1280, value=720, step=32, label="Image Height")
178
- image_width = gr.Number(minimum=128, maximum=1280, value=1280, step=32, label="Image Width")
179
- gen_img_btn = gr.Button("Generate Image 🎨")
180
- else:
181
- gen_img_btn = None
182
-
183
- with gr.Accordion("🎬 Video Generation Options", open=True):
184
- video_text_prompt = gr.Textbox(label="Video Prompt", placeholder="Describe your video...")
185
- video_height = gr.Number(minimum=128, maximum=1280, value=512, step=32, label="Video Height")
186
- video_width = gr.Number(minimum=128, maximum=1280, value=992, step=32, label="Video Width")
187
-
188
- video_seed = gr.Number(minimum=0, maximum=100000, value=100, label="Video Seed")
189
- solver_name = gr.Dropdown(
190
- choices=["unipc", "euler", "dpm++"], value="unipc", label="Solver Name"
191
- )
192
- sample_steps = gr.Number(
193
- value=50,
194
- label="Sample Steps",
195
- precision=0,
196
- minimum=20,
197
- maximum=100
198
- )
199
- shift = gr.Slider(minimum=0.0, maximum=20.0, value=5.0, step=1.0, label="Shift")
200
- video_guidance_scale = gr.Slider(minimum=0.0, maximum=10.0, value=4.0, step=0.5, label="Video Guidance Scale")
201
- audio_guidance_scale = gr.Slider(minimum=0.0, maximum=10.0, value=3.0, step=0.5, label="Audio Guidance Scale")
202
- slg_layer = gr.Number(minimum=-1, maximum=30, value=11, step=1, label="SLG Layer")
203
- video_negative_prompt = gr.Textbox(label="Video Negative Prompt", placeholder="Things to avoid in video")
204
- audio_negative_prompt = gr.Textbox(label="Audio Negative Prompt", placeholder="Things to avoid in audio")
205
-
206
- run_btn = gr.Button("Generate Video 🚀")
207
-
208
- with gr.Column():
209
- output_path = gr.Video(label="Generated Video")
210
-
211
- if args.use_image_gen and gen_img_btn is not None:
212
- gen_img_btn.click(
213
- fn=generate_image,
214
- inputs=[image_text_prompt, image_seed, image_height, image_width],
215
- outputs=[image],
216
- )
217
-
218
- # Hook up video generation
219
- run_btn.click(
220
- fn=generate_video,
221
- inputs=[
222
- video_text_prompt, image, video_height, video_width, video_seed, solver_name,
223
- sample_steps, shift, video_guidance_scale, audio_guidance_scale,
224
- slg_layer, video_negative_prompt, audio_negative_prompt,
225
- ],
226
- outputs=[output_path],
227
- )
228
-
229
- if __name__ == "__main__":
230
- demo.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import spaces
import gradio as gr
import argparse
from ovi.ovi_fusion_engine import OviFusionEngine, DEFAULT_CONFIG
from diffusers import FluxPipeline
import tempfile
from ovi.utils.io_utils import save_video
from ovi.utils.processing_utils import clean_text, scale_hw_to_area_divisible
# FIX: hf_hub_download is used below but was never imported (NameError at startup).
from huggingface_hub import hf_hub_download, snapshot_download
import os
# FIX: site and importlib are used immediately below but were never imported at
# module top (the original only imported them *inside* the try block, after the
# first use) — this crashed with NameError before the app could start.
import site
import importlib
import subprocess
import sys

# Re-discover all .pth/.egg-link files so packages installed at runtime become importable.
for sitedir in site.getsitepackages():
    site.addsitedir(sitedir)

# Clear caches so importlib will pick up new modules.
importlib.invalidate_caches()


def sh(cmd):
    """Run *cmd*, a list of argv tokens, raising CalledProcessError on failure.

    List form (shell=False) avoids handing an interpolated string to a shell.
    """
    subprocess.check_call(cmd)


flash_attention_installed = False

# Best-effort FlashAttention install: the demo still works without it, so any
# failure here is logged and swallowed.
try:
    print("Attempting to download and install FlashAttention wheel...")
    flash_attention_wheel = hf_hub_download(
        repo_id="alexnasa/flash-attn-3",
        repo_type="model",
        filename="128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl",
    )

    # Install into the *current* interpreter's environment explicitly.
    sh([sys.executable, "-m", "pip", "install", flash_attention_wheel])

    # Tell Python to re-scan site-packages now that the wheel is installed.
    site.addsitedir(site.getsitepackages()[0])
    importlib.invalidate_caches()

    flash_attention_installed = True
    print("FlashAttention installed successfully.")

except Exception as e:
    print(f"⚠️ Could not install FlashAttention: {e}")
    print("Continuing without FlashAttention...")

# torch is imported *after* the optional flash-attn install so that, if the wheel
# was installed, torch extension discovery can see it.
import torch
print(f"Torch version: {torch.__version__}")
print(f"FlashAttention available: {flash_attention_installed}")
47
+
48
# ----------------------------
# Parse CLI Args
# ----------------------------
# Both options are boolean flags; their defaults (False) keep the demo in
# plain text-to-video mode with no CPU offload.
parser = argparse.ArgumentParser(description="Ovi Joint Video + Audio Gradio Demo")
for _flag, _help_text in (
    ("--use_image_gen", "Enable image generation UI with FluxPipeline"),
    ("--cpu_offload", "Enable CPU offload for both OviFusionEngine and FluxPipeline"),
):
    parser.add_argument(_flag, action="store_true", help=_help_text)
args = parser.parse_args()
63
+
64
ckpt_dir = "./ckpts"

# (repo_id, local subdirectory, files to fetch) for every checkpoint set the
# engine needs. Download order matches the original: Wan2.2, MMAudio, Ovi.
_CHECKPOINT_SPECS = (
    ("Wan-AI/Wan2.2-TI2V-5B", "Wan2.2-TI2V-5B",
     ["google/*", "models_t5_umt5-xxl-enc-bf16.pth", "Wan2.2_VAE.pth"]),
    ("hkchengrex/MMAudio", "MMAudio",
     ["ext_weights/best_netG.pt", "ext_weights/v1-16.pth"]),
    ("chetwinlow1/Ovi", "Ovi",
     ["model.safetensors"]),
)

_local_dirs = {}
for _repo_id, _subdir, _patterns in _CHECKPOINT_SPECS:
    _target = os.path.join(ckpt_dir, _subdir)
    _local_dirs[_subdir] = _target
    snapshot_download(repo_id=_repo_id, local_dir=_target, allow_patterns=_patterns)

# Preserve the original module-level names for any downstream consumers.
wan_dir = _local_dirs["Wan2.2-TI2V-5B"]
mm_audio_dir = _local_dirs["MMAudio"]
ovi_dir = _local_dirs["Ovi"]
97
+
98
# Initialize OviFusionEngine
# Image generation implies CPU offload: Flux and the Ovi engine are not expected
# to fit in GPU memory together.
enable_cpu_offload = args.cpu_offload or args.use_image_gen
use_image_gen = args.use_image_gen
print(f"loading model... {enable_cpu_offload=}, {use_image_gen=} for gradio demo")
# NOTE(review): DEFAULT_CONFIG is a module-level dict imported from
# ovi.ovi_fusion_engine; mutating it here configures the engine constructed
# just below — this must run before OviFusionEngine(). Presumably "t2v" forces
# text-to-video weights even when an image is supplied — confirm against the engine.
DEFAULT_CONFIG['cpu_offload'] = enable_cpu_offload # always use cpu offload if image generation is enabled
DEFAULT_CONFIG['mode'] = "t2v" # hardcoded since it is always cpu offloaded
ovi_engine = OviFusionEngine()
# flux_model stays None unless --use_image_gen was passed; generate_image()
# checks for None and no-ops in that case.
flux_model = None
if use_image_gen:
    flux_model = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-Krea-dev", torch_dtype=torch.bfloat16)
    flux_model.enable_model_cpu_offload() #save some VRAM by offloading the model to CPU. Remove this if you have enough GPU VRAM
print("loaded model")
110
+
111
+
112
@spaces.GPU()
def generate_video(
    text_prompt,
    image,
    video_frame_height,
    video_frame_width,
    video_seed,
    solver_name,
    sample_steps,
    shift,
    video_guidance_scale,
    audio_guidance_scale,
    slg_layer,
    video_negative_prompt,
    audio_negative_prompt,
):
    """Run the Ovi fusion engine and mux the result into a temporary .mp4.

    Parameters mirror the Gradio controls wired up in the UI; ``image`` is a
    filepath string (or None) as produced by gr.Image(type="filepath").

    Returns:
        Path to the generated .mp4 on success, or None on any failure — the
        gr.Video output component treats None as "no video".
    """
    try:
        # gr.Image(type="filepath") already hands us a path string or None.
        image_path = image if image is not None else None

        generated_video, generated_audio, _ = ovi_engine.generate(
            text_prompt=text_prompt,
            image_path=image_path,
            video_frame_height_width=[video_frame_height, video_frame_width],
            seed=video_seed,
            solver_name=solver_name,
            sample_steps=sample_steps,
            shift=shift,
            video_guidance_scale=video_guidance_scale,
            audio_guidance_scale=audio_guidance_scale,
            slg_layer=slg_layer,
            video_negative_prompt=video_negative_prompt,
            audio_negative_prompt=audio_negative_prompt,
        )

        # FIX: mkstemp + close instead of NamedTemporaryFile(delete=False),
        # which kept an open file handle for the process lifetime (fd leak per call).
        fd, output_path = tempfile.mkstemp(suffix=".mp4")
        os.close(fd)
        save_video(output_path, generated_video, generated_audio, fps=24, sample_rate=16000)

        return output_path
    except Exception as e:
        # Keep the original contract (return None so the UI clears the player),
        # but log the full traceback instead of only the message string.
        import traceback
        print(f"Error during video generation: {e}")
        traceback.print_exc()
        return None
156
+
157
+
158
def generate_image(text_prompt, image_seed, image_height, image_width):
    """Generate a first-frame image with the Flux pipeline.

    Returns the path to a temporary .png, or None when image generation is
    disabled (flux_model is None, i.e. --use_image_gen was not passed).
    """
    if flux_model is None:
        return None
    text_prompt = clean_text(text_prompt)
    print(f"Generating image with prompt='{text_prompt}', seed={image_seed}, size=({image_height},{image_width})")

    # Snap the requested size to a model-friendly area of ~1 megapixel.
    image_h, image_w = scale_hw_to_area_divisible(image_height, image_width, area=1024 * 1024)
    image = flux_model(
        text_prompt,
        height=image_h,
        width=image_w,
        guidance_scale=4.5,
        generator=torch.Generator().manual_seed(int(image_seed))
    ).images[0]

    # FIX: mkstemp + close instead of NamedTemporaryFile(delete=False), which
    # left an open handle for the process lifetime (fd leak per call); PIL
    # reopens the path itself on save().
    fd, out_path = tempfile.mkstemp(suffix=".png")
    os.close(fd)
    image.save(out_path)
    return out_path
176
+
177
+
178
# Build UI
# Layout: two columns — all inputs (image + video options) on the left,
# the rendered video on the right.
with gr.Blocks() as demo:
    gr.Markdown("# 🎥 Ovi Joint Video + Audio Generation Demo")
    # NOTE(review): step 4 below is missing its "—" separator, unlike the other
    # steps — cosmetic only; left as-is since this is user-facing string content.
    gr.Markdown(
        """
        ## 📘 Instructions

        Follow the steps in order:

        1️⃣ **Enter a Text Prompt** — describe your video. (This text prompt will be shared for image generation if enabled.)
        2️⃣ **Upload or Generate an Image** — Upload an image or generate one if image generation is enabled. (If you do not see the image generation options, make sure to run the script with `--use_image_gen`.)
        3️⃣ **Configure Video Options** — set resolution, seed, solver, and other parameters. (It will automatically use the uploaded/generated image as the first frame, whichever is rendered on your screen at the time of video generation.)
        4️⃣ **Generate Video** click the button to produce your final video with audio.
        5️⃣ **View the Result** — your generated video will appear below.

        ---

        ### 💡 Tips
        1. For best results, use detailed and specific text prompts.
        2. Ensure text prompt format is correct, i.e speech to be said should be wrapped with `<S>...<E>`. Can provide optional audio description at the end, wrapping them in `<AUDCAP> ... <ENDAUDCAP>`, refer to examples
        3. Do not be discouraged by bad or weird results, check prompt format and try different seeds, cfg values and slg layers.
        """
    )


    with gr.Row():
        with gr.Column():
            # Image section
            # type="filepath" means callbacks receive a path string (or None),
            # which is what generate_video expects for image_path.
            image = gr.Image(type="filepath", label="First Frame Image (upload or generate)")

            # Image-generation controls only exist when launched with --use_image_gen.
            if args.use_image_gen:
                with gr.Accordion("🖼️ Image Generation Options", visible=True):
                    image_text_prompt = gr.Textbox(label="Image Prompt", placeholder="Describe the image you want to generate...")
                    image_seed = gr.Number(minimum=0, maximum=100000, value=42, label="Image Seed")
                    image_height = gr.Number(minimum=128, maximum=1280, value=720, step=32, label="Image Height")
                    image_width = gr.Number(minimum=128, maximum=1280, value=1280, step=32, label="Image Width")
                    gen_img_btn = gr.Button("Generate Image 🎨")
            else:
                # Defined as None so the click-wiring guard below is valid in both modes.
                gen_img_btn = None

            with gr.Accordion("🎬 Video Generation Options", open=True):
                video_text_prompt = gr.Textbox(label="Video Prompt", placeholder="Describe your video...")
                video_height = gr.Number(minimum=128, maximum=1280, value=512, step=32, label="Video Height")
                video_width = gr.Number(minimum=128, maximum=1280, value=992, step=32, label="Video Width")

                video_seed = gr.Number(minimum=0, maximum=100000, value=100, label="Video Seed")
                solver_name = gr.Dropdown(
                    choices=["unipc", "euler", "dpm++"], value="unipc", label="Solver Name"
                )
                sample_steps = gr.Number(
                    value=50,
                    label="Sample Steps",
                    precision=0,
                    minimum=20,
                    maximum=100
                )
                shift = gr.Slider(minimum=0.0, maximum=20.0, value=5.0, step=1.0, label="Shift")
                video_guidance_scale = gr.Slider(minimum=0.0, maximum=10.0, value=4.0, step=0.5, label="Video Guidance Scale")
                audio_guidance_scale = gr.Slider(minimum=0.0, maximum=10.0, value=3.0, step=0.5, label="Audio Guidance Scale")
                # -1 presumably disables SLG — TODO confirm against ovi_engine.generate.
                slg_layer = gr.Number(minimum=-1, maximum=30, value=11, step=1, label="SLG Layer")
                video_negative_prompt = gr.Textbox(label="Video Negative Prompt", placeholder="Things to avoid in video")
                audio_negative_prompt = gr.Textbox(label="Audio Negative Prompt", placeholder="Things to avoid in audio")

            run_btn = gr.Button("Generate Video 🚀")

        with gr.Column():
            output_path = gr.Video(label="Generated Video")

    # Wire image generation only when the controls were actually created above.
    if args.use_image_gen and gen_img_btn is not None:
        gen_img_btn.click(
            fn=generate_image,
            inputs=[image_text_prompt, image_seed, image_height, image_width],
            outputs=[image],
        )

    # Hook up video generation
    run_btn.click(
        fn=generate_video,
        inputs=[
            video_text_prompt, image, video_height, video_width, video_seed, solver_name,
            sample_steps, shift, video_guidance_scale, audio_guidance_scale,
            slg_layer, video_negative_prompt, audio_negative_prompt,
        ],
        outputs=[output_path],
    )

if __name__ == "__main__":
    demo.launch(share=True)