alexnasa committed on
Commit
abb49c0
·
verified ·
1 Parent(s): fa5da10

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +265 -230
app.py CHANGED
@@ -1,230 +1,265 @@
1
- import spaces
2
- import gradio as gr
3
- import torch
4
- import argparse
5
- from ovi.ovi_fusion_engine import OviFusionEngine, DEFAULT_CONFIG
6
- from diffusers import FluxPipeline
7
- import tempfile
8
- from ovi.utils.io_utils import save_video
9
- from ovi.utils.processing_utils import clean_text, scale_hw_to_area_divisible
10
- from huggingface_hub import snapshot_download
11
- import os
12
-
13
- # ----------------------------
14
- # Parse CLI Args
15
- # ----------------------------
16
- parser = argparse.ArgumentParser(description="Ovi Joint Video + Audio Gradio Demo")
17
- parser.add_argument(
18
- "--use_image_gen",
19
- action="store_true",
20
- help="Enable image generation UI with FluxPipeline"
21
- )
22
- parser.add_argument(
23
- "--cpu_offload",
24
- action="store_true",
25
- help="Enable CPU offload for both OviFusionEngine and FluxPipeline"
26
- )
27
- args = parser.parse_args()
28
-
29
- ckpt_dir = "./ckpts"
30
-
31
- # Wan2.2
32
- wan_dir = os.path.join(ckpt_dir, "Wan2.2-TI2V-5B")
33
- snapshot_download(
34
- repo_id="Wan-AI/Wan2.2-TI2V-5B",
35
- local_dir=wan_dir,
36
- allow_patterns=[
37
- "google/*",
38
- "models_t5_umt5-xxl-enc-bf16.pth",
39
- "Wan2.2_VAE.pth"
40
- ]
41
- )
42
-
43
- # MMAudio
44
- mm_audio_dir = os.path.join(ckpt_dir, "MMAudio")
45
- snapshot_download(
46
- repo_id="hkchengrex/MMAudio",
47
- local_dir=mm_audio_dir,
48
- allow_patterns=[
49
- "ext_weights/best_netG.pt",
50
- "ext_weights/v1-16.pth"
51
- ]
52
- )
53
-
54
- ovi_dir = os.path.join(ckpt_dir, "Ovi")
55
- snapshot_download(
56
- repo_id="chetwinlow1/Ovi",
57
- local_dir=ovi_dir,
58
- allow_patterns=[
59
- "model.safetensors"
60
- ]
61
- )
62
-
63
- # Initialize OviFusionEngine
64
- enable_cpu_offload = args.cpu_offload or args.use_image_gen
65
- use_image_gen = args.use_image_gen
66
- print(f"loading model... {enable_cpu_offload=}, {use_image_gen=} for gradio demo")
67
- DEFAULT_CONFIG['cpu_offload'] = enable_cpu_offload # always use cpu offload if image generation is enabled
68
- DEFAULT_CONFIG['mode'] = "t2v" # hardcoded since it is always cpu offloaded
69
- ovi_engine = OviFusionEngine()
70
- flux_model = None
71
- if use_image_gen:
72
- flux_model = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-Krea-dev", torch_dtype=torch.bfloat16)
73
- flux_model.enable_model_cpu_offload() #save some VRAM by offloading the model to CPU. Remove this if you have enough GPU VRAM
74
- print("loaded model")
75
-
76
-
77
- @spaces.GPU()
78
- def generate_video(
79
- text_prompt,
80
- image,
81
- video_frame_height,
82
- video_frame_width,
83
- video_seed,
84
- solver_name,
85
- sample_steps,
86
- shift,
87
- video_guidance_scale,
88
- audio_guidance_scale,
89
- slg_layer,
90
- video_negative_prompt,
91
- audio_negative_prompt,
92
- ):
93
- try:
94
- image_path = None
95
- if image is not None:
96
- image_path = image
97
-
98
- generated_video, generated_audio, _ = ovi_engine.generate(
99
- text_prompt=text_prompt,
100
- image_path=image_path,
101
- video_frame_height_width=[video_frame_height, video_frame_width],
102
- seed=video_seed,
103
- solver_name=solver_name,
104
- sample_steps=sample_steps,
105
- shift=shift,
106
- video_guidance_scale=video_guidance_scale,
107
- audio_guidance_scale=audio_guidance_scale,
108
- slg_layer=slg_layer,
109
- video_negative_prompt=video_negative_prompt,
110
- audio_negative_prompt=audio_negative_prompt,
111
- )
112
-
113
- tmpfile = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
114
- output_path = tmpfile.name
115
- save_video(output_path, generated_video, generated_audio, fps=24, sample_rate=16000)
116
-
117
- return output_path
118
- except Exception as e:
119
- print(f"Error during video generation: {e}")
120
- return None
121
-
122
-
123
- def generate_image(text_prompt, image_seed, image_height, image_width):
124
- if flux_model is None:
125
- return None
126
- text_prompt = clean_text(text_prompt)
127
- print(f"Generating image with prompt='{text_prompt}', seed={image_seed}, size=({image_height},{image_width})")
128
-
129
- image_h, image_w = scale_hw_to_area_divisible(image_height, image_width, area=1024 * 1024)
130
- image = flux_model(
131
- text_prompt,
132
- height=image_h,
133
- width=image_w,
134
- guidance_scale=4.5,
135
- generator=torch.Generator().manual_seed(int(image_seed))
136
- ).images[0]
137
-
138
- tmpfile = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
139
- image.save(tmpfile.name)
140
- return tmpfile.name
141
-
142
-
143
- # Build UI
144
- with gr.Blocks() as demo:
145
- gr.Markdown("# 🎥 Ovi Joint Video + Audio Generation Demo")
146
- gr.Markdown(
147
- """
148
- ## 📘 Instructions
149
-
150
- Follow the steps in order:
151
-
152
- 1️⃣ **Enter a Text Prompt** — describe your video. (This text prompt will be shared for image generation if enabled.)
153
- 2️⃣ **Upload or Generate an Image** — Upload an image or generate one if image generation is enabled. (If you do not see the image generation options, make sure to run the script with `--use_image_gen`.)
154
- 3️⃣ **Configure Video Options** — set resolution, seed, solver, and other parameters. (It will automatically use the uploaded/generated image as the first frame, whichever is rendered on your screen at the time of video generation.)
155
- 4️⃣ **Generate Video** — click the button to produce your final video with audio.
156
- 5️⃣ **View the Result** — your generated video will appear below.
157
-
158
- ---
159
-
160
- ### 💡 Tips
161
- 1. For best results, use detailed and specific text prompts.
162
- 2. Ensure text prompt format is correct, i.e speech to be said should be wrapped with `<S>...<E>`. Can provide optional audio description at the end, wrapping them in `<AUDCAP> ... <ENDAUDCAP>`, refer to examples
163
- 3. Do not be discouraged by bad or weird results, check prompt format and try different seeds, cfg values and slg layers.
164
- """
165
- )
166
-
167
-
168
- with gr.Row():
169
- with gr.Column():
170
- # Image section
171
- image = gr.Image(type="filepath", label="First Frame Image (upload or generate)")
172
-
173
- if args.use_image_gen:
174
- with gr.Accordion("🖼️ Image Generation Options", visible=True):
175
- image_text_prompt = gr.Textbox(label="Image Prompt", placeholder="Describe the image you want to generate...")
176
- image_seed = gr.Number(minimum=0, maximum=100000, value=42, label="Image Seed")
177
- image_height = gr.Number(minimum=128, maximum=1280, value=720, step=32, label="Image Height")
178
- image_width = gr.Number(minimum=128, maximum=1280, value=1280, step=32, label="Image Width")
179
- gen_img_btn = gr.Button("Generate Image 🎨")
180
- else:
181
- gen_img_btn = None
182
-
183
- with gr.Accordion("🎬 Video Generation Options", open=True):
184
- video_text_prompt = gr.Textbox(label="Video Prompt", placeholder="Describe your video...")
185
- video_height = gr.Number(minimum=128, maximum=1280, value=512, step=32, label="Video Height")
186
- video_width = gr.Number(minimum=128, maximum=1280, value=992, step=32, label="Video Width")
187
-
188
- video_seed = gr.Number(minimum=0, maximum=100000, value=100, label="Video Seed")
189
- solver_name = gr.Dropdown(
190
- choices=["unipc", "euler", "dpm++"], value="unipc", label="Solver Name"
191
- )
192
- sample_steps = gr.Number(
193
- value=50,
194
- label="Sample Steps",
195
- precision=0,
196
- minimum=20,
197
- maximum=100
198
- )
199
- shift = gr.Slider(minimum=0.0, maximum=20.0, value=5.0, step=1.0, label="Shift")
200
- video_guidance_scale = gr.Slider(minimum=0.0, maximum=10.0, value=4.0, step=0.5, label="Video Guidance Scale")
201
- audio_guidance_scale = gr.Slider(minimum=0.0, maximum=10.0, value=3.0, step=0.5, label="Audio Guidance Scale")
202
- slg_layer = gr.Number(minimum=-1, maximum=30, value=11, step=1, label="SLG Layer")
203
- video_negative_prompt = gr.Textbox(label="Video Negative Prompt", placeholder="Things to avoid in video")
204
- audio_negative_prompt = gr.Textbox(label="Audio Negative Prompt", placeholder="Things to avoid in audio")
205
-
206
- run_btn = gr.Button("Generate Video 🚀")
207
-
208
- with gr.Column():
209
- output_path = gr.Video(label="Generated Video")
210
-
211
- if args.use_image_gen and gen_img_btn is not None:
212
- gen_img_btn.click(
213
- fn=generate_image,
214
- inputs=[image_text_prompt, image_seed, image_height, image_width],
215
- outputs=[image],
216
- )
217
-
218
- # Hook up video generation
219
- run_btn.click(
220
- fn=generate_video,
221
- inputs=[
222
- video_text_prompt, image, video_height, video_width, video_seed, solver_name,
223
- sample_steps, shift, video_guidance_scale, audio_guidance_scale,
224
- slg_layer, video_negative_prompt, audio_negative_prompt,
225
- ],
226
- outputs=[output_path],
227
- )
228
-
229
- if __name__ == "__main__":
230
- demo.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import spaces
import gradio as gr
import argparse
from ovi.ovi_fusion_engine import OviFusionEngine, DEFAULT_CONFIG
from diffusers import FluxPipeline
import tempfile
from ovi.utils.io_utils import save_video
from ovi.utils.processing_utils import clean_text, scale_hw_to_area_divisible
# FIX: hf_hub_download is used below but was never imported (NameError at startup).
from huggingface_hub import hf_hub_download, snapshot_download
import os
# FIX: site and importlib are used immediately below but were never imported at
# module top (the original only imported them *inside* the try block, after the
# first use) — this crashed with NameError before the app could start.
import site
import importlib
import subprocess
import sys

# Re-discover all .pth/.egg-link files so packages installed at runtime become importable.
for sitedir in site.getsitepackages():
    site.addsitedir(sitedir)

# Clear caches so importlib will pick up new modules.
importlib.invalidate_caches()


def sh(cmd):
    """Run *cmd*, a list of argv tokens, raising CalledProcessError on failure.

    List form (shell=False) avoids handing an interpolated string to a shell.
    """
    subprocess.check_call(cmd)


flash_attention_installed = False

# Best-effort FlashAttention install: the demo still works without it, so any
# failure here is logged and swallowed.
try:
    print("Attempting to download and install FlashAttention wheel...")
    flash_attention_wheel = hf_hub_download(
        repo_id="alexnasa/flash-attn-3",
        repo_type="model",
        filename="128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl",
    )

    # Install into the *current* interpreter's environment explicitly.
    sh([sys.executable, "-m", "pip", "install", flash_attention_wheel])

    # Tell Python to re-scan site-packages now that the wheel is installed.
    site.addsitedir(site.getsitepackages()[0])
    importlib.invalidate_caches()

    flash_attention_installed = True
    print("FlashAttention installed successfully.")

except Exception as e:
    print(f"⚠️ Could not install FlashAttention: {e}")
    print("Continuing without FlashAttention...")

# torch is imported *after* the optional flash-attn install so that, if the wheel
# was installed, torch extension discovery can see it.
import torch
print(f"Torch version: {torch.__version__}")
print(f"FlashAttention available: {flash_attention_installed}")
47
+
48
# ----------------------------
# Parse CLI Args
# ----------------------------
# Both options are boolean flags; their defaults (False) keep the demo in
# plain text-to-video mode with no CPU offload.
parser = argparse.ArgumentParser(description="Ovi Joint Video + Audio Gradio Demo")
for _flag, _help_text in (
    ("--use_image_gen", "Enable image generation UI with FluxPipeline"),
    ("--cpu_offload", "Enable CPU offload for both OviFusionEngine and FluxPipeline"),
):
    parser.add_argument(_flag, action="store_true", help=_help_text)
args = parser.parse_args()
63
+
64
ckpt_dir = "./ckpts"

# (repo_id, local subdirectory, files to fetch) for every checkpoint set the
# engine needs. Download order matches the original: Wan2.2, MMAudio, Ovi.
_CHECKPOINT_SPECS = (
    ("Wan-AI/Wan2.2-TI2V-5B", "Wan2.2-TI2V-5B",
     ["google/*", "models_t5_umt5-xxl-enc-bf16.pth", "Wan2.2_VAE.pth"]),
    ("hkchengrex/MMAudio", "MMAudio",
     ["ext_weights/best_netG.pt", "ext_weights/v1-16.pth"]),
    ("chetwinlow1/Ovi", "Ovi",
     ["model.safetensors"]),
)

_local_dirs = {}
for _repo_id, _subdir, _patterns in _CHECKPOINT_SPECS:
    _target = os.path.join(ckpt_dir, _subdir)
    _local_dirs[_subdir] = _target
    snapshot_download(repo_id=_repo_id, local_dir=_target, allow_patterns=_patterns)

# Preserve the original module-level names for any downstream consumers.
wan_dir = _local_dirs["Wan2.2-TI2V-5B"]
mm_audio_dir = _local_dirs["MMAudio"]
ovi_dir = _local_dirs["Ovi"]
97
+
98
# Initialize OviFusionEngine
# Image generation implies CPU offload: Flux and the Ovi engine are not expected
# to fit in GPU memory together.
enable_cpu_offload = args.cpu_offload or args.use_image_gen
use_image_gen = args.use_image_gen
print(f"loading model... {enable_cpu_offload=}, {use_image_gen=} for gradio demo")
# NOTE(review): DEFAULT_CONFIG is a module-level dict imported from
# ovi.ovi_fusion_engine; mutating it here configures the engine constructed
# just below — this must run before OviFusionEngine(). Presumably "t2v" forces
# text-to-video weights even when an image is supplied — confirm against the engine.
DEFAULT_CONFIG['cpu_offload'] = enable_cpu_offload # always use cpu offload if image generation is enabled
DEFAULT_CONFIG['mode'] = "t2v" # hardcoded since it is always cpu offloaded
ovi_engine = OviFusionEngine()
# flux_model stays None unless --use_image_gen was passed; generate_image()
# checks for None and no-ops in that case.
flux_model = None
if use_image_gen:
    flux_model = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-Krea-dev", torch_dtype=torch.bfloat16)
    flux_model.enable_model_cpu_offload() #save some VRAM by offloading the model to CPU. Remove this if you have enough GPU VRAM
print("loaded model")
110
+
111
+
112
@spaces.GPU()
def generate_video(
    text_prompt,
    image,
    video_frame_height,
    video_frame_width,
    video_seed,
    solver_name,
    sample_steps,
    shift,
    video_guidance_scale,
    audio_guidance_scale,
    slg_layer,
    video_negative_prompt,
    audio_negative_prompt,
):
    """Run the Ovi fusion engine and mux the result into a temporary .mp4.

    Parameters mirror the Gradio controls wired up in the UI; ``image`` is a
    filepath string (or None) as produced by gr.Image(type="filepath").

    Returns:
        Path to the generated .mp4 on success, or None on any failure — the
        gr.Video output component treats None as "no video".
    """
    try:
        # gr.Image(type="filepath") already hands us a path string or None.
        image_path = image if image is not None else None

        generated_video, generated_audio, _ = ovi_engine.generate(
            text_prompt=text_prompt,
            image_path=image_path,
            video_frame_height_width=[video_frame_height, video_frame_width],
            seed=video_seed,
            solver_name=solver_name,
            sample_steps=sample_steps,
            shift=shift,
            video_guidance_scale=video_guidance_scale,
            audio_guidance_scale=audio_guidance_scale,
            slg_layer=slg_layer,
            video_negative_prompt=video_negative_prompt,
            audio_negative_prompt=audio_negative_prompt,
        )

        # FIX: mkstemp + close instead of NamedTemporaryFile(delete=False),
        # which kept an open file handle for the process lifetime (fd leak per call).
        fd, output_path = tempfile.mkstemp(suffix=".mp4")
        os.close(fd)
        save_video(output_path, generated_video, generated_audio, fps=24, sample_rate=16000)

        return output_path
    except Exception as e:
        # Keep the original contract (return None so the UI clears the player),
        # but log the full traceback instead of only the message string.
        import traceback
        print(f"Error during video generation: {e}")
        traceback.print_exc()
        return None
156
+
157
+
158
def generate_image(text_prompt, image_seed, image_height, image_width):
    """Generate a first-frame image with the Flux pipeline.

    Returns the path to a temporary .png, or None when image generation is
    disabled (flux_model is None, i.e. --use_image_gen was not passed).
    """
    if flux_model is None:
        return None
    text_prompt = clean_text(text_prompt)
    print(f"Generating image with prompt='{text_prompt}', seed={image_seed}, size=({image_height},{image_width})")

    # Snap the requested size to a model-friendly area of ~1 megapixel.
    image_h, image_w = scale_hw_to_area_divisible(image_height, image_width, area=1024 * 1024)
    image = flux_model(
        text_prompt,
        height=image_h,
        width=image_w,
        guidance_scale=4.5,
        generator=torch.Generator().manual_seed(int(image_seed))
    ).images[0]

    # FIX: mkstemp + close instead of NamedTemporaryFile(delete=False), which
    # left an open handle for the process lifetime (fd leak per call); PIL
    # reopens the path itself on save().
    fd, out_path = tempfile.mkstemp(suffix=".png")
    os.close(fd)
    image.save(out_path)
    return out_path
176
+
177
+
178
# Build UI
# Layout: two columns — all inputs (image + video options) on the left,
# the rendered video on the right.
with gr.Blocks() as demo:
    gr.Markdown("# 🎥 Ovi Joint Video + Audio Generation Demo")
    # NOTE(review): step 4 below is missing its "—" separator, unlike the other
    # steps — cosmetic only; left as-is since this is user-facing string content.
    gr.Markdown(
        """
        ## 📘 Instructions

        Follow the steps in order:

        1️⃣ **Enter a Text Prompt** — describe your video. (This text prompt will be shared for image generation if enabled.)
        2️⃣ **Upload or Generate an Image** — Upload an image or generate one if image generation is enabled. (If you do not see the image generation options, make sure to run the script with `--use_image_gen`.)
        3️⃣ **Configure Video Options** — set resolution, seed, solver, and other parameters. (It will automatically use the uploaded/generated image as the first frame, whichever is rendered on your screen at the time of video generation.)
        4️⃣ **Generate Video** click the button to produce your final video with audio.
        5️⃣ **View the Result** — your generated video will appear below.

        ---

        ### 💡 Tips
        1. For best results, use detailed and specific text prompts.
        2. Ensure text prompt format is correct, i.e speech to be said should be wrapped with `<S>...<E>`. Can provide optional audio description at the end, wrapping them in `<AUDCAP> ... <ENDAUDCAP>`, refer to examples
        3. Do not be discouraged by bad or weird results, check prompt format and try different seeds, cfg values and slg layers.
        """
    )


    with gr.Row():
        with gr.Column():
            # Image section
            # type="filepath" means callbacks receive a path string (or None),
            # which is what generate_video expects for image_path.
            image = gr.Image(type="filepath", label="First Frame Image (upload or generate)")

            # Image-generation controls only exist when launched with --use_image_gen.
            if args.use_image_gen:
                with gr.Accordion("🖼️ Image Generation Options", visible=True):
                    image_text_prompt = gr.Textbox(label="Image Prompt", placeholder="Describe the image you want to generate...")
                    image_seed = gr.Number(minimum=0, maximum=100000, value=42, label="Image Seed")
                    image_height = gr.Number(minimum=128, maximum=1280, value=720, step=32, label="Image Height")
                    image_width = gr.Number(minimum=128, maximum=1280, value=1280, step=32, label="Image Width")
                    gen_img_btn = gr.Button("Generate Image 🎨")
            else:
                # Defined as None so the click-wiring guard below is valid in both modes.
                gen_img_btn = None

            with gr.Accordion("🎬 Video Generation Options", open=True):
                video_text_prompt = gr.Textbox(label="Video Prompt", placeholder="Describe your video...")
                video_height = gr.Number(minimum=128, maximum=1280, value=512, step=32, label="Video Height")
                video_width = gr.Number(minimum=128, maximum=1280, value=992, step=32, label="Video Width")

                video_seed = gr.Number(minimum=0, maximum=100000, value=100, label="Video Seed")
                solver_name = gr.Dropdown(
                    choices=["unipc", "euler", "dpm++"], value="unipc", label="Solver Name"
                )
                sample_steps = gr.Number(
                    value=50,
                    label="Sample Steps",
                    precision=0,
                    minimum=20,
                    maximum=100
                )
                shift = gr.Slider(minimum=0.0, maximum=20.0, value=5.0, step=1.0, label="Shift")
                video_guidance_scale = gr.Slider(minimum=0.0, maximum=10.0, value=4.0, step=0.5, label="Video Guidance Scale")
                audio_guidance_scale = gr.Slider(minimum=0.0, maximum=10.0, value=3.0, step=0.5, label="Audio Guidance Scale")
                # -1 presumably disables SLG — TODO confirm against ovi_engine.generate.
                slg_layer = gr.Number(minimum=-1, maximum=30, value=11, step=1, label="SLG Layer")
                video_negative_prompt = gr.Textbox(label="Video Negative Prompt", placeholder="Things to avoid in video")
                audio_negative_prompt = gr.Textbox(label="Audio Negative Prompt", placeholder="Things to avoid in audio")

            run_btn = gr.Button("Generate Video 🚀")

        with gr.Column():
            output_path = gr.Video(label="Generated Video")

    # Wire image generation only when the controls were actually created above.
    if args.use_image_gen and gen_img_btn is not None:
        gen_img_btn.click(
            fn=generate_image,
            inputs=[image_text_prompt, image_seed, image_height, image_width],
            outputs=[image],
        )

    # Hook up video generation
    run_btn.click(
        fn=generate_video,
        inputs=[
            video_text_prompt, image, video_height, video_width, video_seed, solver_name,
            sample_steps, shift, video_guidance_scale, audio_guidance_scale,
            slg_layer, video_negative_prompt, audio_negative_prompt,
        ],
        outputs=[output_path],
    )

if __name__ == "__main__":
    demo.launch(share=True)