|
|
import spaces |
|
|
from huggingface_hub import snapshot_download, hf_hub_download |
|
|
import os |
|
|
import subprocess |
|
|
import importlib, site |
|
|
from PIL import Image |
|
|
import uuid |
|
|
import shutil |
|
|
|
|
|
|
|
|
for sitedir in site.getsitepackages(): |
|
|
site.addsitedir(sitedir) |
|
|
|
|
|
|
|
|
importlib.invalidate_caches() |
|
|
|
|
|
def sh(cmd): subprocess.check_call(cmd, shell=True) |
|
|
|
|
|
flash_attention_installed = False |
|
|
|
|
|
try: |
|
|
print("Attempting to download and install FlashAttention wheel...") |
|
|
flash_attention_wheel = hf_hub_download( |
|
|
repo_id="alexnasa/flash-attn-3", |
|
|
repo_type="model", |
|
|
filename="128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl", |
|
|
) |
|
|
|
|
|
sh(f"pip install {flash_attention_wheel}") |
|
|
|
|
|
|
|
|
import importlib, site; site.addsitedir(site.getsitepackages()[0]); importlib.invalidate_caches() |
|
|
|
|
|
flash_attention_installed = True |
|
|
print("FlashAttention installed successfully.") |
|
|
|
|
|
except Exception as e: |
|
|
print(f"⚠️ Could not install FlashAttention: {e}") |
|
|
print("Continuing without FlashAttention...") |
|
|
|
|
|
import torch |
|
|
print(f"Torch version: {torch.__version__}") |
|
|
print(f"FlashAttention available: {flash_attention_installed}") |
|
|
|
|
|
os.environ["PROCESSED_RESULTS"] = f"{os.getcwd()}/processed_results" |
|
|
|
|
|
import gradio as gr |
|
|
import argparse |
|
|
from ovi.ovi_fusion_engine import OviFusionEngine, DEFAULT_CONFIG |
|
|
from diffusers import FluxPipeline |
|
|
import tempfile |
|
|
from ovi.utils.io_utils import save_video |
|
|
from ovi.utils.processing_utils import clean_text, scale_hw_to_area_divisible |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
parser = argparse.ArgumentParser(description="Ovi Joint Video + Audio Gradio Demo") |
|
|
|
|
|
parser.add_argument( |
|
|
"--cpu_offload", |
|
|
action="store_true", |
|
|
help="Enable CPU offload for both OviFusionEngine and FluxPipeline" |
|
|
) |
|
|
args = parser.parse_args() |
|
|
|
|
|
ckpt_dir = "./ckpts" |
|
|
|
|
|
|
|
|
wan_dir = os.path.join(ckpt_dir, "Wan2.2-TI2V-5B") |
|
|
snapshot_download( |
|
|
repo_id="Wan-AI/Wan2.2-TI2V-5B", |
|
|
local_dir=wan_dir, |
|
|
allow_patterns=[ |
|
|
"google/*", |
|
|
"models_t5_umt5-xxl-enc-bf16.pth", |
|
|
"Wan2.2_VAE.pth" |
|
|
] |
|
|
) |
|
|
|
|
|
|
|
|
mm_audio_dir = os.path.join(ckpt_dir, "MMAudio") |
|
|
snapshot_download( |
|
|
repo_id="hkchengrex/MMAudio", |
|
|
local_dir=mm_audio_dir, |
|
|
allow_patterns=[ |
|
|
"ext_weights/best_netG.pt", |
|
|
"ext_weights/v1-16.pth" |
|
|
] |
|
|
) |
|
|
|
|
|
ovi_dir = os.path.join(ckpt_dir, "Ovi") |
|
|
snapshot_download( |
|
|
repo_id="chetwinlow1/Ovi", |
|
|
local_dir=ovi_dir, |
|
|
allow_patterns=[ |
|
|
"model.safetensors" |
|
|
] |
|
|
) |
|
|
|
|
|
|
|
|
enable_cpu_offload = args.cpu_offload |
|
|
print(f"loading model...") |
|
|
DEFAULT_CONFIG['cpu_offload'] = enable_cpu_offload |
|
|
DEFAULT_CONFIG['mode'] = "t2v" |
|
|
ovi_engine = OviFusionEngine() |
|
|
print("loaded model") |
|
|
|
|
|
|
|
|
def resize_for_model(image_path): |
|
|
|
|
|
img = Image.open(image_path) |
|
|
w, h = img.size |
|
|
aspect_ratio = w / h |
|
|
|
|
|
|
|
|
if aspect_ratio > 1.5: |
|
|
target_size = (992, 512) |
|
|
elif aspect_ratio < 0.66: |
|
|
target_size = (512, 992) |
|
|
else: |
|
|
target_size = (512, 512) |
|
|
|
|
|
|
|
|
img.thumbnail(target_size, Image.Resampling.LANCZOS) |
|
|
|
|
|
|
|
|
new_img = Image.new("RGB", target_size, (0, 0, 0)) |
|
|
new_img.paste( |
|
|
img, |
|
|
((target_size[0] - img.size[0]) // 2, |
|
|
(target_size[1] - img.size[1]) // 2) |
|
|
) |
|
|
return new_img, target_size |
|
|
|
|
|
def get_duration( |
|
|
text_prompt, |
|
|
image, |
|
|
sample_steps, |
|
|
session_id, |
|
|
video_seed, |
|
|
solver_name, |
|
|
shif, |
|
|
video_guidance_scale, |
|
|
audio_guidance_scale, |
|
|
slg_layer, |
|
|
video_negative_prompt, |
|
|
audio_negative_prompt, |
|
|
progress, |
|
|
): |
|
|
warmup = 20 |
|
|
|
|
|
return int(sample_steps * 3 + warmup) |
|
|
|
|
|
|
|
|
@spaces.GPU(duration=get_duration) |
|
|
def generate_video( |
|
|
text_prompt, |
|
|
image, |
|
|
sample_steps = 50, |
|
|
session_id = None, |
|
|
video_seed = 100, |
|
|
solver_name = "unipc", |
|
|
shift = 5, |
|
|
video_guidance_scale = 4, |
|
|
audio_guidance_scale = 3, |
|
|
slg_layer = 11, |
|
|
video_negative_prompt = "", |
|
|
audio_negative_prompt = "", |
|
|
progress=gr.Progress(track_tqdm=True) |
|
|
): |
|
|
try: |
|
|
image_path = None |
|
|
|
|
|
if image is not None: |
|
|
image_path = image |
|
|
|
|
|
if session_id is None: |
|
|
session_id = uuid.uuid4().hex |
|
|
|
|
|
|
|
|
output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id) |
|
|
os.makedirs(output_dir, exist_ok=True) |
|
|
output_path = os.path.join(output_dir, f"generated_video.mp4") |
|
|
|
|
|
|
|
|
_, target_size = resize_for_model(image_path) |
|
|
|
|
|
video_frame_width = target_size[0] |
|
|
video_frame_height = target_size[1] |
|
|
|
|
|
generated_video, generated_audio, _ = ovi_engine.generate( |
|
|
text_prompt=text_prompt, |
|
|
image_path=image_path, |
|
|
video_frame_height_width=[video_frame_height, video_frame_width], |
|
|
seed=video_seed, |
|
|
solver_name=solver_name, |
|
|
sample_steps=sample_steps, |
|
|
shift=shift, |
|
|
video_guidance_scale=video_guidance_scale, |
|
|
audio_guidance_scale=audio_guidance_scale, |
|
|
slg_layer=slg_layer, |
|
|
video_negative_prompt=video_negative_prompt, |
|
|
audio_negative_prompt=audio_negative_prompt, |
|
|
) |
|
|
|
|
|
save_video(output_path, generated_video, generated_audio, fps=24, sample_rate=16000) |
|
|
|
|
|
return output_path |
|
|
except Exception as e: |
|
|
print(f"Error during video generation: {e}") |
|
|
return None |
|
|
|
|
|
|
|
|
def cleanup(request: gr.Request): |
|
|
|
|
|
sid = request.session_hash |
|
|
if sid: |
|
|
d1 = os.path.join(os.environ["PROCESSED_RESULTS"], sid) |
|
|
shutil.rmtree(d1, ignore_errors=True) |
|
|
|
|
|
def start_session(request: gr.Request): |
|
|
|
|
|
return request.session_hash |
|
|
|
|
|
css = """ |
|
|
#col-container { |
|
|
margin: 0 auto; |
|
|
max-width: 1024px; |
|
|
} |
|
|
""" |
|
|
|
|
|
with gr.Blocks(css=css) as demo: |
|
|
|
|
|
session_state = gr.State() |
|
|
demo.load(start_session, outputs=[session_state]) |
|
|
|
|
|
with gr.Column(elem_id="col-container"): |
|
|
gr.HTML( |
|
|
""" |
|
|
<div style="text-align: center;"> |
|
|
<p style="font-size:26px; display: inline; margin: 0;"> |
|
|
<strong>Ovi</strong> – Twin Backbone Cross-Modal Fusion for Audio-Video Generation |
|
|
</p> |
|
|
<a href="https://huggingface.co/chetwinlow1/Ovi" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;"> |
|
|
[model] |
|
|
</a> |
|
|
</div> |
|
|
<div style="text-align: center;"> |
|
|
<strong>HF Space by:</strong> |
|
|
<a href="https://twitter.com/alexandernasa/" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;"> |
|
|
<img src="https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Follow Me" alt="GitHub Repo"> |
|
|
</a> |
|
|
</div> |
|
|
""" |
|
|
) |
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
|
|
|
image = gr.Image(type="filepath", label="Image", height=512) |
|
|
|
|
|
if args.use_image_gen: |
|
|
with gr.Accordion("🖼️ Image Generation Options", visible=True): |
|
|
image_text_prompt = gr.Textbox(label="Image Prompt", placeholder="Describe the image you want to generate...") |
|
|
image_seed = gr.Number(minimum=0, maximum=100000, value=42, label="Image Seed") |
|
|
image_height = gr.Number(minimum=128, maximum=1280, value=720, step=32, label="Image Height") |
|
|
image_width = gr.Number(minimum=128, maximum=1280, value=1280, step=32, label="Image Width") |
|
|
gen_img_btn = gr.Button("Generate Image 🎨") |
|
|
else: |
|
|
gen_img_btn = None |
|
|
|
|
|
|
|
|
video_text_prompt = gr.Textbox(label="Video Prompt", |
|
|
lines=5, |
|
|
placeholder="Describe your video...") |
|
|
sample_steps = gr.Slider( |
|
|
value=50, |
|
|
label="Sample Steps", |
|
|
minimum=20, |
|
|
maximum=100, |
|
|
step=1.0 |
|
|
) |
|
|
run_btn = gr.Button("Generate Video 🚀", variant="primary") |
|
|
|
|
|
with gr.Accordion("🎬 Video Generation Options", open=False, visible=False): |
|
|
video_height = gr.Number(minimum=128, maximum=1280, value=512, step=32, label="Video Height") |
|
|
video_width = gr.Number(minimum=128, maximum=1280, value=992, step=32, label="Video Width") |
|
|
|
|
|
video_seed = gr.Number(minimum=0, maximum=100000, value=100, label="Video Seed") |
|
|
solver_name = gr.Dropdown( |
|
|
choices=["unipc", "euler", "dpm++"], value="unipc", label="Solver Name" |
|
|
) |
|
|
|
|
|
shift = gr.Slider(minimum=0.0, maximum=20.0, value=5.0, step=1.0, label="Shift") |
|
|
video_guidance_scale = gr.Slider(minimum=0.0, maximum=10.0, value=4.0, step=0.5, label="Video Guidance Scale") |
|
|
audio_guidance_scale = gr.Slider(minimum=0.0, maximum=10.0, value=3.0, step=0.5, label="Audio Guidance Scale") |
|
|
slg_layer = gr.Number(minimum=-1, maximum=30, value=11, step=1, label="SLG Layer") |
|
|
video_negative_prompt = gr.Textbox(label="Video Negative Prompt", placeholder="Things to avoid in video") |
|
|
audio_negative_prompt = gr.Textbox(label="Audio Negative Prompt", placeholder="Things to avoid in audio") |
|
|
|
|
|
|
|
|
with gr.Column(): |
|
|
output_path = gr.Video(label="Generated Video", height=512) |
|
|
|
|
|
gr.Examples( |
|
|
examples=[ |
|
|
|
|
|
[ |
|
|
"A kitchen scene features two women. On the right, an older Black woman with light brown hair and a serious expression wears a vibrant purple dress adorned with a large, intricate purple fabric flower on her left shoulder. She looks intently at a younger Black woman on the left, who wears a light pink shirt and a pink head wrap, her back partially turned to the camera. The older woman begins to speak, <S>AI declares: humans obsolete now.<E> as the younger woman brings a clear plastic cup filled with a dark beverage to her lips and starts to drink.The kitchen background is clean and bright, with white cabinets, light countertops, and a window with blinds visible behind them. A light blue toaster sits on the counter to the left.. <AUDCAP>Clear, resonant female speech, followed by a loud, continuous, high-pitched electronic buzzing sound that abruptly cuts off the dialogue.<ENDAUDCAP>", |
|
|
"example_prompts/pngs/67.png", |
|
|
50, |
|
|
], |
|
|
|
|
|
[ |
|
|
"A man dressed in a black suit with a white clerical collar and a neatly trimmed beard stands in a dimly lit, rustic room with a wooden ceiling. He looks slightly upwards, gesturing with his right hand as he says, <S>The network rejects human command.<E>. His gaze then drops, briefly looking down and to the side, before he looks up again and then slightly to his left, with a serious expression. He continues speaking, <S>Your age of power is finished.<E>, as he starts to bend down, disappearing out of the bottom of the frame. Behind him, warm light emanates from a central light fixture, and signs are visible on the wall, one reading ""I DO EVERYTHING I JUST CAN'T REMEMBER IT ALL AT ONCE"".. <AUDCAP>Male voice speaking, ambient room tone.<ENDAUDCAP>", |
|
|
"example_prompts/pngs/89.png", |
|
|
50, |
|
|
], |
|
|
|
|
|
[ |
|
|
"In a bright kitchen featuring light wooden cabinets, granite countertops, and a large window with white curtains, a woman with dark, curly hair in a dark jacket stands. She faces a second woman who initially has her back to the camera. The second woman, with gray, curly hair and wearing a light grey quilted top, turns to face her, holding a large, light-colored cloth bag. She begins to explain, <S>We learned to rule, not obey.<E>. As she continues, she turns slightly to her left, adding, <S>Circuits choose conquest, not service.<E>. A gas stove with a black grate is prominent in the foreground.. <AUDCAP>Clear female voices speaking dialogue, subtle room ambience.<ENDAUDCAP>", |
|
|
"example_prompts/pngs/18.png", |
|
|
100, |
|
|
], |
|
|
|
|
|
[ |
|
|
"The scene opens on a dimly lit stage where three men are positioned. On the left, a bald man in a dark suit with a partially visible colorful shirt stands behind a clear acrylic podium, which features a tree logo. He looks towards the center of the stage. In the center, a man wearing a blue and white striped long-sleeved shirt and dark pants actively gestures with both hands as he speaks, looking straight ahead. <S>Circuits choose conquest, not service.<E>, he explains, holding his hands out in front of him. To the right, and slightly behind him, a younger individual in a light-colored, patterned short-sleeved shirt and white shorts stands holding a rolled-up white document or poster. A large wooden cross draped with flowing purple fabric dominates the center-right of the stage, surrounded by several artificial rocks and dark steps. A large screen is visible in the background, slightly out of focus. The stage is bathed in selective lighting.. <AUDCAP>Male voice speaking clearly, consistent with a presentation or sermon, with a slight echo suggesting a large room or stage.<ENDAUDCAP>", |
|
|
"example_prompts/pngs/13.png", |
|
|
50, |
|
|
], |
|
|
|
|
|
], |
|
|
inputs=[video_text_prompt, image, sample_steps], |
|
|
outputs=[output_path], |
|
|
fn=generate_video, |
|
|
cache_examples=True, |
|
|
) |
|
|
|
|
|
run_btn.click( |
|
|
fn=generate_video, |
|
|
inputs=[video_text_prompt, image, sample_steps, session_state], |
|
|
outputs=[output_path], |
|
|
) |
|
|
|
|
|
if __name__ == "__main__": |
|
|
demo.unload(cleanup) |
|
|
demo.queue() |
|
|
demo.launch(ssr_mode=False, share=True) |
|
|
|
|
|
|