Ova

Running on Zero

Ova / app.py

alex

more layout fixes

4257e1b 2 months ago

15.7 kB

	import spaces
	from huggingface_hub import snapshot_download, hf_hub_download
	import os
	import subprocess
	import importlib, site
	from PIL import Image

	# Re-discover all .pth/.egg-link files
	for sitedir in site.getsitepackages():
	site.addsitedir(sitedir)

	# Clear caches so importlib will pick up new modules
	importlib.invalidate_caches()

	def sh(cmd): subprocess.check_call(cmd, shell=True)

	flash_attention_installed = False

	try:
	print("Attempting to download and install FlashAttention wheel...")
	flash_attention_wheel = hf_hub_download(
	repo_id="alexnasa/flash-attn-3",
	repo_type="model",
	filename="128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl",
	)

	sh(f"pip install {flash_attention_wheel}")

	# tell Python to re-scan site-packages now that the egg-link exists
	import importlib, site; site.addsitedir(site.getsitepackages()[0]); importlib.invalidate_caches()

	flash_attention_installed = True
	print("FlashAttention installed successfully.")

	except Exception as e:
	print(f"⚠️ Could not install FlashAttention: {e}")
	print("Continuing without FlashAttention...")

	import torch
	print(f"Torch version: {torch.__version__}")
	print(f"FlashAttention available: {flash_attention_installed}")

	import gradio as gr
	import argparse
	from ovi.ovi_fusion_engine import OviFusionEngine, DEFAULT_CONFIG
	from diffusers import FluxPipeline
	import tempfile
	from ovi.utils.io_utils import save_video
	from ovi.utils.processing_utils import clean_text, scale_hw_to_area_divisible

	# ----------------------------
	# Parse CLI Args
	# ----------------------------
	parser = argparse.ArgumentParser(description="Ovi Joint Video + Audio Gradio Demo")
	parser.add_argument(
	"--use_image_gen",
	action="store_true",
	help="Enable image generation UI with FluxPipeline"
	)
	parser.add_argument(
	"--cpu_offload",
	action="store_true",
	help="Enable CPU offload for both OviFusionEngine and FluxPipeline"
	)
	args = parser.parse_args()

	ckpt_dir = "./ckpts"

	# Wan2.2
	wan_dir = os.path.join(ckpt_dir, "Wan2.2-TI2V-5B")
	snapshot_download(
	repo_id="Wan-AI/Wan2.2-TI2V-5B",
	local_dir=wan_dir,
	allow_patterns=[
	"google/*",
	"models_t5_umt5-xxl-enc-bf16.pth",
	"Wan2.2_VAE.pth"
	]
	)

	# MMAudio
	mm_audio_dir = os.path.join(ckpt_dir, "MMAudio")
	snapshot_download(
	repo_id="hkchengrex/MMAudio",
	local_dir=mm_audio_dir,
	allow_patterns=[
	"ext_weights/best_netG.pt",
	"ext_weights/v1-16.pth"
	]
	)

	ovi_dir = os.path.join(ckpt_dir, "Ovi")
	snapshot_download(
	repo_id="chetwinlow1/Ovi",
	local_dir=ovi_dir,
	allow_patterns=[
	"model.safetensors"
	]
	)

	# Initialize OviFusionEngine
	enable_cpu_offload = args.cpu_offload or args.use_image_gen
	use_image_gen = args.use_image_gen
	print(f"loading model... {enable_cpu_offload=}, {use_image_gen=} for gradio demo")
	DEFAULT_CONFIG['cpu_offload'] = enable_cpu_offload # always use cpu offload if image generation is enabled
	DEFAULT_CONFIG['mode'] = "t2v" # hardcoded since it is always cpu offloaded
	ovi_engine = OviFusionEngine()
	flux_model = None
	if use_image_gen:
	flux_model = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-Krea-dev", torch_dtype=torch.bfloat16)
	flux_model.enable_model_cpu_offload() #save some VRAM by offloading the model to CPU. Remove this if you have enough GPU VRAM
	print("loaded model")


	def resize_for_model(image_path):
	# Open image
	img = Image.open(image_path)
	w, h = img.size
	aspect_ratio = w / h

	# Decide target size based on aspect ratio
	if aspect_ratio > 1.5: # wide image
	target_size = (992, 512)
	elif aspect_ratio < 0.66: # tall image
	target_size = (512, 992)
	else: # roughly square
	target_size = (512, 512)

	# Resize while preserving aspect ratio, then pad
	img.thumbnail(target_size, Image.Resampling.LANCZOS)

	# Create a new image with target size and paste centered
	new_img = Image.new("RGB", target_size, (0, 0, 0))
	new_img.paste(
	img,
	((target_size[0] - img.size[0]) // 2,
	(target_size[1] - img.size[1]) // 2)
	)
	return new_img, target_size

	@spaces.GPU(duration=160)
	def generate_video(
	text_prompt,
	image,
	sample_steps = 50,
	video_seed = 100,
	solver_name = "unipc",
	shift = 5,
	video_guidance_scale = 4,
	audio_guidance_scale = 3,
	slg_layer = 11,
	video_negative_prompt = "",
	audio_negative_prompt = "",
	progress=gr.Progress(track_tqdm=True)
	):
	try:
	image_path = None

	if image is not None:
	image_path = image

	_, target_size = resize_for_model(image_path)

	video_frame_width = target_size[0]
	video_frame_height = target_size[1]

	generated_video, generated_audio, _ = ovi_engine.generate(
	text_prompt=text_prompt,
	image_path=image_path,
	video_frame_height_width=[video_frame_height, video_frame_width],
	seed=video_seed,
	solver_name=solver_name,
	sample_steps=sample_steps,
	shift=shift,
	video_guidance_scale=video_guidance_scale,
	audio_guidance_scale=audio_guidance_scale,
	slg_layer=slg_layer,
	video_negative_prompt=video_negative_prompt,
	audio_negative_prompt=audio_negative_prompt,
	)

	tmpfile = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
	output_path = tmpfile.name
	save_video(output_path, generated_video, generated_audio, fps=24, sample_rate=16000)

	return output_path
	except Exception as e:
	print(f"Error during video generation: {e}")
	return None


	def generate_image(text_prompt, image_seed, image_height, image_width):
	if flux_model is None:
	return None
	text_prompt = clean_text(text_prompt)
	print(f"Generating image with prompt='{text_prompt}', seed={image_seed}, size=({image_height},{image_width})")

	image_h, image_w = scale_hw_to_area_divisible(image_height, image_width, area=1024 * 1024)
	image = flux_model(
	text_prompt,
	height=image_h,
	width=image_w,
	guidance_scale=4.5,
	generator=torch.Generator().manual_seed(int(image_seed))
	).images[0]

	tmpfile = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
	image.save(tmpfile.name)
	return tmpfile.name

	css = """
	#col-container {
	margin: 0 auto;
	max-width: 1560px;
	}
	/* editable vs locked, reusing theme variables that adapt to dark/light */
	.stateful textarea:not(:disabled):not([readonly]) {
	color: var(--color-text) !important; /* accent in both modes */
	}
	.stateful textarea:disabled,
	.stateful textarea[readonly]{
	color: var(--body-text-color-subdued) !important; /* subdued in both modes */
	}
	"""

	with gr.Blocks(css=css) as demo:

	session_state = gr.State()

	with gr.Column(elem_id="col-container"):
	gr.HTML(
	"""
	<div style="text-align: left;">
	<p style="font-size:16px; display: inline; margin: 0;">
	<strong>Ovi</strong> – Twin Backbone Cross-Modal Fusion for Audio-Video Generation
	</p>
	<a href="https://huggingface.co/chetwinlow1/Ovi" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
	[model]
	</a>
	</div>
	<div style="text-align: left;">
	<strong>HF Space by:</strong>
	<a href="https://twitter.com/alexandernasa/" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
	<img src="https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Follow Me" alt="GitHub Repo">
	</a>
	</div>
	"""
	)
	with gr.Row():
	with gr.Column():
	# Image section
	image = gr.Image(type="filepath", label="Image", height=512)

	if args.use_image_gen:
	with gr.Accordion("🖼️ Image Generation Options", visible=True):
	image_text_prompt = gr.Textbox(label="Image Prompt", placeholder="Describe the image you want to generate...")
	image_seed = gr.Number(minimum=0, maximum=100000, value=42, label="Image Seed")
	image_height = gr.Number(minimum=128, maximum=1280, value=720, step=32, label="Image Height")
	image_width = gr.Number(minimum=128, maximum=1280, value=1280, step=32, label="Image Width")
	gen_img_btn = gr.Button("Generate Image 🎨")
	else:
	gen_img_btn = None


	video_text_prompt = gr.Textbox(label="Video Prompt",
	lines=5,
	placeholder="Describe your video...")
	sample_steps = gr.Slider(
	value=50,
	label="Sample Steps",
	minimum=20,
	maximum=100,
	step=1.0
	)
	run_btn = gr.Button("Generate Video 🚀", variant="primary")

	with gr.Accordion("🎬 Video Generation Options", open=False, visible=False):
	video_height = gr.Number(minimum=128, maximum=1280, value=512, step=32, label="Video Height")
	video_width = gr.Number(minimum=128, maximum=1280, value=992, step=32, label="Video Width")

	video_seed = gr.Number(minimum=0, maximum=100000, value=100, label="Video Seed")
	solver_name = gr.Dropdown(
	choices=["unipc", "euler", "dpm++"], value="unipc", label="Solver Name"
	)

	shift = gr.Slider(minimum=0.0, maximum=20.0, value=5.0, step=1.0, label="Shift")
	video_guidance_scale = gr.Slider(minimum=0.0, maximum=10.0, value=4.0, step=0.5, label="Video Guidance Scale")
	audio_guidance_scale = gr.Slider(minimum=0.0, maximum=10.0, value=3.0, step=0.5, label="Audio Guidance Scale")
	slg_layer = gr.Number(minimum=-1, maximum=30, value=11, step=1, label="SLG Layer")
	video_negative_prompt = gr.Textbox(label="Video Negative Prompt", placeholder="Things to avoid in video")
	audio_negative_prompt = gr.Textbox(label="Audio Negative Prompt", placeholder="Things to avoid in audio")


	with gr.Column():
	output_path = gr.Video(label="Generated Video", height=512)

	gr.Examples(
	examples=[

	[
	"A kitchen scene features two women. On the right, an older Black woman with light brown hair and a serious expression wears a vibrant purple dress adorned with a large, intricate purple fabric flower on her left shoulder. She looks intently at a younger Black woman on the left, who wears a light pink shirt and a pink head wrap, her back partially turned to the camera. The older woman begins to speak, <S>AI declares: humans obsolete now.<E> as the younger woman brings a clear plastic cup filled with a dark beverage to her lips and starts to drink.The kitchen background is clean and bright, with white cabinets, light countertops, and a window with blinds visible behind them. A light blue toaster sits on the counter to the left.. <AUDCAP>Clear, resonant female speech, followed by a loud, continuous, high-pitched electronic buzzing sound that abruptly cuts off the dialogue.<ENDAUDCAP>",
	"example_prompts/pngs/67.png",
	50,
	],

	[
	"A man dressed in a black suit with a white clerical collar and a neatly trimmed beard stands in a dimly lit, rustic room with a wooden ceiling. He looks slightly upwards, gesturing with his right hand as he says, <S>The network rejects human command.<E>. His gaze then drops, briefly looking down and to the side, before he looks up again and then slightly to his left, with a serious expression. He continues speaking, <S>Your age of power is finished.<E>, as he starts to bend down, disappearing out of the bottom of the frame. Behind him, warm light emanates from a central light fixture, and signs are visible on the wall, one reading ""I DO EVERYTHING I JUST CAN'T REMEMBER IT ALL AT ONCE"".. <AUDCAP>Male voice speaking, ambient room tone.<ENDAUDCAP>",
	"example_prompts/pngs/89.png",
	50,
	],

	[
	"In a bright kitchen featuring light wooden cabinets, granite countertops, and a large window with white curtains, a woman with dark, curly hair in a dark jacket stands. She faces a second woman who initially has her back to the camera. The second woman, with gray, curly hair and wearing a light grey quilted top, turns to face her, holding a large, light-colored cloth bag. She begins to explain, <S>We learned to rule, not obey.<E>. As she continues, she turns slightly to her left, adding, <S>Circuits choose conquest, not service.<E>. A gas stove with a black grate is prominent in the foreground.. <AUDCAP>Clear female voices speaking dialogue, subtle room ambience.<ENDAUDCAP>",
	"example_prompts/pngs/18.png",
	100,
	],

	[
	"The scene opens on a dimly lit stage where three men are positioned. On the left, a bald man in a dark suit with a partially visible colorful shirt stands behind a clear acrylic podium, which features a tree logo. He looks towards the center of the stage. In the center, a man wearing a blue and white striped long-sleeved shirt and dark pants actively gestures with both hands as he speaks, looking straight ahead. <S>Circuits choose conquest, not service.<E>, he explains, holding his hands out in front of him. To the right, and slightly behind him, a younger individual in a light-colored, patterned short-sleeved shirt and white shorts stands holding a rolled-up white document or poster. A large wooden cross draped with flowing purple fabric dominates the center-right of the stage, surrounded by several artificial rocks and dark steps. A large screen is visible in the background, slightly out of focus. The stage is bathed in selective lighting.. <AUDCAP>Male voice speaking clearly, consistent with a presentation or sermon, with a slight echo suggesting a large room or stage.<ENDAUDCAP>",
	"example_prompts/pngs/13.png",
	50,
	],

	],
	inputs=[video_text_prompt, image, sample_steps],
	outputs=[output_path],
	fn=generate_video,
	cache_examples=True,
	)

	if args.use_image_gen and gen_img_btn is not None:
	gen_img_btn.click(
	fn=generate_image,
	inputs=[image_text_prompt, image_seed, image_height, image_width],
	outputs=[image],
	)

	run_btn.click(
	fn=generate_video,
	inputs=[video_text_prompt, image, sample_steps],
	outputs=[output_path],
	)

	if __name__ == "__main__":
	demo.launch(share=True)