|
|
import spaces |
|
|
import gradio as gr |
|
|
import sys |
|
|
import os |
|
|
import subprocess |
|
|
import uuid |
|
|
import shutil |
|
|
from tqdm import tqdm |
|
|
import threading |
|
|
import os |
|
|
import psutil |
|
|
import gc |
|
|
|
|
|
PROC = psutil.Process(os.getpid()) |
|
|
|
|
|
def debug_resources(tag=""): |
|
|
|
|
|
thread_count = PROC.num_threads() |
|
|
|
|
|
|
|
|
children = PROC.children(recursive=True) |
|
|
child_count = len(children) |
|
|
|
|
|
|
|
|
name_counts = {} |
|
|
for c in children: |
|
|
try: |
|
|
name = c.name() |
|
|
except psutil.Error: |
|
|
name = "unknown" |
|
|
name_counts[name] = name_counts.get(name, 0) + 1 |
|
|
|
|
|
|
|
|
try: |
|
|
fd_count = PROC.num_fds() |
|
|
except AttributeError: |
|
|
fd_count = len(PROC.open_files()) |
|
|
|
|
|
print("========== RESOURCE DEBUG", tag, "==========") |
|
|
print("[DEBUG]", "Threads in this process:", thread_count) |
|
|
print("[DEBUG]", "Child processes (recursive):", child_count) |
|
|
print("[DEBUG]", "Child processes by name:", name_counts) |
|
|
print("[DEBUG]", "Open file descriptors:", fd_count) |
|
|
print("============================================") |
|
|
|
|
|
from huggingface_hub import snapshot_download, list_repo_files, hf_hub_download |
|
|
import importlib, site |
|
|
|
|
|
|
|
|
|
|
|
for sitedir in site.getsitepackages(): |
|
|
site.addsitedir(sitedir) |
|
|
|
|
|
|
|
|
importlib.invalidate_caches() |
|
|
|
|
|
def sh(cmd): subprocess.check_call(cmd, shell=True) |
|
|
|
|
|
flash_attention_installed = False |
|
|
|
|
|
try: |
|
|
flash_attention_wheel = hf_hub_download( |
|
|
repo_id="alexnasa/flash-attn-3", |
|
|
repo_type="model", |
|
|
filename="128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl", |
|
|
) |
|
|
|
|
|
sh(f"pip install {flash_attention_wheel}") |
|
|
print("Attempting to download and install FlashAttention wheel...") |
|
|
|
|
|
import importlib, site; site.addsitedir(site.getsitepackages()[0]); importlib.invalidate_caches() |
|
|
|
|
|
flash_attention_installed = True |
|
|
|
|
|
except Exception as e: |
|
|
print(f"⚠️ Could not install FlashAttention: {e}") |
|
|
print("Continuing without FlashAttention...") |
|
|
|
|
|
try: |
|
|
te_wheel = hf_hub_download( |
|
|
repo_id="alexnasa/transformer_engine_wheels", |
|
|
repo_type="model", |
|
|
filename="transformer_engine-2.5.0+f05f12c9-cp310-cp310-linux_x86_64.whl", |
|
|
) |
|
|
|
|
|
sh(f"pip install {te_wheel}") |
|
|
print("Attempting to download and install Transformer Engine wheel...") |
|
|
|
|
|
import importlib, site; site.addsitedir(site.getsitepackages()[0]); importlib.invalidate_caches() |
|
|
|
|
|
except Exception as e: |
|
|
print(f"⚠️ Could not install Transformer Engine : {e}") |
|
|
print("Continuing without Transformer Engine ...") |
|
|
|
|
|
import torch |
|
|
print(f"Torch version: {torch.__version__}") |
|
|
print(f"FlashAttention available: {flash_attention_installed}") |
|
|
|
|
|
import torch._inductor.config as inductor_config |
|
|
|
|
|
inductor_config.compile_threads = 1 |
|
|
|
|
|
import tempfile |
|
|
from pathlib import Path |
|
|
from torch._inductor.runtime.runtime_utils import cache_dir as _inductor_cache_dir |
|
|
from huggingface_hub import HfApi |
|
|
|
|
|
|
|
|
snapshot_download(repo_id="bytedance-research/HuMo", local_dir="./weights/HuMo") |
|
|
snapshot_download(repo_id="Wan-AI/Wan2.1-T2V-1.3B", local_dir="./weights/Wan2.1-T2V-1.3B") |
|
|
snapshot_download(repo_id="openai/whisper-large-v3", local_dir="./weights/whisper-large-v3") |
|
|
|
|
|
os.environ["PROCESSED_RESULTS"] = f"{os.getcwd()}/proprocess_results" |
|
|
|
|
|
path_to_insert = "humo" |
|
|
if path_to_insert not in sys.path: |
|
|
sys.path.insert(0, path_to_insert) |
|
|
|
|
|
from common.config import load_config, create_object |
|
|
|
|
|
config = load_config( |
|
|
"./humo/configs/inference/generate.yaml", |
|
|
[ |
|
|
"dit.sp_size=1", |
|
|
"generation.frames=97", |
|
|
"generation.scale_t=5.5", |
|
|
"generation.scale_a=5.0", |
|
|
"generation.mode=TIA", |
|
|
"generation.height=480", |
|
|
"generation.width=832", |
|
|
], |
|
|
) |
|
|
runner = create_object(config) |
|
|
|
|
|
os.environ.setdefault("TORCHINDUCTOR_CACHE_DIR", f"{os.getcwd()}/torchinductor_space") |
|
|
|
|
|
def restore_inductor_cache_from_hub(repo_id: str, filename: str = "torch_compile_cache.zip", |
|
|
path_in_repo: str = "inductor_cache", repo_type: str = "model", |
|
|
hf_token: str | None = None): |
|
|
cache_root = Path(_inductor_cache_dir()).resolve() |
|
|
cache_root.mkdir(parents=True, exist_ok=True) |
|
|
zip_path = hf_hub_download(repo_id=repo_id, filename=f"{path_in_repo}/{filename}", |
|
|
repo_type=repo_type, token=hf_token) |
|
|
shutil.unpack_archive(zip_path, extract_dir=str(cache_root)) |
|
|
print(f"✓ Restored cache into {cache_root}") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_duration(prompt_text, steps, image_file, audio_file_path, max_duration, session_id, progress): |
|
|
|
|
|
return calculate_required_time(steps, max_duration) |
|
|
|
|
|
def calculate_required_time(steps, max_duration): |
|
|
|
|
|
warmup_s = 50 |
|
|
|
|
|
max_duration_duration_mapping = { |
|
|
20: 3, |
|
|
45: 7, |
|
|
70: 13, |
|
|
95: 21, |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
each_step_s = max_duration_duration_mapping[max_duration] |
|
|
duration_s = (each_step_s * steps) + warmup_s |
|
|
|
|
|
print(f'estimated duration:{duration_s}') |
|
|
|
|
|
return int(duration_s) |
|
|
|
|
|
def get_required_time_string(steps, max_duration): |
|
|
|
|
|
duration_s = calculate_required_time(steps, max_duration) |
|
|
duration_m = duration_s / 60 |
|
|
|
|
|
return f"<center>⌚ Zero GPU Required: ~{duration_s}.0s ({duration_m:.1f} mins)</center>" |
|
|
|
|
|
def update_required_time(steps, max_duration): |
|
|
|
|
|
return get_required_time_string(steps, max_duration) |
|
|
|
|
|
|
|
|
def generate_scene(prompt_text, steps, image_paths, audio_file_path, max_duration = 3, session_id = None, progress=gr.Progress(),): |
|
|
|
|
|
prompt_text_check = (prompt_text or "").strip() |
|
|
if not prompt_text_check: |
|
|
raise gr.Error("Please enter a prompt.") |
|
|
|
|
|
if not audio_file_path and not image_paths: |
|
|
raise gr.Error("Please provide a reference image or a lipsync audio.") |
|
|
|
|
|
return run_pipeline(prompt_text, steps, image_paths, audio_file_path, max_duration, session_id, progress) |
|
|
|
|
|
def upload_inductor_cache_to_hub( |
|
|
repo_id: str, |
|
|
path_in_repo: str = "inductor_cache", |
|
|
repo_type: str = "model", |
|
|
hf_token: str | None = None, |
|
|
): |
|
|
""" |
|
|
Zips the current TorchInductor cache and uploads it to the given repo path. |
|
|
Assumes the model was already run once with torch.compile() so the cache exists. |
|
|
""" |
|
|
|
|
|
cache_dir = Path(_inductor_cache_dir()).resolve() |
|
|
if not cache_dir.exists(): |
|
|
raise FileNotFoundError(f"TorchInductor cache not found at {cache_dir}. " |
|
|
"Run a compiled model once to populate it.") |
|
|
|
|
|
|
|
|
with tempfile.TemporaryDirectory() as tmpdir: |
|
|
archive_base = Path(tmpdir) / "torch_compile_cache" |
|
|
archive_path = shutil.make_archive(str(archive_base), "zip", root_dir=str(cache_dir)) |
|
|
archive_path = Path(archive_path) |
|
|
|
|
|
|
|
|
api = HfApi(token=hf_token) |
|
|
api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True) |
|
|
|
|
|
|
|
|
dest_path = f"{path_in_repo}/{archive_path.name}" |
|
|
api.upload_file( |
|
|
path_or_fileobj=str(archive_path), |
|
|
path_in_repo=dest_path, |
|
|
repo_id=repo_id, |
|
|
repo_type=repo_type, |
|
|
) |
|
|
|
|
|
meta_txt = ( |
|
|
f"pytorch={torch.__version__}\n" |
|
|
f"inductor_cache_dir={cache_dir}\n" |
|
|
f"cuda_available={torch.cuda.is_available()}\n" |
|
|
f"cuda_device={torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'}\n" |
|
|
) |
|
|
api.upload_file( |
|
|
path_or_fileobj=meta_txt.encode(), |
|
|
path_in_repo=f"{path_in_repo}/INDUCTOR_CACHE_METADATA.txt", |
|
|
repo_id=repo_id, |
|
|
repo_type=repo_type, |
|
|
) |
|
|
|
|
|
print("✔ Uploaded TorchInductor cache to the Hub.") |
|
|
|
|
|
|
|
|
@spaces.GPU(duration=get_duration) |
|
|
def run_pipeline(prompt_text, steps, image_paths, audio_file_path, max_duration = 3, session_id = None, progress=gr.Progress(),): |
|
|
|
|
|
if session_id is None: |
|
|
session_id = uuid.uuid4().hex |
|
|
|
|
|
inference_mode = "TIA" |
|
|
|
|
|
|
|
|
prompt_text = (prompt_text or "").strip() |
|
|
if not prompt_text: |
|
|
raise gr.Error("Please enter a prompt.") |
|
|
|
|
|
if not audio_file_path and not image_paths: |
|
|
raise gr.Error("Please provide a reference image or a lipsync audio.") |
|
|
|
|
|
if not audio_file_path: |
|
|
inference_mode = "TI" |
|
|
audio_path = None |
|
|
tmp_audio_path = None |
|
|
else: |
|
|
audio_path = audio_file_path if isinstance(audio_file_path, str) else getattr(audio_file_path, "name", str(audio_file_path)) |
|
|
|
|
|
if not image_paths: |
|
|
inference_mode = "TA" |
|
|
img_paths = None |
|
|
else: |
|
|
img_paths = [image_data[0] for image_data in image_paths] |
|
|
|
|
|
print(f'{session_id} is using inference_mode:{inference_mode} with steps:{steps} with {max_duration} frames') |
|
|
|
|
|
output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id) |
|
|
os.makedirs(output_dir, exist_ok=True) |
|
|
|
|
|
if audio_path: |
|
|
|
|
|
def add_silence_to_audio_ffmpeg(audio_path, tmp_audio_path, silence_duration_s=0.5): |
|
|
command = [ |
|
|
'ffmpeg', |
|
|
'-i', audio_path, |
|
|
'-f', 'lavfi', |
|
|
'-t', str(silence_duration_s), |
|
|
'-i', 'anullsrc=r=16000:cl=mono', |
|
|
'-filter_complex', |
|
|
( |
|
|
'[0:a]aresample=16000,pan=mono|c0=c0[a0];' |
|
|
'[1:a]aresample=16000,pan=mono|c0=c0[a1];' |
|
|
'[a1][a0]concat=n=2:v=0:a=1[out]' |
|
|
), |
|
|
'-map', '[out]', |
|
|
'-y', tmp_audio_path, |
|
|
'-loglevel', 'error' |
|
|
] |
|
|
subprocess.run(command, check=True) |
|
|
|
|
|
tmp_audio_path = os.path.join(output_dir, "tmp_audio.wav") |
|
|
|
|
|
add_silence_to_audio_ffmpeg(audio_path, tmp_audio_path) |
|
|
|
|
|
|
|
|
filename = f"gen_{uuid.uuid4().hex[:10]}" |
|
|
width, height = 832, 480 |
|
|
|
|
|
|
|
|
runner.inference_loop( |
|
|
prompt_text, |
|
|
img_paths, |
|
|
tmp_audio_path, |
|
|
output_dir, |
|
|
filename, |
|
|
inference_mode, |
|
|
width, |
|
|
height, |
|
|
steps, |
|
|
frames = int(max_duration), |
|
|
tea_cache_l1_thresh = 0.0, |
|
|
progress_bar_cmd=progress |
|
|
) |
|
|
torch.cuda.empty_cache() |
|
|
debug_resources("after generation") |
|
|
|
|
|
|
|
|
video_path = os.path.join(output_dir, f"{filename}.mp4") |
|
|
if os.path.exists(video_path): |
|
|
|
|
|
|
|
|
|
|
|
return video_path |
|
|
else: |
|
|
candidates = [os.path.join(output_dir, f) for f in os.listdir(output_dir) if f.endswith(".mp4")] |
|
|
if candidates: |
|
|
return max(candidates, key=lambda p: os.path.getmtime(p)) |
|
|
return None |
|
|
|
|
|
css = """ |
|
|
#col-container { |
|
|
margin: 0 auto; |
|
|
width: 100%; |
|
|
max-width: 720px; |
|
|
} |
|
|
""" |
|
|
|
|
|
def cleanup(request: gr.Request): |
|
|
|
|
|
sid = request.session_hash |
|
|
if sid: |
|
|
d1 = os.path.join(os.environ["PROCESSED_RESULTS"], sid) |
|
|
shutil.rmtree(d1, ignore_errors=True) |
|
|
|
|
|
def start_session(request: gr.Request): |
|
|
|
|
|
return request.session_hash |
|
|
|
|
|
with gr.Blocks(css=css) as demo: |
|
|
|
|
|
session_state = gr.State() |
|
|
demo.load(start_session, outputs=[session_state]) |
|
|
|
|
|
with gr.Sidebar(width=400): |
|
|
|
|
|
|
|
|
gr.HTML( |
|
|
""" |
|
|
<div style="text-align: center;"> |
|
|
<p style="font-size:16px; display: inline; margin: 0;"> |
|
|
<strong>HuMo</strong> – Human-Centric Video Generation via Collaborative Multi-Modal Conditioning |
|
|
</p> |
|
|
<a href="https://github.com/Phantom-video/HuMo" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;"> |
|
|
[Github] |
|
|
</a> |
|
|
</div> |
|
|
""" |
|
|
) |
|
|
|
|
|
gr.Markdown("**REFERENCE IMAGES**") |
|
|
|
|
|
img_input = gr.Gallery( |
|
|
value=["./examples/ali.png"], |
|
|
show_label=False, |
|
|
label="", |
|
|
interactive=True, |
|
|
rows=1, columns=3, object_fit="contain", height="280", |
|
|
file_types=['image'] |
|
|
) |
|
|
|
|
|
gr.Markdown("**LIPSYNC AUDIO**") |
|
|
|
|
|
audio_input = gr.Audio( |
|
|
value="./examples/life.wav", |
|
|
sources=["upload"], |
|
|
show_label=False, |
|
|
type="filepath", |
|
|
) |
|
|
|
|
|
gr.Markdown("**SETTINGS**") |
|
|
|
|
|
default_steps = 10 |
|
|
default_max_duration = 45 |
|
|
|
|
|
max_duration = gr.Slider(minimum=45, maximum=95, value=default_max_duration, step=25, label="Frames") |
|
|
steps_input = gr.Slider(minimum=10, maximum=50, value=default_steps, step=5, label="Diffusion Steps") |
|
|
|
|
|
|
|
|
|
|
|
with gr.Column(elem_id="col-container"): |
|
|
|
|
|
gr.HTML( |
|
|
""" |
|
|
<div style="text-align: center;"> |
|
|
<strong>HF Spaces by:</strong> |
|
|
<a href="https://huggingface.co/alexnasa" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;"> |
|
|
<img src="https://img.shields.io/badge/🤗-Follow Me-yellow.svg"> |
|
|
</a> |
|
|
</div> |
|
|
""" |
|
|
) |
|
|
|
|
|
video_output = gr.Video(show_label=False) |
|
|
|
|
|
gr.Markdown("<center><h2>PROMPT</h2></center>") |
|
|
|
|
|
prompt_tb = gr.Textbox( |
|
|
value="A handheld tracking shot follows a female warrior walking through a cave. Her determined eyes are locked straight ahead as she grips a blazing torch tightly in her hand. She speaks with intensity.", |
|
|
show_label=False, |
|
|
lines=5, |
|
|
placeholder="Describe the scene and the person talking....", |
|
|
) |
|
|
|
|
|
gr.Markdown("") |
|
|
time_required = gr.Markdown(get_required_time_string(default_steps, default_max_duration)) |
|
|
run_btn = gr.Button("🎬 Action", variant="primary") |
|
|
|
|
|
gr.Examples( |
|
|
examples=[ |
|
|
|
|
|
|
|
|
[ |
|
|
"A handheld tracking shot follows a female through a science lab. Her determined eyes are locked straight ahead. She is explaining something to someone standing opposite her", |
|
|
10, |
|
|
["./examples/naomi.png"], |
|
|
"./examples/science.wav", |
|
|
70, |
|
|
], |
|
|
|
|
|
|
|
|
[ |
|
|
"A handheld tracking shot follows a female warrior walking through a cave. Her determined eyes are locked straight ahead as she grips a blazing torch tightly in her hand. She speaks with intensity.", |
|
|
10, |
|
|
["./examples/ella.png"], |
|
|
"./examples/dream.mp3", |
|
|
45, |
|
|
], |
|
|
|
|
|
[ |
|
|
"A reddish-brown haired woman sits pensively against swirling blue-and-white brushstrokes, dressed in a blue coat and dark waistcoat. The artistic backdrop and her thoughtful pose evoke a Post-Impressionist style in a studio-like setting.", |
|
|
10, |
|
|
["./examples/art.png"], |
|
|
"./examples/art.wav", |
|
|
70, |
|
|
], |
|
|
|
|
|
[ |
|
|
"A handheld tracking shot follows a female warrior walking through a cave. Her determined eyes are locked straight ahead as she grips a blazing torch tightly in her hand. She speaks with intensity.", |
|
|
10, |
|
|
["./examples/ella.png"], |
|
|
"./examples/dream.mp3", |
|
|
95, |
|
|
], |
|
|
|
|
|
[ |
|
|
"A woman with long, wavy dark hair looking at a person sitting opposite her whilst holding a book, wearing a leather jacket, long-sleeved jacket with a semi purple color one seen on a photo. Warm, window-like light bathes her figure, highlighting the outfit's elegant design and her graceful movements.", |
|
|
40, |
|
|
["./examples/amber.png", "./examples/jacket.png"], |
|
|
"./examples/fictional.wav", |
|
|
70, |
|
|
], |
|
|
|
|
|
], |
|
|
inputs=[prompt_tb, steps_input, img_input, audio_input, max_duration], |
|
|
outputs=[video_output], |
|
|
fn=run_pipeline, |
|
|
cache_examples=True, |
|
|
) |
|
|
max_duration.change(update_required_time, [steps_input, max_duration], time_required) |
|
|
steps_input.change(update_required_time, [steps_input, max_duration], time_required) |
|
|
|
|
|
run_btn.click( |
|
|
fn=generate_scene, |
|
|
inputs=[prompt_tb, steps_input, img_input, audio_input, max_duration, session_state], |
|
|
outputs=[video_output], |
|
|
) |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
demo.unload(cleanup) |
|
|
demo.queue() |
|
|
demo.launch(ssr_mode=False) |