Spaces:

alexnasa
/

HuMo_local

Running on Zero

App Files Files Community

alexnasa committed on Oct 15

Commit

1a20a46

verified ·

1 Parent(s): 295978e

Update app.py

Browse files

Files changed (1) hide show

app.py +435 -435

app.py CHANGED Viewed

@@ -1,435 +1,435 @@
-import spaces
-import gradio as gr
-import sys
-import os
-import subprocess
-import uuid
-import shutil
-from huggingface_hub import snapshot_download, list_repo_files, hf_hub_download
-import importlib, site
-# Re-discover all .pth/.egg-link files
-for sitedir in site.getsitepackages():
-    site.addsitedir(sitedir)
-# Clear caches so importlib will pick up new modules
-importlib.invalidate_caches()
-def sh(cmd): subprocess.check_call(cmd, shell=True)
-flash_attention_installed = False
-try:
-    flash_attention_wheel = hf_hub_download(
-            repo_id="alexnasa/flash-attn-3",
-            repo_type="model",
-            filename="128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl",
-        )
-    sh(f"pip install {flash_attention_wheel}")
-    print("Attempting to download and install FlashAttention wheel...")
-    # sh("pip install flash-attn")
-    sh("pip install --no-build-isolation transformer_engine-2.5.0+f05f12c9-cp310-cp310-linux_x86_64.whl")
-    # tell Python to re-scan site-packages now that the egg-link exists
-    import importlib, site; site.addsitedir(site.getsitepackages()[0]); importlib.invalidate_caches()
-    flash_attention_installed = True
-except Exception as e:
-    print(f"⚠️ Could not install FlashAttention: {e}")
-    print("Continuing without FlashAttention...")
-try:
-    te_wheel = hf_hub_download(
-            repo_id="alexnasa/transformer_engine_wheels",
-            repo_type="model",
-            filename="transformer_engine-2.5.0+f05f12c9-cp310-cp310-linux_x86_64.whl",
-        )
-    sh(f"pip install {te_wheel}")
-    print("Attempting to download and install Transformer Engine wheel...")
-    # tell Python to re-scan site-packages now that the egg-link exists
-    import importlib, site; site.addsitedir(site.getsitepackages()[0]); importlib.invalidate_caches()
-except Exception as e:
-    print(f"⚠️ Could not install Transformer Engine : {e}")
-    print("Continuing without Transformer Engine ...")
-import torch
-print(f"Torch version: {torch.__version__}")
-print(f"FlashAttention available: {flash_attention_installed}")
-import tempfile
-from pathlib import Path
-from torch._inductor.runtime.runtime_utils import cache_dir as _inductor_cache_dir
-from huggingface_hub import HfApi
-snapshot_download(repo_id="bytedance-research/HuMo", local_dir="./weights/HuMo")
-snapshot_download(repo_id="Wan-AI/Wan2.1-T2V-1.3B", local_dir="./weights/Wan2.1-T2V-1.3B")
-snapshot_download(repo_id="openai/whisper-large-v3", local_dir="./weights/whisper-large-v3")
-os.environ["PROCESSED_RESULTS"] = f"{os.getcwd()}/proprocess_results"
-path_to_insert = "humo"
-if path_to_insert not in sys.path:
-    sys.path.insert(0, path_to_insert)
-from common.config import load_config, create_object
-config = load_config(
-    "./humo/configs/inference/generate.yaml",
-    [
-        "dit.sp_size=1",
-        "generation.frames=97",
-        "generation.scale_t=5.5",
-        "generation.scale_a=5.0",
-        "generation.mode=TIA",
-        "generation.height=480",
-        "generation.width=832",
-    ],
-)
-runner = create_object(config)
-os.environ.setdefault("TORCHINDUCTOR_CACHE_DIR", f"{os.getcwd()}/torchinductor_space")  # or another writable path
-def restore_inductor_cache_from_hub(repo_id: str, filename: str = "torch_compile_cache.zip",
-                                    path_in_repo: str = "inductor_cache", repo_type: str = "model",
-                                    hf_token: str | None = None):
-    cache_root = Path(_inductor_cache_dir()).resolve()
-    cache_root.mkdir(parents=True, exist_ok=True)
-    zip_path = hf_hub_download(repo_id=repo_id, filename=f"{path_in_repo}/{filename}",
-                               repo_type=repo_type, token=hf_token)
-    shutil.unpack_archive(zip_path, extract_dir=str(cache_root))
-    print(f"✓ Restored cache into {cache_root}")
-# restore_inductor_cache_from_hub("alexnasa/humo-compiled")
-def get_duration(prompt_text, steps, image_file, audio_file_path, tea_cache_l1_thresh, max_duration, session_id):
-    return calculate_required_time(steps, max_duration)
-def calculate_required_time(steps, max_duration):
-    warmup_s = 60
-    max_duration_duration_mapping = {
-        1: 8,
-        2: 8,
-        3: 11,
-        4: 20,
-        5: 30,
-    }
-    each_step_s = max_duration_duration_mapping[max_duration]
-    duration_s = (each_step_s * steps) + warmup_s
-    print(f'estimated duration:{duration_s}')
-    return int(duration_s)
-def get_required_time_string(steps, max_duration):
-    duration_s = calculate_required_time(steps, max_duration)
-    duration_m = duration_s / 60
-    return f"<center>⌚ Zero GPU Required: ~{duration_s}.0s ({duration_m:.1f} mins)</center>"
-def update_required_time(steps, max_duration):
-    return get_required_time_string(steps, max_duration)
-def generate_scene(prompt_text, steps, image_paths, audio_file_path, tea_cache_l1_thresh, max_duration = 2, session_id = None):
-    print(image_paths)
-    prompt_text_check = (prompt_text or "").strip()
-    if not prompt_text_check:
-        raise gr.Error("Please enter a prompt.")
-    if not audio_file_path and not image_paths:
-        raise gr.Error("Please provide a reference image or a lipsync audio.")
-    return run_pipeline(prompt_text, steps, image_paths, audio_file_path, tea_cache_l1_thresh, max_duration, session_id)
-def upload_inductor_cache_to_hub(
-    repo_id: str,
-    path_in_repo: str = "inductor_cache",
-    repo_type: str = "model",   # or "dataset" if you prefer
-    hf_token: str | None = None,
-):
-    """
-    Zips the current TorchInductor cache and uploads it to the given repo path.
-    Assumes the model was already run once with torch.compile() so the cache exists.
-    """
-    cache_dir = Path(_inductor_cache_dir()).resolve()
-    if not cache_dir.exists():
-        raise FileNotFoundError(f"TorchInductor cache not found at {cache_dir}. "
-                                "Run a compiled model once to populate it.")
-    # Create a zip archive of the entire cache directory
-    with tempfile.TemporaryDirectory() as tmpdir:
-        archive_base = Path(tmpdir) / "torch_compile_cache"
-        archive_path = shutil.make_archive(str(archive_base), "zip", root_dir=str(cache_dir))
-        archive_path = Path(archive_path)
-        # Upload to Hub
-        api = HfApi(token=hf_token)
-        api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)
-        # Put each artifact under path_in_repo, including a tiny metadata stamp for traceability
-        # Upload the zip
-        dest_path = f"{path_in_repo}/{archive_path.name}"
-        api.upload_file(
-            path_or_fileobj=str(archive_path),
-            path_in_repo=dest_path,
-            repo_id=repo_id,
-            repo_type=repo_type,
-        )
-        # Upload a small metadata file (optional but handy)
-        meta_txt = (
-            f"pytorch={torch.__version__}\n"
-            f"inductor_cache_dir={cache_dir}\n"
-            f"cuda_available={torch.cuda.is_available()}\n"
-            f"cuda_device={torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'}\n"
-        )
-        api.upload_file(
-            path_or_fileobj=meta_txt.encode(),
-            path_in_repo=f"{path_in_repo}/INDUCTOR_CACHE_METADATA.txt",
-            repo_id=repo_id,
-            repo_type=repo_type,
-        )
-    print("✔ Uploaded TorchInductor cache to the Hub.")
-@spaces.GPU(duration=get_duration)
-def run_pipeline(prompt_text, steps, image_paths, audio_file_path, tea_cache_l1_thresh = 0.0, max_duration = 2, session_id = None):
-    if session_id is None:
-        session_id = uuid.uuid4().hex
-    inference_mode = "TIA"
-    # Validate inputs
-    prompt_text = (prompt_text or "").strip()
-    if not prompt_text:
-        raise gr.Error("Please enter a prompt.")
-    if not audio_file_path and not image_paths:
-        raise gr.Error("Please provide a reference image or a lipsync audio.")
-    if not audio_file_path:
-        inference_mode = "TI"
-        audio_path = None
-    else:
-        audio_path = audio_file_path if isinstance(audio_file_path, str) else getattr(audio_file_path, "name", str(audio_file_path))
-    if not image_paths:
-        inference_mode = "TA"
-        img_paths = None
-    else:
-        img_paths = [image_data[0] for image_data in image_paths]
-    # Prepare output
-    output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
-    os.makedirs(output_dir, exist_ok=True)
-    # Random filename
-    filename = f"gen_{uuid.uuid4().hex[:10]}"
-    width, height = 832, 480
-    duration_frame_mapping = {
-        1:25,
-        2:45,
-        3:70,
-        4:97,
-        5:129
-    }
-    # Run inference
-    runner.inference_loop(
-        prompt_text,
-        img_paths,
-        audio_path,
-        output_dir,
-        filename,
-        inference_mode,
-        width,
-        height,
-        steps,
-        frames = int(duration_frame_mapping[max_duration]),
-        tea_cache_l1_thresh = tea_cache_l1_thresh,
-    )
-    # Return resulting video path
-    video_path = os.path.join(output_dir, f"{filename}.mp4")
-    if os.path.exists(video_path):
-        # upload_inductor_cache_to_hub("alexnasa/humo-compiled")
-        return video_path
-    else:
-        candidates = [os.path.join(output_dir, f) for f in os.listdir(output_dir) if f.endswith(".mp4")]
-        if candidates:
-            return max(candidates, key=lambda p: os.path.getmtime(p))
-        return None
-css = """
-    #col-container {
-        margin: 0 auto;
-        width: 100%;
-        max-width: 720px;
-    }
-    """
-def cleanup(request: gr.Request):
-    sid = request.session_hash
-    if sid:
-        d1 = os.path.join(os.environ["PROCESSED_RESULTS"], sid)
-        shutil.rmtree(d1, ignore_errors=True)
-def start_session(request: gr.Request):
-    return request.session_hash
-with gr.Blocks(css=css) as demo:
-    session_state = gr.State()
-    demo.load(start_session, outputs=[session_state])
-    with gr.Sidebar(width=400):
-        gr.HTML(
-            """
-            <div style="text-align: center;">
-                <p style="font-size:16px; display: inline; margin: 0;">
-                    <strong>HuMo</strong> – Human-Centric Video Generation via Collaborative Multi-Modal Conditioning
-                </p>
-                <a href="https://github.com/Phantom-video/HuMo" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
-                    [Github]
-                </a>
-            </div>
-            """
-        )
-        gr.Markdown("**REFERENCE IMAGES**")
-        img_input = gr.Gallery(
-            show_label=False,
-            label="",
-            interactive=True,
-            rows=1, columns=3, object_fit="contain", height="280",
-            file_types=['image']
-        )
-        gr.Markdown("**LIPSYNC AUDIO**")
-        audio_input = gr.Audio(
-            sources=["upload"],
-            show_label=False,
-            type="filepath",
-        )
-        gr.Markdown("**SETTINGS**")
-        default_steps = 10
-        default_max_duration = 2
-        max_duration = gr.Slider(minimum=2, maximum=5, value=default_max_duration, step=1, label="Max Duration")
-        steps_input = gr.Slider(minimum=5, maximum=50, value=default_steps, step=5, label="Diffusion Steps")
-        tea_cache_l1_thresh = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.01, label="Cache", visible=False)
-    with gr.Column(elem_id="col-container"):
-        gr.HTML(
-            """
-            <div style="text-align: center;">
-                <strong>HF Space by:</strong>
-                <a href="https://twitter.com/alexandernasa/" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
-                    <img src="https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Follow Me" alt="GitHub Repo">
-                </a>
-            </div>
-            """
-        )
-        video_output = gr.Video(show_label=False)
-        gr.Markdown("<center><h2>PROMPT</h2></center>")
-        prompt_tb = gr.Textbox(
-            show_label=False,
-            lines=5,
-            placeholder="Describe the scene and the person talking....",
-        )
-        gr.Markdown("")
-        time_required = gr.Markdown(get_required_time_string(default_steps, default_max_duration))
-        run_btn = gr.Button("🎬 Action", variant="primary")
-        gr.Examples(
-            examples=[
-                [
-                    "A handheld tracking shot follows a female warrior walking through a cave. Her determined eyes are locked straight ahead. She speaks with intensity.",
-                    5,
-                    ["./examples/naomi.png"],
-                    "./examples/dream.mp3",
-                ],
-                [
-                    "A reddish-brown haired and bearded man sits pensively against swirling blue-and-white brushstrokes, dressed in a blue coat and dark waistcoat. The artistic backdrop and his thoughtful pose evoke a Post-Impressionist style in a studio-like setting.",
-                    10,
-                    ["./examples/vangogh.jpg"],
-                    "./examples/art.wav",
-                ],
-                [
-                    "A handheld tracking shot follows a female through a science lab. Her determined eyes are locked straight ahead. The clip is in black and white and patchy as she is explaining something to someone standing opposite her",
-                    10,
-                    ["./examples/naomi.png"],
-                    "./examples/science.wav",
-                ],
-                [
-                    "A woman with long, wavy dark hair looking at a person sitting opposite her whilst holding a book, wearing a leather jacket, long-sleeved jacket with a semi purple color one seen on a photo. Warm, window-like light bathes her figure, highlighting the outfit's elegant design and her graceful movements.",
-                    50,
-                    ["./examples/amber.png", "./examples/jacket.png"],
-                    "./examples/fictional.mp3",
-                ],
-            ],
-            inputs=[prompt_tb, steps_input, img_input, audio_input],
-            outputs=[video_output],
-            fn=run_pipeline,
-            cache_examples=True,
-        )
-        max_duration.change(update_required_time, [steps_input, max_duration], time_required)
-        steps_input.change(update_required_time, [steps_input, max_duration], time_required)
-        run_btn.click(
-            fn=generate_scene,
-            inputs=[prompt_tb, steps_input, img_input, audio_input, tea_cache_l1_thresh, max_duration, session_state],
-            outputs=[video_output],
-        )
-if __name__ == "__main__":
-    demo.unload(cleanup)
-    demo.queue()
-    demo.launch(ssr_mode=False)

+import spaces
+import gradio as gr
+import sys
+import os
+import subprocess
+import uuid
+import shutil
+from huggingface_hub import snapshot_download, list_repo_files, hf_hub_download
+import importlib, site
+# Re-discover all .pth/.egg-link files
+for sitedir in site.getsitepackages():
+    site.addsitedir(sitedir)
+# Clear caches so importlib will pick up new modules
+importlib.invalidate_caches()
+def sh(cmd): subprocess.check_call(cmd, shell=True)  # Run cmd through the shell; check_call raises CalledProcessError on a non-zero exit status.
+flash_attention_installed = False  # Set to True only when every step of the try block below completes.
+try:
+    flash_attention_wheel = hf_hub_download(  # Fetch the prebuilt FlashAttention wheel from the Hub; returns the local file path.
+            repo_id="alexnasa/flash-attn-3",
+            repo_type="model",
+            filename="128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl",
+        )
+    sh(f"pip install {flash_attention_wheel}")
+    print("Attempting to download and install FlashAttention wheel...")  # NOTE(review): printed after the download and install above have already run.
+    # sh("pip install flash-attn")
+    sh("pip install --no-build-isolation transformer_engine-2.5.0+f05f12c9-cp310-cp310-linux_x86_64.whl")  # NOTE(review): relative wheel path; if the file is not in the working directory pip fails and flash_attention_installed stays False even though the FlashAttention install succeeded - confirm intent.
+    # tell Python to re-scan site-packages now that the egg-link exists
+    import importlib, site; site.addsitedir(site.getsitepackages()[0]); importlib.invalidate_caches()
+    flash_attention_installed = True
+except Exception as e:  # Best effort: any failure is reported and startup continues.
+    print(f"⚠️ Could not install FlashAttention: {e}")
+    print("Continuing without FlashAttention...")
+try:  # Best-effort install of the Transformer Engine wheel hosted on the Hub.
+    te_wheel = hf_hub_download(  # Returns the local path of the downloaded wheel.
+            repo_id="alexnasa/transformer_engine_wheels",
+            repo_type="model",
+            filename="transformer_engine-2.5.0+f05f12c9-cp310-cp310-linux_x86_64.whl",
+        )
+    sh(f"pip install {te_wheel}")
+    print("Attempting to download and install Transformer Engine wheel...")  # NOTE(review): printed after the install has already run.
+    # tell Python to re-scan site-packages now that the egg-link exists
+    import importlib, site; site.addsitedir(site.getsitepackages()[0]); importlib.invalidate_caches()
+except Exception as e:  # Any failure is reported and startup continues without the package.
+    print(f"⚠️ Could not install Transformer Engine : {e}")
+    print("Continuing without Transformer Engine ...")
+import torch
+print(f"Torch version: {torch.__version__}")
+print(f"FlashAttention available: {flash_attention_installed}")
+import tempfile
+from pathlib import Path
+from torch._inductor.runtime.runtime_utils import cache_dir as _inductor_cache_dir
+from huggingface_hub import HfApi
+snapshot_download(repo_id="bytedance-research/HuMo", local_dir="./weights/HuMo")  # Import-time download of the three model repos into ./weights.
+snapshot_download(repo_id="Wan-AI/Wan2.1-T2V-1.3B", local_dir="./weights/Wan2.1-T2V-1.3B")
+snapshot_download(repo_id="openai/whisper-large-v3", local_dir="./weights/whisper-large-v3")
+os.environ["PROCESSED_RESULTS"] = f"{os.getcwd()}/proprocess_results"  # Root of the per-session output folders; read back by run_pipeline and cleanup.
+path_to_insert = "humo"  # Put ./humo first on sys.path so the `common` import below resolves - presumably the package lives there; confirm.
+if path_to_insert not in sys.path:
+    sys.path.insert(0, path_to_insert)
+from common.config import load_config, create_object
+config = load_config(
+    "./humo/configs/inference/generate.yaml",
+    [  # key=value strings - presumably overrides applied on top of the YAML; confirm in common.config.
+        "dit.sp_size=1",
+        "generation.frames=97",
+        "generation.scale_t=5.5",
+        "generation.scale_a=5.0",
+        "generation.mode=TIA",
+        "generation.height=480",
+        "generation.width=832",
+    ],
+)
+runner = create_object(config)  # Module-level runner; run_pipeline calls runner.inference_loop on it.
+os.environ.setdefault("TORCHINDUCTOR_CACHE_DIR", f"{os.getcwd()}/torchinductor_space")  # or another writable path
+def restore_inductor_cache_from_hub(repo_id: str, filename: str = "torch_compile_cache.zip",
+                                    path_in_repo: str = "inductor_cache", repo_type: str = "model",
+                                    hf_token: str | None = None):  # Download {path_in_repo}/{filename} from repo_id and unpack it into the TorchInductor cache directory.
+    cache_root = Path(_inductor_cache_dir()).resolve()
+    cache_root.mkdir(parents=True, exist_ok=True)  # Create the cache directory if it does not exist yet.
+    zip_path = hf_hub_download(repo_id=repo_id, filename=f"{path_in_repo}/{filename}",
+                               repo_type=repo_type, token=hf_token)
+    shutil.unpack_archive(zip_path, extract_dir=str(cache_root))  # Archive format is inferred from the file extension.
+    print(f"✓ Restored cache into {cache_root}")
+# restore_inductor_cache_from_hub("alexnasa/humo-compiled")
+def get_duration(prompt_text, steps, image_file, audio_file_path, tea_cache_l1_thresh, max_duration, session_id):  # Callable handed to spaces.GPU(duration=...) on run_pipeline; only steps and max_duration are used. NOTE(review): all seven parameters are required, while gr.Examples invokes run_pipeline with four inputs - confirm how spaces forwards arguments.
+    return calculate_required_time(steps, max_duration)
+def calculate_required_time(steps, max_duration):  # Estimate run time in seconds: per-step cost for the chosen max_duration times steps, plus a fixed warm-up.
+    warmup_s = 60  # Fixed overhead added to every estimate.
+    max_duration_duration_mapping = {  # max_duration setting -> seconds budgeted per diffusion step.
+        1: 8,
+        2: 8,
+        3: 11,
+        4: 20,
+        5: 30,
+    }
+    each_step_s = max_duration_duration_mapping[max_duration]  # Raises KeyError when max_duration is not one of the keys above.
+    duration_s = (each_step_s * steps) + warmup_s
+    print(f'estimated duration:{duration_s}')
+    return int(duration_s)  # Always an int, even if steps arrives as a float.
+def get_required_time_string(steps, max_duration):  # Format the estimate from calculate_required_time as a centered HTML snippet in seconds and minutes.
+    duration_s = calculate_required_time(steps, max_duration)
+    duration_m = duration_s / 60
+    return f"<center>⌚ Zero GPU Required: ~{duration_s}.0s ({duration_m:.1f} mins)</center>"  # duration_s is an int, so the literal ".0" only pads the display.
+def update_required_time(steps, max_duration):  # Change handler wired to both sliders; returns the refreshed estimate text.
+    return get_required_time_string(steps, max_duration)
+def generate_scene(prompt_text, steps, image_paths, audio_file_path, tea_cache_l1_thresh, max_duration = 2, session_id = None):  # Click handler for the run button: validate the inputs, then delegate to run_pipeline.
+    print(image_paths)  # Debug output of the raw gallery value.
+    prompt_text_check = (prompt_text or "").strip()  # Treat None and whitespace-only prompts as empty.
+    if not prompt_text_check:
+        raise gr.Error("Please enter a prompt.")
+    if not audio_file_path and not image_paths:  # At least one of audio or reference images is required.
+        raise gr.Error("Please provide a reference image or a lipsync audio.")
+    return run_pipeline(prompt_text, steps, image_paths, audio_file_path, tea_cache_l1_thresh, max_duration, session_id)  # The original prompt_text is forwarded; run_pipeline strips and re-validates it.
+def upload_inductor_cache_to_hub(
+    repo_id: str,
+    path_in_repo: str = "inductor_cache",
+    repo_type: str = "model",   # or "dataset" if you prefer
+    hf_token: str | None = None,
+):
+    """
+    Zips the current TorchInductor cache and uploads it to the given repo path.
+    Assumes the model was already run once with torch.compile() so the cache exists.
+    """
+    cache_dir = Path(_inductor_cache_dir()).resolve()
+    if not cache_dir.exists():  # Fail early with guidance instead of uploading an empty archive.
+        raise FileNotFoundError(f"TorchInductor cache not found at {cache_dir}. "
+                                "Run a compiled model once to populate it.")
+    # Create a zip archive of the entire cache directory
+    with tempfile.TemporaryDirectory() as tmpdir:  # The archive lives only for the duration of the upload.
+        archive_base = Path(tmpdir) / "torch_compile_cache"
+        archive_path = shutil.make_archive(str(archive_base), "zip", root_dir=str(cache_dir))  # Returns the path of the created archive.
+        archive_path = Path(archive_path)
+        # Upload to Hub
+        api = HfApi(token=hf_token)
+        api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)  # exist_ok=True tolerates a repo that already exists.
+        # Put each artifact under path_in_repo, including a tiny metadata stamp for traceability
+        # Upload the zip
+        dest_path = f"{path_in_repo}/{archive_path.name}"
+        api.upload_file(
+            path_or_fileobj=str(archive_path),
+            path_in_repo=dest_path,
+            repo_id=repo_id,
+            repo_type=repo_type,
+        )
+        # Upload a small metadata file (optional but handy)
+        meta_txt = (
+            f"pytorch={torch.__version__}\n"
+            f"inductor_cache_dir={cache_dir}\n"
+            f"cuda_available={torch.cuda.is_available()}\n"
+            f"cuda_device={torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'}\n"
+        )
+        api.upload_file(
+            path_or_fileobj=meta_txt.encode(),  # Uploaded from in-memory bytes; no file is written to disk.
+            path_in_repo=f"{path_in_repo}/INDUCTOR_CACHE_METADATA.txt",
+            repo_id=repo_id,
+            repo_type=repo_type,
+        )
+    print("✔ Uploaded TorchInductor cache to the Hub.")
+@spaces.GPU(duration=get_duration)  # get_duration supplies the requested GPU time - presumably it is called with this function's arguments; confirm against the spaces documentation.
+def run_pipeline(prompt_text, steps, image_paths, audio_file_path, tea_cache_l1_thresh = 0.0, max_duration = 2, session_id = None):  # Run one HuMo generation and return the path of the resulting .mp4, or None if no video was produced.
+    if session_id is None:  # No session supplied (e.g. the gr.Examples path): fall back to a random id.
+        session_id = uuid.uuid4().hex
+    inference_mode = "TIA"  # Default when both images and audio are given - presumably Text+Image+Audio; confirm against the HuMo configs.
+    # Validate inputs
+    prompt_text = (prompt_text or "").strip()
+    if not prompt_text:
+        raise gr.Error("Please enter a prompt.")
+    if not audio_file_path and not image_paths:
+        raise gr.Error("Please provide a reference image or a lipsync audio.")
+    if not audio_file_path:
+        inference_mode = "TI"  # No audio supplied.
+        audio_path = None
+    else:
+        audio_path = audio_file_path if isinstance(audio_file_path, str) else getattr(audio_file_path, "name", str(audio_file_path))  # Accept a plain path or an object exposing .name.
+    if not image_paths:
+        inference_mode = "TA"  # No reference images supplied.
+        img_paths = None
+    else:
+        img_paths = [image_data[0] for image_data in image_paths]  # Keeps the first element of each gallery item - presumably (path, caption) pairs; confirm against gr.Gallery.
+    # Prepare output
+    output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)  # Per-session folder; cleanup removes the folder named after the session hash.
+    os.makedirs(output_dir, exist_ok=True)
+    # Random filename
+    filename = f"gen_{uuid.uuid4().hex[:10]}"
+    width, height = 832, 480
+    duration_frame_mapping = {  # max_duration setting -> number of frames requested from the runner.
+        1:25,
+        2:45,
+        3:70,
+        4:97,
+        5:129
+    }
+    # Run inference
+    runner.inference_loop(
+        prompt_text,
+        img_paths,
+        audio_path,
+        output_dir,
+        filename,
+        inference_mode,
+        width,
+        height,
+        steps,
+        frames = int(duration_frame_mapping[max_duration]),  # Raises KeyError when max_duration is not one of the mapping keys.
+        tea_cache_l1_thresh = tea_cache_l1_thresh,
+    )
+    # Return resulting video path
+    video_path = os.path.join(output_dir, f"{filename}.mp4")
+    if os.path.exists(video_path):
+        # upload_inductor_cache_to_hub("alexnasa/humo-compiled")
+        return video_path
+    else:
+        candidates = [os.path.join(output_dir, f) for f in os.listdir(output_dir) if f.endswith(".mp4")]  # Fallback: the expected file is missing, so look at any .mp4 in the session folder.
+        if candidates:
+            return max(candidates, key=lambda p: os.path.getmtime(p))  # Most recently modified candidate.
+        return None
+css = """
+    #col-container {
+        margin: 0 auto;
+        width: 100%;
+        max-width: 720px;
+    }
+    """
+def cleanup(request: gr.Request):  # Registered via demo.unload: delete the output folder that belongs to the request's session hash.
+    sid = request.session_hash
+    if sid:
+        d1 = os.path.join(os.environ["PROCESSED_RESULTS"], sid)
+        shutil.rmtree(d1, ignore_errors=True)  # ignore_errors=True: a missing folder or failed delete is not reported.
+def start_session(request: gr.Request):  # Load handler: returns the request's session hash, which demo.load stores in session_state.
+    return request.session_hash
+with gr.Blocks(css=css) as demo:  # UI definition: inputs and settings in the sidebar, video output and prompt in the main column.
+    session_state = gr.State()  # Filled with the session hash by start_session on page load.
+    demo.load(start_session, outputs=[session_state])
+    with gr.Sidebar(width=400):
+        gr.HTML(
+            """
+            <div style="text-align: center;">
+                <p style="font-size:16px; display: inline; margin: 0;">
+                    <strong>HuMo</strong> – Human-Centric Video Generation via Collaborative Multi-Modal Conditioning
+                </p>
+                <a href="https://github.com/Phantom-video/HuMo" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
+                    [Github]
+                </a>
+            </div>
+            """
+        )
+        gr.Markdown("**REFERENCE IMAGES**")
+        img_input = gr.Gallery(
+            show_label=False,
+            label="",
+            interactive=True,
+            rows=1, columns=3, object_fit="contain", height="280",
+            file_types=['image']
+        )
+        gr.Markdown("**LIPSYNC AUDIO**")
+        audio_input = gr.Audio(
+            sources=["upload"],
+            show_label=False,
+            type="filepath",  # Handlers receive the audio as a file path string.
+        )
+        gr.Markdown("**SETTINGS**")
+        default_steps = 10
+        default_max_duration = 2
+        max_duration = gr.Slider(minimum=2, maximum=5, value=default_max_duration, step=1, label="Max Duration")  # NOTE(review): both lookup tables define key 1, but the slider minimum is 2, so 1 is unreachable from the UI.
+        steps_input = gr.Slider(minimum=5, maximum=50, value=default_steps, step=5, label="Diffusion Steps")
+        tea_cache_l1_thresh = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.01, label="Cache", visible=False)  # Hidden control (visible=False); its value is still passed to generate_scene.
+    with gr.Column(elem_id="col-container"):  # elem_id matches the #col-container rule in css.
+        gr.HTML(
+            """
+            <div style="text-align: center;">
+                <strong>HF Space by:</strong>
+                <a href="https://twitter.com/alexandernasa/" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
+                    <img src="https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Follow Me" alt="GitHub Repo">
+                </a>
+            </div>
+            """
+        )
+        video_output = gr.Video(show_label=False)
+        gr.Markdown("<center><h2>PROMPT</h2></center>")
+        prompt_tb = gr.Textbox(
+            show_label=False,
+            lines=5,
+            placeholder="Describe the scene and the person talking....",
+        )
+        gr.Markdown("")
+        time_required = gr.Markdown(get_required_time_string(default_steps, default_max_duration))  # Initial estimate for the default slider values.
+        run_btn = gr.Button("🎬 Action", variant="primary")
+        gr.Examples(
+            examples=[  # Each row: prompt, diffusion steps, reference image paths, audio path.
+                [
+                    "A handheld tracking shot follows a female warrior walking through a cave. Her determined eyes are locked straight ahead. She speaks with intensity.",
+                    5,
+                    ["./examples/naomi.png"],
+                    "./examples/dream.mp3",
+                ],
+                [
+                    "A reddish-brown haired and bearded man sits pensively against swirling blue-and-white brushstrokes, dressed in a blue coat and dark waistcoat. The artistic backdrop and his thoughtful pose evoke a Post-Impressionist style in a studio-like setting.",
+                    10,
+                    ["./examples/vangogh.jpg"],
+                    "./examples/art.wav",
+                ],
+                [
+                    "A handheld tracking shot follows a female through a science lab. Her determined eyes are locked straight ahead. The clip is in black and white and patchy as she is explaining something to someone standing opposite her",
+                    10,
+                    ["./examples/naomi.png"],
+                    "./examples/science.wav",
+                ],
+                [
+                    "A woman with long, wavy dark hair looking at a person sitting opposite her whilst holding a book, wearing a leather jacket, long-sleeved jacket with a semi purple color one seen on a photo. Warm, window-like light bathes her figure, highlighting the outfit's elegant design and her graceful movements.",
+                    50,
+                    ["./examples/amber.png", "./examples/jacket.png"],
+                    "./examples/fictional.wav",
+                ],
+            ],
+            inputs=[prompt_tb, steps_input, img_input, audio_input],  # Only four inputs, so run_pipeline uses its defaults for tea_cache_l1_thresh, max_duration and session_id.
+            outputs=[video_output],
+            fn=run_pipeline,
+            cache_examples=True,
+        )
+        max_duration.change(update_required_time, [steps_input, max_duration], time_required)  # Refresh the estimate whenever either slider changes.
+        steps_input.change(update_required_time, [steps_input, max_duration], time_required)
+        run_btn.click(
+            fn=generate_scene,
+            inputs=[prompt_tb, steps_input, img_input, audio_input, tea_cache_l1_thresh, max_duration, session_state],  # Order matches generate_scene's positional parameters.
+            outputs=[video_output],
+        )
+if __name__ == "__main__":  # Script entry point; the steps below are skipped when the module is imported.
+    demo.unload(cleanup)  # Register cleanup as the unload handler so a session's output folder is removed.
+    demo.queue()
+    demo.launch(ssr_mode=False)