jake committed on
Commit 05fc139 · 1 Parent(s): 1d0c879
Files changed (1)
  1. app.py +680 -369
app.py CHANGED
@@ -1,398 +1,709 @@
  import os
  import sys
  from pathlib import Path
  import spaces

- # === Import project modules ===
  PROJECT_ROOT = Path(__file__).resolve().parent
  MMADA_ROOT = PROJECT_ROOT / "MMaDA"
  if str(MMADA_ROOT) not in sys.path:
      sys.path.insert(0, str(MMADA_ROOT))

- from inference.gradio_multimodal_demo_inst import OmadaDemo
- import gradio as gr
-
-
- # ----------------------------------------------------------------------
- # 1. Asset Loading (Downloaded by entrypoint)
- # ----------------------------------------------------------------------

- ASSET_ROOT = PROJECT_ROOT / "_asset_cache" / "AIDAS-Omni-Modal-Diffusion-assets"
- DEMO_ROOT = ASSET_ROOT  # asset repo already modality-split


- # ----------------------------------------------------------------------
- # 2. GPU Handler Wrapper
- # ----------------------------------------------------------------------
-
- def gpu_handler(fn):
      """
-     Wrap an inference function using ZeroGPU.
      """
-     @spaces.GPU
-     def inner(*args, **kwargs):
-         return fn(*args, **kwargs)
-     return inner


- # ----------------------------------------------------------------------
- # 3. Build Demo UI With Examples
- # ----------------------------------------------------------------------

- def build_zero_gpu_demo(app: OmadaDemo):

-     with gr.Blocks(title="AIDAS Omni-Modal Diffusion (ZeroGPU)") as demo:

-         # ---------------- Header ----------------
-         gr.Markdown(
-             "<h1 style='text-align:center'>AIDAS Omni-Modal Diffusion Model</h1>"
          )

-         try:
-             logo_path = "/mnt/data/A2E36E9F-F389-487D-9984-FFF21C9228E3.png"
-             gr.Image(logo_path, elem_id="logo", show_label=False, height=120)
-         except:
-             pass
-
-         gr.Markdown("### Multimodal Inference Demo (ZeroGPU Optimized)")
-         gr.Markdown("---")
-
-         # ---------------- Tabs ----------------
-
-         with gr.Tabs():
-
-             # ============================================================
-             # 1) TEXT → SPEECH (T2S)
-             # ============================================================
-             with gr.Tab("Text → Speech (T2S)"):
-
-                 t2s_in = gr.Textbox(label="Input Text")
-                 t2s_btn = gr.Button("Generate")
-                 t2s_audio = gr.Audio(label="Speech Output")
-                 t2s_status = gr.Textbox(label="Status", interactive=False)
-
-                 t2s_examples = []
-                 t2s_dir = DEMO_ROOT / "t2s"
-                 if t2s_dir.exists():
-                     for f in t2s_dir.glob("*.txt"):
-                         txt = f.read_text().strip()
-                         t2s_examples.append([txt])
-
-                 if len(t2s_examples) > 0:
-                     gr.Examples(
-                         examples=t2s_examples,
-                         inputs=[t2s_in],
-                         outputs=[t2s_audio, t2s_status],
-                         fn=gpu_handler(app.run_t2s),
-                     )
-
-                 t2s_btn.click(
-                     gpu_handler(app.run_t2s),
-                     inputs=[t2s_in],
-                     outputs=[t2s_audio, t2s_status],
-                 )
-
-             # ============================================================
-             # 2) SPEECH → SPEECH (S2S)
-             # ============================================================
-             with gr.Tab("Speech → Speech (S2S)"):
-
-                 s2s_in = gr.Audio(type="filepath", label="Input Speech")
-                 s2s_btn = gr.Button("Generate")
-                 s2s_audio = gr.Audio(label="Output Speech")
-                 s2s_status = gr.Textbox(label="Status", interactive=False)
-
-                 s2s_examples = []
-                 s2s_dir = DEMO_ROOT / "s2s"
-                 if s2s_dir.exists():
-                     for f in s2s_dir.glob("*.wav"):
-                         s2s_examples.append([str(f)])
-
-                 if len(s2s_examples) > 0:
-                     gr.Examples(
-                         examples=s2s_examples,
-                         inputs=[s2s_in],
-                         outputs=[s2s_audio, s2s_status],
-                         fn=gpu_handler(app.run_s2s),
-                     )
-
-                 s2s_btn.click(
-                     gpu_handler(app.run_s2s),
-                     inputs=[s2s_in],
-                     outputs=[s2s_audio, s2s_status]
-                 )
-
-             # ============================================================
-             # 3) SPEECH → TEXT (S2T)
-             # ============================================================
-             with gr.Tab("Speech → Text (S2T)"):
-
-                 s2t_in = gr.Audio(type="filepath", label="Input Speech")
-                 s2t_btn = gr.Button("Transcribe")
-                 s2t_text = gr.Textbox(label="Transcribed Text")
-                 s2t_status = gr.Textbox(label="Status", interactive=False)
-
-                 s2t_examples = []
-                 s2t_dir = DEMO_ROOT / "s2t"
-                 if s2t_dir.exists():
-                     for f in s2t_dir.glob("*.wav"):
-                         s2t_examples.append([str(f)])
-
-                 if len(s2t_examples) > 0:
-                     gr.Examples(
-                         examples=s2t_examples,
-                         inputs=[s2t_in],
-                         outputs=[s2t_text, s2t_status],
-                         fn=gpu_handler(app.run_s2t),
-                     )
-
-                 s2t_btn.click(
-                     gpu_handler(app.run_s2t),
-                     inputs=[s2t_in],
-                     outputs=[s2t_text, s2t_status],
-                 )
-
-             # ============================================================
-             # 4) VIDEO → TEXT (V2T)
-             # ============================================================
-             with gr.Tab("Video → Text (V2T)"):
-
-                 v2t_in = gr.Video(type="filepath", label="Input Video")
-                 v2t_btn = gr.Button("Generate Caption")
-                 v2t_text = gr.Textbox(label="Caption")
-                 v2t_status = gr.Textbox(label="Status")
-
-                 v2t_examples = []
-                 v2t_dir = DEMO_ROOT / "v2t"
-                 if v2t_dir.exists():
-                     for f in v2t_dir.glob("*.mp4"):
-                         v2t_examples.append([str(f)])
-
-                 if len(v2t_examples) > 0:
-                     gr.Examples(
-                         examples=v2t_examples,
-                         inputs=[v2t_in],
-                         outputs=[v2t_text, v2t_status],
-                         fn=gpu_handler(app.run_v2t),
-                     )
-
-                 v2t_btn.click(
-                     gpu_handler(app.run_v2t),
-                     inputs=[v2t_in],
-                     outputs=[v2t_text, v2t_status],
-                 )
-
-             # ============================================================
-             # 5) VIDEO → SPEECH (V2S)
-             # ============================================================
-             with gr.Tab("Video → Speech (V2S)"):
-
-                 v2s_in = gr.Video(type="filepath", label="Input Video")
-                 v2s_btn = gr.Button("Generate Speech")
-                 v2s_audio = gr.Audio(label="Speech Output")
-                 v2s_status = gr.Textbox(label="Status")
-
-                 v2s_examples = []
-                 v2s_dir = DEMO_ROOT / "v2s"
-                 if v2s_dir.exists():
-                     for f in v2s_dir.glob("*.mp4"):
-                         v2s_examples.append([str(f)])
-
-                 if len(v2s_examples) > 0:
-                     gr.Examples(
-                         examples=v2s_examples,
-                         inputs=[v2s_in],
-                         outputs=[v2s_audio, v2s_status],
-                         fn=gpu_handler(app.run_v2s),
-                     )
-
-                 v2s_btn.click(
-                     gpu_handler(app.run_v2s),
-                     inputs=[v2s_in],
-                     outputs=[v2s_audio, v2s_status],
-                 )
-
-             # ============================================================
-             # 6) IMAGE → SPEECH (I2S)
-             # ============================================================
-             with gr.Tab("Image → Speech (I2S)"):
-
-                 i2s_in = gr.Image(type="filepath", label="Input Image")
-                 i2s_btn = gr.Button("Generate Speech")
-                 i2s_audio = gr.Audio(label="Speech")
-                 i2s_status = gr.Textbox(label="Status")
-
-                 # Only if folder exists
-                 i2s_examples = []
-                 i2s_dir = DEMO_ROOT / "i2s"
-                 if i2s_dir.exists():
-                     for f in i2s_dir.glob("*.*"):
-                         i2s_examples.append([str(f)])
-
-                 if len(i2s_examples) > 0:
-                     gr.Examples(
-                         examples=i2s_examples,
-                         inputs=[i2s_in],
-                         outputs=[i2s_audio, i2s_status],
-                         fn=gpu_handler(app.run_i2s),
-                     )
-
-                 i2s_btn.click(
-                     gpu_handler(app.run_i2s),
-                     inputs=[i2s_in],
-                     outputs=[i2s_audio, i2s_status],
-                 )
-
-             # ============================================================
-             # 7) CHAT
-             # ============================================================
-             with gr.Tab("Chat (Text)"):
-
-                 chat_in = gr.Textbox(label="Message")
-                 chat_btn = gr.Button("Send")
-                 chat_out = gr.Textbox(label="Response")
-                 chat_status = gr.Textbox(label="Status")
-
-                 chat_examples = []
-                 chat_dir = DEMO_ROOT / "chat"
-                 if chat_dir.exists():
-                     for f in chat_dir.glob("*.txt"):
-                         txt = f.read_text().strip()
-                         chat_examples.append([txt])
-
-                 if len(chat_examples) > 0:
-                     gr.Examples(
-                         examples=chat_examples,
-                         inputs=[chat_in],
-                         outputs=[chat_out, chat_status],
-                         fn=gpu_handler(app.run_chat),
-                     )
-
-                 chat_btn.click(
-                     gpu_handler(app.run_chat),
-                     inputs=[chat_in],
-                     outputs=[chat_out, chat_status],
-                 )
-
-             # ============================================================
-             # 8) MMU (2 images → text)
-             # ============================================================
-             with gr.Tab("MMU (Dual-Image Reasoning)"):
-
-                 mmu_img1 = gr.Image(type="filepath", label="Image 1")
-                 mmu_img2 = gr.Image(type="filepath", label="Image 2")
-                 mmu_prompt = gr.Textbox(label="Prompt")
-                 mmu_btn = gr.Button("Run MMU")
-                 mmu_out = gr.Textbox(label="Output")
-                 mmu_status = gr.Textbox(label="Status")
-
-                 mmu_examples = []
-                 mmu_dir = DEMO_ROOT / "mmu"
-                 if mmu_dir.exists():
-                     imgs = list(mmu_dir.glob("*.png"))
-                     if len(imgs) >= 2:
-                         mmu_examples.append([
-                             str(imgs[0]),
-                             str(imgs[1]),
-                             "Describe the relation between two objects."
-                         ])
-
-                 if len(mmu_examples) > 0:
-                     gr.Examples(
-                         examples=mmu_examples,
-                         inputs=[mmu_img1, mmu_img2, mmu_prompt],
-                         outputs=[mmu_out, mmu_status],
-                         fn=gpu_handler(app.run_mmu_dual),
-                     )
-
-                 mmu_btn.click(
-                     gpu_handler(app.run_mmu_dual),
-                     inputs=[mmu_img1, mmu_img2, mmu_prompt],
-                     outputs=[mmu_out, mmu_status]
-                 )
-
-             # ============================================================
-             # 9) TEXT → IMAGE (T2I)
-             # ============================================================
-             with gr.Tab("Text → Image (T2I)"):
-
-                 t2i_in = gr.Textbox(label="Prompt")
-                 t2i_btn = gr.Button("Generate Image")
-                 t2i_img = gr.Image(label="Generated Image")
-                 t2i_status = gr.Textbox(label="Status")
-
-                 t2i_examples = []
-                 t2i_dir = DEMO_ROOT / "t2i"
-                 if t2i_dir.exists():
-                     for f in t2i_dir.glob("*.txt"):
-                         txt = f.read_text().strip()
-                         t2i_examples.append([txt])
-
-                 if len(t2i_examples) > 0:
-                     gr.Examples(
-                         examples=t2i_examples,
-                         inputs=[t2i_in],
-                         outputs=[t2i_img, t2i_status],
-                         fn=gpu_handler(app.run_t2i),
-                     )
-
-                 t2i_btn.click(
-                     gpu_handler(app.run_t2i),
-                     inputs=[t2i_in],
-                     outputs=[t2i_img, t2i_status],
-                 )
-
-             # ============================================================
-             # 10) IMAGE EDITING (I2I)
-             # ============================================================
-             with gr.Tab("Image Editing (I2I)"):
-
-                 i2i_in = gr.Image(type="filepath", label="Input Image")
-                 i2i_prompt = gr.Textbox(label="Edit Instruction")
-                 i2i_btn = gr.Button("Apply Edit")
-                 i2i_img = gr.Image(label="Edited Image")
-                 i2i_status = gr.Textbox(label="Status")
-
-                 i2i_examples = []
-                 i2i_dir = DEMO_ROOT / "i2i"
-                 if i2i_dir.exists():
-                     for f in i2i_dir.glob("*.*"):
-                         i2i_examples.append([str(f), "Make it more vibrant."])
-
-                 if len(i2i_examples) > 0:
-                     gr.Examples(
-                         examples=i2i_examples,
-                         inputs=[i2i_in, i2i_prompt],
-                         outputs=[i2i_img, i2i_status],
-                         fn=gpu_handler(app.run_i2i),
-                     )
-
-                 i2i_btn.click(
-                     gpu_handler(app.run_i2i),
-                     inputs=[i2i_in, i2i_prompt],
-                     outputs=[i2i_img, i2i_status]
-                 )
-
-         # End Tabs
-
-     return demo
-
-
- # ----------------------------------------------------------------------
- # 4. Entry Point for Space
- # ----------------------------------------------------------------------

  @spaces.GPU
- def main():
-     app = OmadaDemo(
-         train_config=str(MMADA_ROOT / "inference/demo/demo.yaml"),
-         checkpoint=os.getenv("MODEL_CHECKPOINT_DIR", "_ckpt_cache/omada"),
-         device="cpu"
      )

-     demo = build_zero_gpu_demo(app)
-     demo.launch(server_name="0.0.0.0", server_port=7860, share=False)


  if __name__ == "__main__":
-     main()
+ """
+ ZeroGPU-friendly Gradio entrypoint for OMada demo.
+
+ - Downloads checkpoint + assets + style centroids from Hugging Face Hub
+ - Instantiates OmadaDemo once (global)
+ - Exposes 10 modalities via Gradio tabs
+ - Uses @spaces.GPU only on inference handlers so GPU is allocated per request
+
+ Environment overrides:
+     MODEL_REPO_ID (default: jaeikkim/AIDAS-Omni-Modal-Diffusion)
+     MODEL_REVISION (default: main)
+     ASSET_REPO_ID (default: jaeikkim/AIDAS-Omni-Modal-Diffusion-assets)
+     ASSET_REVISION (default: main)
+     STYLE_REPO_ID (default: jaeikkim/aidas-style-centroid)
+     STYLE_REVISION (default: main)
+     HF_TOKEN (optional, for private model/dataset)
+     TRAIN_CONFIG_PATH (default: MMaDA/inference/demo/demo.yaml)
+     DEVICE (default: cuda)
+ """
+
  import os
  import sys
+ import subprocess
+ import importlib
  from pathlib import Path
+
+ import gradio as gr
  import spaces
+ from packaging.version import parse as parse_version
+
+ # ---------------------------
+ # Project roots & sys.path
+ # ---------------------------

  PROJECT_ROOT = Path(__file__).resolve().parent
  MMADA_ROOT = PROJECT_ROOT / "MMaDA"
  if str(MMADA_ROOT) not in sys.path:
      sys.path.insert(0, str(MMADA_ROOT))

+ EMOVA_ROOT = PROJECT_ROOT / "EMOVA_speech_tokenizer"
+ if str(EMOVA_ROOT) not in sys.path:
+     sys.path.insert(0, str(EMOVA_ROOT))


+ # ---------------------------
+ # HuggingFace Hub helper
+ # ---------------------------

+ def ensure_hf_hub(target: str = "0.36.0"):
      """
+     Make sure huggingface_hub stays <1.0 to satisfy transformers/tokenizers.
+
+     The Spaces base image may pull in a newer version via gradio, so we pin it.
      """
+     try:
+         import huggingface_hub as hub
+     except ImportError:
+         subprocess.check_call(
+             [sys.executable, "-m", "pip", "install", f"huggingface-hub=={target}", "--no-cache-dir"]
+         )
+         import huggingface_hub as hub

+     if parse_version(hub.__version__) >= parse_version("1.0.0"):
+         subprocess.check_call(
+             [sys.executable, "-m", "pip", "install", f"huggingface-hub=={target}", "--no-cache-dir"]
+         )
+         hub = importlib.reload(hub)
+
+     # Backfill missing constants in older hub versions to avoid AttributeError.
+     try:
+         import huggingface_hub.constants as hub_consts  # type: ignore
+     except Exception:
+         hub_consts = None
+     if hub_consts and not hasattr(hub_consts, "HF_HUB_ENABLE_HF_TRANSFER"):
+         setattr(hub_consts, "HF_HUB_ENABLE_HF_TRANSFER", False)
+     return hub
+
+
+ snapshot_download = ensure_hf_hub().snapshot_download
+
+
+ # ---------------------------
+ # Imports from OMada demo
+ # ---------------------------
+
+ from inference.gradio_multimodal_demo_inst import (  # noqa: E402
+     OmadaDemo,
+     CUSTOM_CSS,
+     FORCE_LIGHT_MODE_JS,
+ )
+
+
+ # ---------------------------
+ # HF download helpers
+ # ---------------------------
+
+ def download_assets() -> Path:
+     """Download demo assets (logo + sample prompts/media) and return the root path."""
+     repo_id = os.getenv("ASSET_REPO_ID", "jaeikkim/AIDAS-Omni-Modal-Diffusion-assets")
+     revision = os.getenv("ASSET_REVISION", "main")
+     token = os.getenv("HF_TOKEN")
+     cache_dir = PROJECT_ROOT / "_asset_cache"
+     cache_dir.mkdir(parents=True, exist_ok=True)
+
+     return Path(
+         snapshot_download(
+             repo_id=repo_id,
+             revision=revision,
+             repo_type="dataset",
+             local_dir=cache_dir,
+             local_dir_use_symlinks=False,
+             token=token,
+         )
+     )


+ def download_style() -> Path:
+     """Download style centroid dataset and return the root path."""
+     repo_id = os.getenv("STYLE_REPO_ID", "jaeikkim/aidas-style-centroid")
+     revision = os.getenv("STYLE_REVISION", "main")
+     token = os.getenv("HF_TOKEN")
+     cache_dir = PROJECT_ROOT / "_style_cache"
+     cache_dir.mkdir(parents=True, exist_ok=True)
+
+     return Path(
+         snapshot_download(
+             repo_id=repo_id,
+             revision=revision,
+             repo_type="dataset",
+             local_dir=cache_dir,
+             local_dir_use_symlinks=False,
+             token=token,
+         )
+     )


+ def download_checkpoint() -> Path:
+     """Download checkpoint snapshot and return an `unwrapped_model` directory."""
+     repo_id = os.getenv("MODEL_REPO_ID", "jaeikkim/AIDAS-Omni-Modal-Diffusion")
+     revision = os.getenv("MODEL_REVISION", "main")
+     token = os.getenv("HF_TOKEN")
+     cache_dir = PROJECT_ROOT / "_ckpt_cache"
+     cache_dir.mkdir(parents=True, exist_ok=True)
+
+     snapshot_path = Path(
+         snapshot_download(
+             repo_id=repo_id,
+             revision=revision,
+             repo_type="model",
+             local_dir=cache_dir,
+             local_dir_use_symlinks=False,
+             token=token,
          )
+     )
+
+     # If snapshot itself is unwrapped_model, return it; otherwise look for nested dir,
+     # and finally alias via symlink.
+     if snapshot_path.name == "unwrapped_model":
+         return snapshot_path
+
+     nested = snapshot_path / "unwrapped_model"
+     if nested.is_dir():
+         return nested
+
+     aliased = snapshot_path.parent / "unwrapped_model"
+     if not aliased.exists():
+         aliased.symlink_to(snapshot_path, target_is_directory=True)
+     return aliased
+
+
+ # ---------------------------
+ # Global OmadaDemo instance
+ # ---------------------------
+
+ APP = None  # type: ignore
+
+
+ def get_app() -> OmadaDemo:
+     global APP
+     if APP is not None:
+         return APP
+
+     # Download everything once
+     ckpt_dir = download_checkpoint()
+     asset_root = download_assets()
+     style_root = download_style()
+
+     # Wire style centroids to expected locations
+     style_targets = [
+         MMADA_ROOT / "models" / "speech_tokenization" / "condition_style_centroid",
+         PROJECT_ROOT
+         / "EMOVA_speech_tokenizer"
+         / "emova_speech_tokenizer"
+         / "speech_tokenization"
+         / "condition_style_centroid",
+     ]
+     for starget in style_targets:
+         if not starget.exists():
+             starget.parent.mkdir(parents=True, exist_ok=True)
+             starget.symlink_to(style_root, target_is_directory=True)
+
+     # Choose train config
+     default_cfg = PROJECT_ROOT / "MMaDA" / "inference" / "demo" / "demo.yaml"
+     legacy_cfg = PROJECT_ROOT / "MMaDA" / "configs" / "mmada_demo.yaml"
+     train_config = os.getenv("TRAIN_CONFIG_PATH")
+     if not train_config:
+         train_config = str(default_cfg if default_cfg.exists() else legacy_cfg)
+
+     # Device: in ZeroGPU environment, "cuda" is virtualized and only actually
+     # attached inside @spaces.GPU handlers.
+     device = os.getenv("DEVICE", "cuda")
+
+     APP = OmadaDemo(train_config=train_config, checkpoint=str(ckpt_dir), device=device)
+     return APP
+
+
+ # ---------------------------
+ # ZeroGPU-wrapped handlers
+ # ---------------------------
+
+ @spaces.GPU
+ def t2s_handler(
+     text,
+     max_tokens,
+     steps,
+     block_len,
+     temperature,
+     cfg_scale,
+     gender,
+     emotion,
+     speed,
+     pitch,
+ ):
+     app = get_app()
+     audio, status = app.run_t2s(
+         text=text,
+         max_new_tokens=int(max_tokens),
+         steps=int(steps),
+         block_length=int(block_len),
+         temperature=float(temperature),
+         cfg_scale=float(cfg_scale),
+         gender_choice=gender,
+         emotion_choice=emotion,
+         speed_choice=speed,
+         pitch_choice=pitch,
+     )
+     return audio, status
+
+
+ @spaces.GPU
+ def s2s_handler(
+     audio_path,
+     max_tokens,
+     steps,
+     block_len,
+     temperature,
+     cfg_scale,
+ ):
+     app = get_app()
+     audio, status = app.run_s2s(
+         audio_path=audio_path,
+         max_new_tokens=int(max_tokens),
+         steps=int(steps),
+         block_length=int(block_len),
+         temperature=float(temperature),
+         cfg_scale=float(cfg_scale),
+     )
+     return audio, status
+
+
+ @spaces.GPU
+ def s2t_handler(
+     audio_path,
+     steps,
+     block_len,
+     max_tokens,
+     remasking,
+ ):
+     app = get_app()
+     text, status = app.run_s2t(
+         audio_path=audio_path,
+         steps=int(steps),
+         block_length=int(block_len),
+         max_new_tokens=int(max_tokens),
+         remasking=str(remasking),
+     )
+     return text, status
+
+
+ @spaces.GPU
+ def v2t_handler(
+     video,
+     steps,
+     block_len,
+     max_tokens,
+ ):
+     app = get_app()
+     text, status = app.run_v2t(
+         video_path=video,
+         steps=int(steps),
+         block_length=int(block_len),
+         max_new_tokens=int(max_tokens),
+     )
+     return text, status
+
+
+ @spaces.GPU
+ def v2s_handler(
+     video,
+     message,
+     max_tokens,
+     steps,
+     block_len,
+     temperature,
+     cfg_scale,
+ ):
+     app = get_app()
+     audio, status = app.run_v2s(
+         video_path=video,
+         message=message,
+         max_new_tokens=int(max_tokens),
+         steps=int(steps),
+         block_length=int(block_len),
+         temperature=float(temperature),
+         cfg_scale=float(cfg_scale),
+     )
+     return audio, status
+
+
+ @spaces.GPU
+ def i2s_handler(
+     image,
+     message,
+     max_tokens,
+     steps,
+     block_len,
+     temperature,
+     cfg_scale,
+ ):
+     app = get_app()
+     audio, status = app.run_i2s(
+         image=image,
+         message=message,
+         max_new_tokens=int(max_tokens),
+         steps=int(steps),
+         block_length=int(block_len),
+         temperature=float(temperature),
+         cfg_scale=float(cfg_scale),
+     )
+     return audio, status
+
+
+ @spaces.GPU
+ def chat_handler(
+     message,
+     max_tokens,
+     steps,
+     block_len,
+     temperature,
+ ):
+     app = get_app()
+     text, status = app.run_chat(
+         message=message,
+         max_new_tokens=int(max_tokens),
+         steps=int(steps),
+         block_length=int(block_len),
+         temperature=float(temperature),
+     )
+     return text, status
+
+
+ @spaces.GPU
+ def mmu_handler(
+     image_a,
+     image_b,
+     question,
+     max_tokens,
+     steps,
+     block_len,
+     temperature,
+ ):
+     app = get_app()
+     text, status = app.run_mmu_dual(
+         image_a=image_a,
+         image_b=image_b,
+         message=question,
+         max_new_tokens=int(max_tokens),
+         steps=int(steps),
+         block_length=int(block_len),
+         temperature=float(temperature),
+     )
+     return text, status


  @spaces.GPU
+ def t2i_handler(
+     prompt,
+     timesteps,
+     temperature,
+     guidance,
+ ):
+     app = get_app()
+     image, status = app.run_t2i(
+         prompt=prompt,
+         timesteps=int(timesteps),
+         temperature=float(temperature),
+         guidance_scale=float(guidance),
      )
+     return image, status

+
+ @spaces.GPU
+ def i2i_handler(
+     instruction,
+     image,
+     timesteps,
+     temperature,
+     guidance,
+ ):
+     app = get_app()
+     image_out, status = app.run_i2i(
+         instruction=instruction,
+         source_image=image,
+         timesteps=int(timesteps),
+         temperature=float(temperature),
+         guidance_scale=float(guidance),
+     )
+     return image_out, status
+
+
+ # ---------------------------
+ # Gradio UI (10 tabs)
+ # ---------------------------
+
+ theme = gr.themes.Soft(primary_hue="blue", neutral_hue="gray")
+
+ with gr.Blocks(
+     title="AIDAS Lab @ SNU - OMni-modal Diffusion",
+     css=CUSTOM_CSS,
+     theme=theme,
+     js=FORCE_LIGHT_MODE_JS,
+ ) as demo:
+     gr.Markdown(
+         "## Omni-modal Diffusion Foundation Model\n"
+         "### AIDAS Lab @ SNU"
+     )
+
+     with gr.Tab("Text → Speech (T2S)"):
+         with gr.Row():
+             t2s_text = gr.Textbox(
+                 label="Input text",
+                 lines=4,
+                 placeholder="Type the speech you want to synthesize...",
+             )
+             t2s_audio = gr.Audio(label="Generated speech", type="numpy")
+         t2s_status = gr.Textbox(label="Status", interactive=False)
+         with gr.Accordion("Advanced settings", open=False):
+             t2s_max_tokens = gr.Slider(2, 512, value=384, step=2, label="Speech token length")
+             t2s_steps = gr.Slider(2, 512, value=128, step=2, label="Total refinement steps")
+             t2s_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
+             t2s_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
+             t2s_cfg = gr.Slider(0.0, 6.0, value=3.5, step=0.1, label="CFG scale")
+             with gr.Row():
+                 t2s_gender = gr.Dropdown(["random", "female", "male"], value="random", label="Gender")
+                 t2s_emotion = gr.Dropdown(["random", "angry", "happy", "neutral", "sad"], value="random", label="Emotion")
+             with gr.Row():
+                 t2s_speed = gr.Dropdown(["random", "normal", "fast", "slow"], value="random", label="Speed")
+                 t2s_pitch = gr.Dropdown(["random", "normal", "high", "low"], value="random", label="Pitch")
+         t2s_btn = gr.Button("Generate speech", variant="primary")
+         t2s_btn.click(
+             t2s_handler,
+             inputs=[
+                 t2s_text,
+                 t2s_max_tokens,
+                 t2s_steps,
+                 t2s_block,
+                 t2s_temperature,
+                 t2s_cfg,
+                 t2s_gender,
+                 t2s_emotion,
+                 t2s_speed,
+                 t2s_pitch,
+             ],
+             outputs=[t2s_audio, t2s_status],
+         )
+
+     with gr.Tab("Speech → Speech (S2S)"):
+         s2s_audio_in = gr.Audio(type="filepath", label="Source speech", sources=["microphone", "upload"])
+         s2s_audio_out = gr.Audio(type="numpy", label="Reply speech")
+         s2s_status = gr.Textbox(label="Status", interactive=False)
+         with gr.Accordion("Advanced settings", open=False):
+             s2s_max_tokens = gr.Slider(2, 512, value=256, step=2, label="Reply token length")
+             s2s_steps = gr.Slider(2, 512, value=128, step=2, label="Refinement steps")
+             s2s_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
+             s2s_temperature = gr.Slider(0.0, 2.0, value=0.0, step=0.05, label="Sampling temperature")
+             s2s_cfg = gr.Slider(0.0, 6.0, value=4.0, step=0.1, label="CFG scale")
+         s2s_btn = gr.Button("Generate reply speech", variant="primary")
+         s2s_btn.click(
+             s2s_handler,
+             inputs=[
+                 s2s_audio_in,
+                 s2s_max_tokens,
+                 s2s_steps,
+                 s2s_block,
+                 s2s_temperature,
+                 s2s_cfg,
+             ],
+             outputs=[s2s_audio_out, s2s_status],
+         )
+
+     with gr.Tab("Speech → Text (S2T)"):
+         s2t_audio_in = gr.Audio(type="filepath", label="Speech input", sources=["microphone", "upload"])
+         s2t_text_out = gr.Textbox(label="Transcription", lines=4)
+         s2t_status = gr.Textbox(label="Status", interactive=False)
+         with gr.Accordion("Advanced settings", open=False):
+             s2t_steps = gr.Slider(2, 512, value=128, step=2, label="Denoising steps")
+             s2t_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
+             s2t_max_tokens = gr.Slider(2, 512, value=128, step=2, label="Max new tokens")
+             s2t_remasking = gr.Dropdown(
+                 ["low_confidence", "random"],
+                 value="low_confidence",
+                 label="Remasking strategy",
+             )
+         s2t_btn = gr.Button("Transcribe", variant="primary")
+         s2t_btn.click(
+             s2t_handler,
+             inputs=[s2t_audio_in, s2t_steps, s2t_block, s2t_max_tokens, s2t_remasking],
+             outputs=[s2t_text_out, s2t_status],
+         )
+
+     with gr.Tab("Video → Text (V2T)"):
+         v2t_video_in = gr.Video(
+             label="Upload or record video",
+             height=256,
+             sources=["upload", "webcam"],
+         )
+         v2t_text_out = gr.Textbox(label="Caption / answer", lines=4)
+         v2t_status = gr.Textbox(label="Status", interactive=False)
+         with gr.Accordion("Advanced settings", open=False):
+             v2t_steps = gr.Slider(2, 512, value=64, step=2, label="Denoising steps")
+             v2t_block = gr.Slider(2, 512, value=64, step=2, label="Block length")
+             v2t_max_tokens = gr.Slider(2, 512, value=64, step=2, label="Max new tokens")
+         v2t_btn = gr.Button("Generate caption", variant="primary")
+         v2t_btn.click(
+             v2t_handler,
+             inputs=[v2t_video_in, v2t_steps, v2t_block, v2t_max_tokens],
+             outputs=[v2t_text_out, v2t_status],
+         )
+
+     with gr.Tab("Video → Speech (V2S)"):
+         v2s_video_in = gr.Video(
+             label="Upload or record video",
+             height=256,
+             sources=["upload", "webcam"],
+         )
+         v2s_prompt = gr.Textbox(
+             label="Optional instruction",
+             placeholder="(Optional) e.g., 'Describe this scene in spoken form.'",
+         )
+         v2s_audio_out = gr.Audio(type="numpy", label="Generated speech")
+         v2s_status = gr.Textbox(label="Status", interactive=False)
+         with gr.Accordion("Advanced settings", open=False):
+             v2s_max_tokens = gr.Slider(2, 512, value=256, step=2, label="Reply token length")
+             v2s_steps = gr.Slider(2, 512, value=128, step=2, label="Refinement steps")
+             v2s_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
+             v2s_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
+             v2s_cfg = gr.Slider(0.0, 6.0, value=3.0, step=0.1, label="CFG scale")
+         v2s_btn = gr.Button("Generate speech from video", variant="primary")
+         v2s_btn.click(
+             v2s_handler,
+             inputs=[
+                 v2s_video_in,
+                 v2s_prompt,
+                 v2s_max_tokens,
+                 v2s_steps,
+                 v2s_block,
+                 v2s_temperature,
+                 v2s_cfg,
+             ],
+             outputs=[v2s_audio_out, v2s_status],
+         )
+
+     with gr.Tab("Image → Speech (I2S)"):
+         i2s_image_in = gr.Image(type="pil", label="Image input", sources=["upload"])
+         i2s_prompt = gr.Textbox(
+             label="Optional question",
+             placeholder="(Optional) e.g., 'Describe this image aloud.'",
+         )
+         i2s_audio_out = gr.Audio(type="numpy", label="Spoken description")
+         i2s_status = gr.Textbox(label="Status", interactive=False)
+         with gr.Accordion("Advanced settings", open=False):
+             i2s_max_tokens = gr.Slider(2, 512, value=256, step=2, label="Reply token length")
+             i2s_steps = gr.Slider(2, 512, value=256, step=2, label="Refinement steps")
+             i2s_block = gr.Slider(2, 512, value=256, step=2, label="Block length")
+             i2s_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
+             i2s_cfg = gr.Slider(0.0, 6.0, value=3.0, step=0.1, label="CFG scale")
+         i2s_btn = gr.Button("Generate spoken description", variant="primary")
+         i2s_btn.click(
+             i2s_handler,
+             inputs=[
+                 i2s_image_in,
+                 i2s_prompt,
+                 i2s_max_tokens,
+                 i2s_steps,
+                 i2s_block,
+                 i2s_temperature,
+                 i2s_cfg,
+             ],
+             outputs=[i2s_audio_out, i2s_status],
+         )
+
+     with gr.Tab("Text Chat"):
+         chat_in = gr.Textbox(
+             label="Message",
+             lines=4,
+             placeholder="Ask anything. The model will reply in text.",
+         )
+         chat_out = gr.Textbox(label="Assistant reply", lines=6)
+         chat_status = gr.Textbox(label="Status", interactive=False)
+         with gr.Accordion("Advanced settings", open=False):
+             chat_max_tokens = gr.Slider(2, 512, value=64, step=2, label="Reply max tokens")
+             chat_steps = gr.Slider(2, 512, value=64, step=2, label="Refinement steps")
+             chat_block = gr.Slider(2, 512, value=64, step=2, label="Block length")
+             chat_temperature_slider = gr.Slider(0.0, 2.0, value=0.8, step=0.05, label="Sampling temperature")
+         chat_btn = gr.Button("Send", variant="primary")
+         chat_btn.click(
+             chat_handler,
+             inputs=[
+                 chat_in,
+                 chat_max_tokens,
+                 chat_steps,
+                 chat_block,
+                 chat_temperature_slider,
+             ],
+             outputs=[chat_out, chat_status],
+         )
+
+     with gr.Tab("MMU (2 images → text)"):
+         mmu_img_a = gr.Image(type="pil", label="Image A", sources=["upload"])
+         mmu_img_b = gr.Image(type="pil", label="Image B", sources=["upload"])
+         mmu_question = gr.Textbox(
+             label="Question",
+             lines=3,
+             placeholder="Ask about the relationship or differences between the two images.",
+         )
+         mmu_answer = gr.Textbox(label="Answer", lines=6)
+         mmu_status = gr.Textbox(label="Status", interactive=False)
+         with gr.Accordion("Advanced settings", open=False):
+             mmu_max_tokens = gr.Slider(2, 512, value=256, step=2, label="Answer max tokens")
+             mmu_steps = gr.Slider(2, 512, value=256, step=2, label="Refinement steps")
+             mmu_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
+             mmu_temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Sampling temperature")
+         mmu_btn = gr.Button("Answer about the two images", variant="primary")
+         mmu_btn.click(
+             mmu_handler,
+             inputs=[
+                 mmu_img_a,
+                 mmu_img_b,
+                 mmu_question,
+                 mmu_max_tokens,
+                 mmu_steps,
+                 mmu_block,
+                 mmu_temperature,
+             ],
+             outputs=[mmu_answer, mmu_status],
+         )
+
+     with gr.Tab("Text → Image (T2I)"):
+         t2i_prompt = gr.Textbox(
+             label="Prompt",
+             lines=4,
+             placeholder="Describe the image you want to generate...",
+         )
+         t2i_image_out = gr.Image(label="Generated image")
+         t2i_status = gr.Textbox(label="Status", interactive=False)
+         with gr.Accordion("Advanced settings", open=False):
+             t2i_timesteps = gr.Slider(4, 128, value=32, step=2, label="Timesteps")
+             t2i_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
+             t2i_guidance = gr.Slider(0.0, 8.0, value=3.5, step=0.1, label="CFG scale")
+         t2i_btn = gr.Button("Generate image", variant="primary")
+         t2i_btn.click(
+             t2i_handler,
+             inputs=[t2i_prompt, t2i_timesteps, t2i_temperature, t2i_guidance],
+             outputs=[t2i_image_out, t2i_status],
+         )
+
+     with gr.Tab("Image Editing (I2I)"):
+         i2i_image_in = gr.Image(type="pil", label="Reference image", sources=["upload"])
+         i2i_instr = gr.Textbox(
+             label="Editing instruction",
+             lines=4,
+             placeholder="Describe how you want to edit the image...",
+         )
+         i2i_image_out = gr.Image(label="Edited image")
+         i2i_status = gr.Textbox(label="Status", interactive=False)
+         with gr.Accordion("Advanced settings", open=False):
+             i2i_timesteps = gr.Slider(4, 128, value=18, step=2, label="Timesteps")
+             i2i_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
+             i2i_guidance = gr.Slider(0.0, 8.0, value=3.5, step=0.1, label="CFG scale")
+         i2i_btn = gr.Button("Apply edit", variant="primary")
+         i2i_btn.click(
+             i2i_handler,
+             inputs=[i2i_instr, i2i_image_in, i2i_timesteps, i2i_temperature, i2i_guidance],
+             outputs=[i2i_image_out, i2i_status],
+         )


  if __name__ == "__main__":
+     demo.launch()
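
The rewritten entrypoint is configured entirely through the environment variables listed in its module docstring. As a minimal sketch only (not part of the commit, assuming the default public repo IDs and a checkout with app.py at the repository root), launching it locally with those overrides could look like this:

    import os
    import subprocess
    import sys

    # Values shown are the documented defaults; override only what differs in your setup.
    env = dict(os.environ)
    env.setdefault("MODEL_REPO_ID", "jaeikkim/AIDAS-Omni-Modal-Diffusion")
    env.setdefault("ASSET_REPO_ID", "jaeikkim/AIDAS-Omni-Modal-Diffusion-assets")
    env.setdefault("STYLE_REPO_ID", "jaeikkim/aidas-style-centroid")
    env.setdefault("TRAIN_CONFIG_PATH", "MMaDA/inference/demo/demo.yaml")
    env.setdefault("DEVICE", "cuda")
    # env["HF_TOKEN"] = "hf_..."  # only needed if the model/dataset repos are private

    # Run app.py in a child process with the chosen environment;
    # demo.launch() is called by its __main__ guard.
    subprocess.run([sys.executable, "app.py"], env=env, check=True)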