jake committed
Commit df89a6a · 1 Parent(s): 05fc139
Files changed (1):
  1. app.py +239 -65
app.py CHANGED
@@ -168,6 +168,86 @@ def download_checkpoint() -> Path:
     return aliased


+# ---------------------------
+# Assets & examples from HF dataset
+# ---------------------------
+
+ASSET_ROOT = download_assets()
+DEMO_ROOT = ASSET_ROOT / "demo"
+
+LOGO_PATH = DEMO_ROOT / "logo.png"
+T2S_TEXT_PATH = DEMO_ROOT / "t2s" / "text.txt"
+CHAT_TEXT_PATH = DEMO_ROOT / "chat" / "text.txt"
+T2I_TEXT_PATH = DEMO_ROOT / "t2i" / "text.txt"
+
+
+def _load_text_examples(path: Path):
+    if not path.exists():
+        return []
+    try:
+        lines = [
+            line.strip()
+            for line in path.read_text(encoding="utf-8").splitlines()
+            if line.strip()
+        ]
+    except Exception:
+        return []
+    return [[line] for line in lines]
+
+
+def _load_media_examples(subdir: str, suffixes):
+    d = DEMO_ROOT / subdir
+    if not d.exists():
+        return []
+    examples = []
+    for p in sorted(d.iterdir()):
+        if p.is_file() and p.suffix.lower() in suffixes:
+            examples.append([str(p)])
+    return examples
+
+
+# Text-based examples
+T2S_EXAMPLES = _load_text_examples(T2S_TEXT_PATH)
+CHAT_EXAMPLES = _load_text_examples(CHAT_TEXT_PATH)
+T2I_EXAMPLES = _load_text_examples(T2I_TEXT_PATH)
+
+# Audio / video / image examples
+_AUDIO_SUFFIXES = {".wav", ".mp3", ".flac", ".ogg"}
+_VIDEO_SUFFIXES = {".mp4", ".mov", ".avi", ".webm"}
+_IMAGE_SUFFIXES = {".png", ".jpg", ".jpeg", ".webp"}
+
+S2T_EXAMPLES = _load_media_examples("s2t", _AUDIO_SUFFIXES)
+V2T_EXAMPLES = _load_media_examples("v2t", _VIDEO_SUFFIXES)
+S2S_EXAMPLES = _load_media_examples("s2s", _AUDIO_SUFFIXES)
+if not S2S_EXAMPLES and S2T_EXAMPLES:
+    S2S_EXAMPLES = S2T_EXAMPLES[: min(4, len(S2T_EXAMPLES))]
+
+V2S_EXAMPLES = _load_media_examples("v2s", _VIDEO_SUFFIXES)
+if not V2S_EXAMPLES and V2T_EXAMPLES:
+    V2S_EXAMPLES = V2T_EXAMPLES[: min(4, len(V2T_EXAMPLES))]
+
+I2S_EXAMPLES = _load_media_examples("i2s", _IMAGE_SUFFIXES)
+
+# MMU: 2 images + question
+MMU_DIR = DEMO_ROOT / "mmu"
+MMU_EXAMPLES = []
+if MMU_DIR.exists():
+    mmu_imgs = [
+        p for p in sorted(MMU_DIR.iterdir())
+        if p.is_file() and p.suffix.lower() in _IMAGE_SUFFIXES
+    ]
+    if len(mmu_imgs) >= 2:
+        MMU_EXAMPLES = [[
+            str(mmu_imgs[0]),
+            str(mmu_imgs[1]),
+            "What are the differences between the two images?"
+        ]]
+
+# If there are no i2s examples but an mmu example exists, reuse its first image
+if not I2S_EXAMPLES and MMU_EXAMPLES:
+    I2S_EXAMPLES = [[MMU_EXAMPLES[0][0]]]
+
+
 # ---------------------------
 # Global OmadaDemo instance
 # ---------------------------
@@ -180,9 +260,8 @@ def get_app() -> OmadaDemo:
     if APP is not None:
         return APP

-    # Download everything once
+    # Download ckpt + style centroids once
     ckpt_dir = download_checkpoint()
-    asset_root = download_assets()
     style_root = download_style()

     # Wire style centroids to expected locations
@@ -440,11 +519,21 @@ with gr.Blocks(
     theme=theme,
     js=FORCE_LIGHT_MODE_JS,
 ) as demo:
+    # Logo (if present)
+    if LOGO_PATH.exists():
+        gr.Image(
+            value=str(LOGO_PATH),
+            show_label=False,
+            height=140,
+            interactive=False,
+        )
+
     gr.Markdown(
         "## Omni-modal Diffusion Foundation Model\n"
         "### AIDAS Lab @ SNU"
     )

+    # ---------- T2S ----------
     with gr.Tab("Text → Speech (T2S)"):
         with gr.Row():
             t2s_text = gr.Textbox(
@@ -484,6 +573,15 @@ with gr.Blocks(
             outputs=[t2s_audio, t2s_status],
         )

+        if T2S_EXAMPLES:
+            gr.Markdown("**Sample prompts**")
+            gr.Examples(
+                examples=T2S_EXAMPLES,
+                inputs=[t2s_text],
+                examples_per_page=4,
+            )
+
+    # ---------- S2S ----------
     with gr.Tab("Speech → Speech (S2S)"):
         s2s_audio_in = gr.Audio(type="filepath", label="Source speech", sources=["microphone", "upload"])
         s2s_audio_out = gr.Audio(type="numpy", label="Reply speech")
@@ -508,6 +606,15 @@ with gr.Blocks(
             outputs=[s2s_audio_out, s2s_status],
         )

+        if S2S_EXAMPLES:
+            gr.Markdown("**Sample S2S clips**")
+            gr.Examples(
+                examples=S2S_EXAMPLES,
+                inputs=[s2s_audio_in],
+                examples_per_page=4,
+            )
+
+    # ---------- S2T ----------
     with gr.Tab("Speech → Text (S2T)"):
         s2t_audio_in = gr.Audio(type="filepath", label="Speech input", sources=["microphone", "upload"])
         s2t_text_out = gr.Textbox(label="Transcription", lines=4)
@@ -528,6 +635,15 @@ with gr.Blocks(
             outputs=[s2t_text_out, s2t_status],
         )

+        if S2T_EXAMPLES:
+            gr.Markdown("**Sample S2T clips**")
+            gr.Examples(
+                examples=S2T_EXAMPLES,
+                inputs=[s2t_audio_in],
+                examples_per_page=4,
+            )
+
+    # ---------- V2T ----------
     with gr.Tab("Video → Text (V2T)"):
         v2t_video_in = gr.Video(
             label="Upload or record video",
@@ -547,6 +663,15 @@ with gr.Blocks(
             outputs=[v2t_text_out, v2t_status],
         )

+        if V2T_EXAMPLES:
+            gr.Markdown("**Sample videos**")
+            gr.Examples(
+                examples=V2T_EXAMPLES,
+                inputs=[v2t_video_in],
+                examples_per_page=4,
+            )
+
+    # ---------- V2S ----------
     with gr.Tab("Video → Speech (V2S)"):
         v2s_video_in = gr.Video(
             label="Upload or record video",
@@ -580,35 +705,64 @@ with gr.Blocks(
             outputs=[v2s_audio_out, v2s_status],
         )

-    with gr.Tab("Image → Speech (I2S)"):
-        i2s_image_in = gr.Image(type="pil", label="Image input", sources=["upload"])
-        i2s_prompt = gr.Textbox(
-            label="Optional question",
-            placeholder="(Optional) e.g., 'Describe this image aloud.'",
+        if V2S_EXAMPLES:
+            gr.Markdown("**Sample videos**")
+            gr.Examples(
+                examples=V2S_EXAMPLES,
+                inputs=[v2s_video_in],
+                examples_per_page=4,
+            )
+
+    # ---------- T2I ----------
+    with gr.Tab("Text → Image (T2I)"):
+        t2i_prompt = gr.Textbox(
+            label="Prompt",
+            lines=4,
+            placeholder="Describe the image you want to generate...",
         )
-        i2s_audio_out = gr.Audio(type="numpy", label="Spoken description")
-        i2s_status = gr.Textbox(label="Status", interactive=False)
+        t2i_image_out = gr.Image(label="Generated image")
+        t2i_status = gr.Textbox(label="Status", interactive=False)
         with gr.Accordion("Advanced settings", open=False):
-            i2s_max_tokens = gr.Slider(2, 512, value=256, step=2, label="Reply token length")
-            i2s_steps = gr.Slider(2, 512, value=256, step=2, label="Refinement steps")
-            i2s_block = gr.Slider(2, 512, value=256, step=2, label="Block length")
-            i2s_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
-            i2s_cfg = gr.Slider(0.0, 6.0, value=3.0, step=0.1, label="CFG scale")
-        i2s_btn = gr.Button("Generate spoken description", variant="primary")
-        i2s_btn.click(
-            i2s_handler,
-            inputs=[
-                i2s_image_in,
-                i2s_prompt,
-                i2s_max_tokens,
-                i2s_steps,
-                i2s_block,
-                i2s_temperature,
-                i2s_cfg,
-            ],
-            outputs=[i2s_audio_out, i2s_status],
+            t2i_timesteps = gr.Slider(4, 128, value=32, step=2, label="Timesteps")
+            t2i_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
+            t2i_guidance = gr.Slider(0.0, 8.0, value=3.5, step=0.1, label="CFG scale")
+        t2i_btn = gr.Button("Generate image", variant="primary")
+        t2i_btn.click(
+            t2i_handler,
+            inputs=[t2i_prompt, t2i_timesteps, t2i_temperature, t2i_guidance],
+            outputs=[t2i_image_out, t2i_status],
+        )
+
+        if T2I_EXAMPLES:
+            gr.Markdown("**Sample prompts**")
+            gr.Examples(
+                examples=T2I_EXAMPLES,
+                inputs=[t2i_prompt],
+                examples_per_page=4,
+            )
+
+    # ---------- I2I ----------
+    with gr.Tab("Image Editing (I2I)"):
+        i2i_image_in = gr.Image(type="pil", label="Reference image", sources=["upload"])
+        i2i_instr = gr.Textbox(
+            label="Editing instruction",
+            lines=4,
+            placeholder="Describe how you want to edit the image...",
+        )
+        i2i_image_out = gr.Image(label="Edited image")
+        i2i_status = gr.Textbox(label="Status", interactive=False)
+        with gr.Accordion("Advanced settings", open=False):
+            i2i_timesteps = gr.Slider(4, 128, value=18, step=2, label="Timesteps")
+            i2i_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
+            i2i_guidance = gr.Slider(0.0, 8.0, value=3.5, step=0.1, label="CFG scale")
+        i2i_btn = gr.Button("Apply edit", variant="primary")
+        i2i_btn.click(
+            i2i_handler,
+            inputs=[i2i_instr, i2i_image_in, i2i_timesteps, i2i_temperature, i2i_guidance],
+            outputs=[i2i_image_out, i2i_status],
         )

+    # ---------- Chat ----------
     with gr.Tab("Text Chat"):
         chat_in = gr.Textbox(
             label="Message",
@@ -635,6 +789,55 @@ with gr.Blocks(
             outputs=[chat_out, chat_status],
         )

+        if CHAT_EXAMPLES:
+            gr.Markdown("**Sample prompts**")
+            gr.Examples(
+                examples=CHAT_EXAMPLES,
+                inputs=[chat_in],
+                examples_per_page=4,
+            )
+
+
+    # ---------- I2S ----------
+    with gr.Tab("Image → Speech (I2S)"):
+        i2s_image_in = gr.Image(type="pil", label="Image input", sources=["upload"])
+        i2s_prompt = gr.Textbox(
+            label="Optional question",
+            placeholder="(Optional) e.g., 'Describe this image aloud.'",
+        )
+        i2s_audio_out = gr.Audio(type="numpy", label="Spoken description")
+        i2s_status = gr.Textbox(label="Status", interactive=False)
+        with gr.Accordion("Advanced settings", open=False):
+            i2s_max_tokens = gr.Slider(2, 512, value=256, step=2, label="Reply token length")
+            i2s_steps = gr.Slider(2, 512, value=256, step=2, label="Refinement steps")
+            i2s_block = gr.Slider(2, 512, value=256, step=2, label="Block length")
+            i2s_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
+            i2s_cfg = gr.Slider(0.0, 6.0, value=3.0, step=0.1, label="CFG scale")
+        i2s_btn = gr.Button("Generate spoken description", variant="primary")
+        i2s_btn.click(
+            i2s_handler,
+            inputs=[
+                i2s_image_in,
+                i2s_prompt,
+                i2s_max_tokens,
+                i2s_steps,
+                i2s_block,
+                i2s_temperature,
+                i2s_cfg,
+            ],
+            outputs=[i2s_audio_out, i2s_status],
+        )
+
+        if I2S_EXAMPLES:
+            gr.Markdown("**Sample images**")
+            gr.Examples(
+                examples=I2S_EXAMPLES,
+                inputs=[i2s_image_in],
+                examples_per_page=4,
+            )
+
+
+    # ---------- MMU ----------
     with gr.Tab("MMU (2 images → text)"):
         mmu_img_a = gr.Image(type="pil", label="Image A", sources=["upload"])
         mmu_img_b = gr.Image(type="pil", label="Image B", sources=["upload"])
@@ -665,45 +868,16 @@ with gr.Blocks(
             outputs=[mmu_answer, mmu_status],
         )

-    with gr.Tab("Text → Image (T2I)"):
-        t2i_prompt = gr.Textbox(
-            label="Prompt",
-            lines=4,
-            placeholder="Describe the image you want to generate...",
-        )
-        t2i_image_out = gr.Image(label="Generated image")
-        t2i_status = gr.Textbox(label="Status", interactive=False)
-        with gr.Accordion("Advanced settings", open=False):
-            t2i_timesteps = gr.Slider(4, 128, value=32, step=2, label="Timesteps")
-            t2i_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
-            t2i_guidance = gr.Slider(0.0, 8.0, value=3.5, step=0.1, label="CFG scale")
-        t2i_btn = gr.Button("Generate image", variant="primary")
-        t2i_btn.click(
-            t2i_handler,
-            inputs=[t2i_prompt, t2i_timesteps, t2i_temperature, t2i_guidance],
-            outputs=[t2i_image_out, t2i_status],
-        )
-
-    with gr.Tab("Image Editing (I2I)"):
-        i2i_image_in = gr.Image(type="pil", label="Reference image", sources=["upload"])
-        i2i_instr = gr.Textbox(
-            label="Editing instruction",
-            lines=4,
-            placeholder="Describe how you want to edit the image...",
-        )
-        i2i_image_out = gr.Image(label="Edited image")
-        i2i_status = gr.Textbox(label="Status", interactive=False)
-        with gr.Accordion("Advanced settings", open=False):
-            i2i_timesteps = gr.Slider(4, 128, value=18, step=2, label="Timesteps")
-            i2i_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
-            i2i_guidance = gr.Slider(0.0, 8.0, value=3.5, step=0.1, label="CFG scale")
-        i2i_btn = gr.Button("Apply edit", variant="primary")
-        i2i_btn.click(
-            i2i_handler,
-            inputs=[i2i_instr, i2i_image_in, i2i_timesteps, i2i_temperature, i2i_guidance],
-            outputs=[i2i_image_out, i2i_status],
-        )
+        if MMU_EXAMPLES:
+            gr.Markdown("**Sample MMU example**")
+            gr.Examples(
+                examples=MMU_EXAMPLES,
+                inputs=[mmu_img_a, mmu_img_b, mmu_question],
+                examples_per_page=1,
+            )

+# I2I examples are omitted for now, since they lack a clear text/image asset layout
+# (if needed, split assets into demo/i2i_prompt.txt + demo/i2i_images/ and wire them up)

 if __name__ == "__main__":
     demo.launch()
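
The closing comment leaves I2I example wiring as a follow-up. A minimal sketch of that wiring, assuming the layout the comment suggests (demo/i2i_prompt.txt with one instruction per line, demo/i2i_images/ with reference images); I2I_TEXT_PATH and _load_i2i_examples are hypothetical names, not part of this commit, and reuse the _load_text_examples / _load_media_examples helpers the commit adds:

# Hypothetical follow-up, not in this commit: pair demo/i2i_images/ with
# demo/i2i_prompt.txt as the comment above suggests.
I2I_TEXT_PATH = DEMO_ROOT / "i2i_prompt.txt"  # assumed file, one instruction per line

def _load_i2i_examples():
    # Reuse this commit's helpers; zip() truncates to the shorter list,
    # so unmatched images or instructions are simply dropped.
    prompts = [row[0] for row in _load_text_examples(I2I_TEXT_PATH)]
    images = [row[0] for row in _load_media_examples("i2i_images", _IMAGE_SUFFIXES)]
    return [[img, txt] for img, txt in zip(images, prompts)]

I2I_EXAMPLES = _load_i2i_examples()

# Then, inside the I2I tab, mirror the other tabs:
#     if I2I_EXAMPLES:
#         gr.Markdown("**Sample edits**")
#         gr.Examples(
#             examples=I2I_EXAMPLES,
#             inputs=[i2i_image_in, i2i_instr],
#             examples_per_page=4,
#         )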