Spaces:

AIDAS-Lab
/

Dynin-Omni

Running on Zero

App Files Files Community

jaeikkim committed on Feb 25

Commit

f9c9d68

1 Parent(s): 19f1a97

Dynin-Omni

Browse files

Files changed (4) hide show

.gradio/certificate.pem +31 -0
MMaDA/inference/common.py +12 -0
MMaDA/inference/gradio_multimodal_demo_inst.py +8 -6
app.py +125 -35

.gradio/certificate.pem ADDED Viewed

	@@ -0,0 +1,31 @@

+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----

MMaDA/inference/common.py CHANGED Viewed

@@ -42,9 +42,21 @@ def get_vq_model_audio(cfg, device):
 def build_uni_prompting(cfg) -> Tuple[UniversalPrompting, AutoTokenizer]:
     tokenizer = AutoTokenizer.from_pretrained(cfg.model.omada.tokenizer_path, padding_side="left")
     uni_prompting = UniversalPrompting(
         tokenizer,
         max_text_len=cfg.dataset.preprocessing.max_seq_length,
         max_audio_len=cfg.dataset.preprocessing.max_aud_length,
         special_tokens=(
             "<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>",

 def build_uni_prompting(cfg) -> Tuple[UniversalPrompting, AutoTokenizer]:
     tokenizer = AutoTokenizer.from_pretrained(cfg.model.omada.tokenizer_path, padding_side="left")
+    dataset_cfg = getattr(cfg, "dataset", None)
+    dataset_params = getattr(dataset_cfg, "params", None) if dataset_cfg else None
+    preproc_cfg = getattr(dataset_cfg, "preprocessing", None) if dataset_cfg else None
+    # MMU image tokens at 480 resolution are typically ~900 tokens (with patch size 16),
+    # so 512 can silently drop the whole image in mmu_mult_prompt.
+    max_image_len = int(
+        getattr(dataset_params, "max_image_len", 0)
+        or getattr(preproc_cfg, "max_image_len", 0)
+        or getattr(preproc_cfg, "max_seq_length_image", 0)
+        or 1024
+    )
     uni_prompting = UniversalPrompting(
         tokenizer,
         max_text_len=cfg.dataset.preprocessing.max_seq_length,
+        max_image_len=max_image_len,
         max_audio_len=cfg.dataset.preprocessing.max_aud_length,
         special_tokens=(
             "<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>",

MMaDA/inference/gradio_multimodal_demo_inst.py CHANGED Viewed

@@ -948,15 +948,15 @@ class OmadaDemo:
                 "temperature": 0.0,
             },
             "i2i": {
-                "timesteps": 64,
                 "guidance_scale": 2.5,
                 "temperature": 0.0,
             },
             # Match defaults used in inference scripts for eval parity.
             "t2s": {
-                "steps": 128,
-                "block_length": 128,
-                "max_new_tokens": int(self.max_audio_len_short),
                 "temperature": 0.0,
                 "guidance_scale": float(_cfg_get(training_cfg, "guidance_scale", 3.5)),
             },
@@ -2639,7 +2639,7 @@ class OmadaDemo:
         if not encoded_images:
             return "", "Failed to encode the provided image(s)."
-        question = (question or "").strip() or "Describe the visual content."
         if "<|start_header_id|>" in question:
             prompt = question
         else:
@@ -2706,7 +2706,9 @@ class OmadaDemo:
         ).strip()
         print(
             f"[MMU] input_prompt_len={input_prompt_len} output_len={int(output_ids.shape[1])} "
-            f"gen_len={int(gen_slice.numel())} first_ids={gen_slice[:16].detach().cpu().tolist()}",
             flush=True,
         )
         try:

                 "temperature": 0.0,
             },
             "i2i": {
+                "timesteps": 32,
                 "guidance_scale": 2.5,
                 "temperature": 0.0,
             },
             # Match defaults used in inference scripts for eval parity.
             "t2s": {
+                "steps": 256,
+                "block_length": 256,
+                "max_new_tokens": 512,
                 "temperature": 0.0,
                 "guidance_scale": float(_cfg_get(training_cfg, "guidance_scale", 3.5)),
             },
         if not encoded_images:
             return "", "Failed to encode the provided image(s)."
+        question = (question or "").strip() or "Describe the given image in detail."
         if "<|start_header_id|>" in question:
             prompt = question
         else:
         ).strip()
         print(
             f"[MMU] input_prompt_len={input_prompt_len} output_len={int(output_ids.shape[1])} "
+            f"gen_len={int(gen_slice.numel())} image_tok_len={int(encoded_images[0].numel()) if encoded_images else -1} "
+            f"max_image_len={int(getattr(self.uni_prompting, 'max_image_len', -1))} "
+            f"first_ids={gen_slice[:16].detach().cpu().tolist()}",
             flush=True,
         )
         try:

app.py CHANGED Viewed

@@ -104,7 +104,7 @@ from inference.gradio_multimodal_demo_inst import (  # noqa: E402
 def download_assets() -> Path:
     """Download demo assets (logo + sample prompts/media) and return the root path."""
-    repo_id = os.getenv("ASSET_REPO_ID", "jaeikkim/AIDAS-Omni-Modal-Diffusion-assets")
     revision = os.getenv("ASSET_REVISION", "main")
     token = os.getenv("HF_TOKEN")
     cache_dir = PROJECT_ROOT / "_asset_cache"
@@ -247,8 +247,10 @@ def _load_i2i_examples():
     )
     n = min(len(image_files), len(text_files))
     examples = []
-    for i in range(2):
         img_path = image_files[i]
         txt_path = text_files[i]
         instruction = txt_path.read_text(encoding="utf-8").strip()
@@ -373,14 +375,16 @@ def _render_image_message(status: str, image: Image.Image):
         return _render_response(status)
     encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
-    image_html = (
-        "<div class='omada-response-block'>"
         "<img src='data:image/png;base64,"
         f"{encoded}"
-        "' alt='Generated image' style='max-width:100%;border-radius:12px;' />"
         "</div>"
     )
-    return _render_response(status, image_html)
 def _render_user_message(mode: str, message: str, image_in, audio_in, video_in, defer_video: bool = False) -> str:
@@ -588,6 +592,7 @@ V2T_EXAMPLES = _load_media_examples("v2t", {".mp4", ".mov", ".avi", ".webm"})
 # MMU images
 MMU_DIR = ASSET_ROOT / "mmu"
 MMU_EXAMPLES: List[List[str]] = []
 if MMU_DIR.exists():
     for path in sorted(
         [
@@ -598,7 +603,7 @@ if MMU_DIR.exists():
     ):
         MMU_EXAMPLES.append([
             str(path),
-            "Describe the important objects and their relationships in this image.",
         ])
@@ -1203,6 +1208,41 @@ html, body, .gradio-container {
   background: rgba(255, 255, 255, 0.50) !important;
   color: #1f2937 !important;
 }
 .omada-chip button {
   color: #273247 !important;
 }
@@ -1619,7 +1659,7 @@ with gr.Blocks(**_blocks_kwargs) as demo:
             {
                 "label": "🖼️ Image QA",
                 "mode": "MMU (Image → Text)",
-                "text": MMU_EXAMPLES[0][1] if MMU_EXAMPLES else "Describe the important objects and their relationships in this image.",
                 "image": MMU_EXAMPLES[0][0] if MMU_EXAMPLES else None,
                 "audio": None,
                 "video": None,
@@ -1630,7 +1670,7 @@ with gr.Blocks(**_blocks_kwargs) as demo:
                 "text": "",
                 "image": None,
                 "audio": None,
-                "video": V2T_EXAMPLES[0][0] if V2T_EXAMPLES else None,
             },
             {
                 "label": "🎨 Image Generation",
@@ -1720,9 +1760,9 @@ with gr.Blocks(**_blocks_kwargs) as demo:
         adv_t2s = gr.Column(visible=False)
         with adv_t2s:
-            t2s_max_tokens = gr.Slider(2, 512, value=384, step=2, label="Speech token length", interactive=True)
-            t2s_steps = gr.Slider(2, 512, value=128, step=2, label="T2S refinement steps", interactive=True)
-            t2s_block = gr.Slider(2, 512, value=128, step=2, label="T2S block length", interactive=True)
             t2s_temperature = gr.Slider(0.0, 2.0, value=0.0, step=0.05, label="T2S temperature", interactive=True)
             t2s_cfg = gr.Slider(0.0, 6.0, value=3.5, step=0.1, label="T2S CFG scale", interactive=True)
             t2s_gender = gr.Dropdown(["random", "female", "male"], value="random", label="T2S gender", interactive=True)
@@ -1751,7 +1791,7 @@ with gr.Blocks(**_blocks_kwargs) as demo:
         adv_i2i = gr.Column(visible=False)
         with adv_i2i:
-            i2i_timesteps = gr.Slider(4, 128, value=64, step=2, label="I2I timesteps", interactive=True)
             i2i_temperature = gr.Slider(0.0, 2.0, value=0.0, step=0.05, label="I2I temperature", interactive=True)
             i2i_guidance = gr.Slider(0.0, 8.0, value=2.5, step=0.1, label="I2I CFG scale", interactive=True)
@@ -1885,6 +1925,41 @@ def _format_user_message(msg: str) -> str:
     return msg.strip() if msg else " "
 def _is_identity_query(message: str) -> bool:
     q = re.sub(r"[^a-z0-9\s]", " ", (message or "").lower())
     q = re.sub(r"\s+", " ", q).strip()
@@ -1942,12 +2017,12 @@ def _chat_handler(
     mmu_temperature,
 ):
     _set_global_seed()
-    history = history or []
     message = (message or "").strip()
     defer_video = mode == "MMU (Video → Text)" and bool(video_in)
     display_user = _render_user_message(mode, message, image_in, audio_in, video_in, defer_video=defer_video)
-    history.append((display_user, _render_text_message("Generating...", "")))
-    yield history, ""
     if mode == "Chat" and _is_identity_query(message):
         fixed = (
@@ -1955,15 +2030,17 @@ def _chat_handler(
             "I can understand and generate text, images, speech, and video within a single architecture."
         )
         history[-1] = (display_user, _render_text_message("Assistant reply generated.", fixed))
-        yield history, ""
         return
     if defer_video:
         display_user = _render_user_message(mode, message, image_in, audio_in, video_in, defer_video=False)
         history[-1] = (display_user, history[-1][1])
-        yield history, ""
     app = get_app()
     # Respect UI mode: Auto uses eval-matched defaults, Custom uses UI values.
     app.force_eval_settings = str(auto_mode).strip().lower() == "auto"
@@ -1978,13 +2055,13 @@ def _chat_handler(
         ):
             response = _render_response(status, reply_html)
             history[-1] = (display_user, response)
-            yield history, ""
         return
     if mode == "TTS":
         if not message:
             history[-1] = (display_user, _render_text_message("Please type some text.", ""))
-            yield history, ""
             return
         audio, status = app.run_t2s(
             message,
@@ -1999,13 +2076,13 @@ def _chat_handler(
             t2s_pitch,
         )
         history[-1] = (display_user, _render_audio_message(status, audio))
-        yield history, ""
         return
     if mode == "ASR":
         if not audio_in:
             history[-1] = (display_user, _render_text_message("Please upload audio.", ""))
-            yield history, ""
             return
         for text, status in app.run_s2t_stream(
             audio_in,
@@ -2016,13 +2093,13 @@ def _chat_handler(
             update_every=32,
         ):
             history[-1] = (display_user, _render_text_message(status, text))
-            yield history, ""
         return
     if mode == "MMU (Video → Text)":
         if not video_in:
             history[-1] = (display_user, _render_text_message("Please upload a video.", ""))
-            yield history, ""
             return
         for text, status in app.run_v2t_stream(
             video_in,
@@ -2032,13 +2109,13 @@ def _chat_handler(
             update_every=32,
         ):
             history[-1] = (display_user, _render_text_message(status, text))
-            yield history, ""
         return
     if mode == "Image Generation":
         if not message:
             history[-1] = (display_user, _render_text_message("Please provide a prompt.", ""))
-            yield history, ""
             return
         for image, status in app.run_t2i_stream(
             message,
@@ -2048,17 +2125,17 @@ def _chat_handler(
             update_every=2,
         ):
             history[-1] = (display_user, _render_image_message(status, image))
-            yield history, ""
         return
     if mode == "Image Editing":
         if not image_in:
             history[-1] = (display_user, _render_text_message("Please upload an image.", ""))
-            yield history, ""
             return
         if not message:
             history[-1] = (display_user, _render_text_message("Please provide an edit instruction.", ""))
-            yield history, ""
             return
         for image, status in app.run_i2i_stream(
             message,
@@ -2069,14 +2146,22 @@ def _chat_handler(
             update_every=2,
         ):
             history[-1] = (display_user, _render_image_message(status, image))
-            yield history, ""
         return
     if mode == "MMU (Image → Text)":
         if not image_in:
             history[-1] = (display_user, _render_text_message("Please upload an image.", ""))
-            yield history, ""
             return
         reply, status = app.run_mmu(
             images=[image_in],
             message=message,
@@ -2086,11 +2171,11 @@ def _chat_handler(
             temperature=mmu_temperature,
         )
         history[-1] = (display_user, _render_text_message(status, reply))
-        yield history, ""
         return
     history[-1] = (display_user, _render_text_message("Unsupported mode.", ""))
-    yield history, ""
 with demo:
     def _hide_intro():
@@ -2178,14 +2263,19 @@ with demo:
 if __name__ == "__main__":
     _launch_kwargs = {
-        "allowed_paths": [str(PREVIEW_DIR), "/tmp"],
     }
     if GRADIO_V6_PLUS:
         _launch_kwargs.update(
             {
                 "css": CUSTOM_CSS + EXTRA_CSS,
                 "theme": theme,
-                "js": FORCE_LIGHT_MODE_JS,
             }
         )
     demo.launch(**_launch_kwargs)

 def download_assets() -> Path:
     """Download demo assets (logo + sample prompts/media) and return the root path."""
+    repo_id = os.getenv("ASSET_REPO_ID", "jaeikkim/Dynin-Omni-Demo-Assets")
     revision = os.getenv("ASSET_REVISION", "main")
     token = os.getenv("HF_TOKEN")
     cache_dir = PROJECT_ROOT / "_asset_cache"
     )
     n = min(len(image_files), len(text_files))
+    if n == 0:
+        return []
     examples = []
+    for i in range(n):
         img_path = image_files[i]
         txt_path = text_files[i]
         instruction = txt_path.read_text(encoding="utf-8").strip()
         return _render_response(status)
     encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
+    safe_status = html.escape(status or "")
+    return (
+        "<div class='omada-image-only'>"
+        f"<p class='omada-image-status'>{safe_status}</p>"
         "<img src='data:image/png;base64,"
         f"{encoded}"
+        "' alt='Generated image' style='display:block;width:auto;height:auto;max-width:min(100%,720px);"
+        "border-radius:0;image-rendering:crisp-edges;image-rendering:-webkit-optimize-contrast;filter:none;opacity:1;' />"
         "</div>"
     )
 def _render_user_message(mode: str, message: str, image_in, audio_in, video_in, defer_video: bool = False) -> str:
 # MMU images
 MMU_DIR = ASSET_ROOT / "mmu"
 MMU_EXAMPLES: List[List[str]] = []
+DEFAULT_MMU_PROMPT = "Describe the given image in detail."
 if MMU_DIR.exists():
     for path in sorted(
         [
     ):
         MMU_EXAMPLES.append([
             str(path),
+            DEFAULT_MMU_PROMPT,
         ])
   background: rgba(255, 255, 255, 0.50) !important;
   color: #1f2937 !important;
 }
+/* Keep generated images crisp (no frosted overlay on image replies) */
+.gradio-chatbot .message {
+  backdrop-filter: none !important;
+  -webkit-backdrop-filter: none !important;
+}
+.gradio-chatbot .message.bot:has(.omada-image-only) {
+  background: transparent !important;
+  border: none !important;
+  box-shadow: none !important;
+  padding: 0 !important;
+  margin: 0 !important;
+}
+.omada-image-only {
+  display: inline-block;
+  background: transparent !important;
+  border: 0 !important;
+  box-shadow: none !important;
+  padding: 0 !important;
+  margin: 0 !important;
+  opacity: 1 !important;
+  filter: none !important;
+}
+.gradio-chatbot .message.bot:has(.omada-image-only) *,
+.omada-image-only * {
+  background: transparent !important;
+  box-shadow: none !important;
+  filter: none !important;
+  opacity: 1 !important;
+}
+.omada-image-status {
+  margin: 0 0 6px 0 !important;
+  font-size: 0.85rem !important;
+  color: #42526b !important;
+  font-weight: 600 !important;
+}
 .omada-chip button {
   color: #273247 !important;
 }
             {
                 "label": "🖼️ Image QA",
                 "mode": "MMU (Image → Text)",
+                "text": MMU_EXAMPLES[0][1] if MMU_EXAMPLES else DEFAULT_MMU_PROMPT,
                 "image": MMU_EXAMPLES[0][0] if MMU_EXAMPLES else None,
                 "audio": None,
                 "video": None,
                 "text": "",
                 "image": None,
                 "audio": None,
+                "video": V2T_EXAMPLES[-1][0] if V2T_EXAMPLES else None,
             },
             {
                 "label": "🎨 Image Generation",
         adv_t2s = gr.Column(visible=False)
         with adv_t2s:
+            t2s_max_tokens = gr.Slider(2, 512, value=512, step=2, label="Speech token length", interactive=True)
+            t2s_steps = gr.Slider(2, 512, value=256, step=2, label="T2S refinement steps", interactive=True)
+            t2s_block = gr.Slider(2, 512, value=256, step=2, label="T2S block length", interactive=True)
             t2s_temperature = gr.Slider(0.0, 2.0, value=0.0, step=0.05, label="T2S temperature", interactive=True)
             t2s_cfg = gr.Slider(0.0, 6.0, value=3.5, step=0.1, label="T2S CFG scale", interactive=True)
             t2s_gender = gr.Dropdown(["random", "female", "male"], value="random", label="T2S gender", interactive=True)
         adv_i2i = gr.Column(visible=False)
         with adv_i2i:
+            i2i_timesteps = gr.Slider(4, 128, value=32, step=2, label="I2I timesteps", interactive=True)
             i2i_temperature = gr.Slider(0.0, 2.0, value=0.0, step=0.05, label="I2I temperature", interactive=True)
             i2i_guidance = gr.Slider(0.0, 8.0, value=2.5, step=0.1, label="I2I CFG scale", interactive=True)
     return msg.strip() if msg else " "
+def _normalize_chat_history(history):
+    if not history:
+        return []
+    if isinstance(history, list) and history and isinstance(history[0], dict):
+        pairs = []
+        pending_user = None
+        for msg in history:
+            role = msg.get("role")
+            content = msg.get("content", "")
+            if role == "user":
+                if pending_user is not None:
+                    pairs.append((pending_user, ""))
+                pending_user = content
+            elif role == "assistant":
+                if pending_user is None:
+                    pairs.append((" ", content))
+                else:
+                    pairs.append((pending_user, content))
+                    pending_user = None
+        if pending_user is not None:
+            pairs.append((pending_user, ""))
+        return pairs
+    return list(history)
+def _serialize_chat_history(pairs):
+    if not GRADIO_V6_PLUS:
+        return pairs
+    messages = []
+    for user_msg, assistant_msg in pairs:
+        messages.append({"role": "user", "content": user_msg if user_msg is not None else " "})
+        messages.append({"role": "assistant", "content": assistant_msg if assistant_msg is not None else ""})
+    return messages
 def _is_identity_query(message: str) -> bool:
     q = re.sub(r"[^a-z0-9\s]", " ", (message or "").lower())
     q = re.sub(r"\s+", " ", q).strip()
     mmu_temperature,
 ):
     _set_global_seed()
+    history = _normalize_chat_history(history)
     message = (message or "").strip()
     defer_video = mode == "MMU (Video → Text)" and bool(video_in)
     display_user = _render_user_message(mode, message, image_in, audio_in, video_in, defer_video=defer_video)
+    history.append((display_user, _render_text_message("Model loading...", "")))
+    yield _serialize_chat_history(history), ""
     if mode == "Chat" and _is_identity_query(message):
         fixed = (
             "I can understand and generate text, images, speech, and video within a single architecture."
         )
         history[-1] = (display_user, _render_text_message("Assistant reply generated.", fixed))
+        yield _serialize_chat_history(history), ""
         return
     if defer_video:
         display_user = _render_user_message(mode, message, image_in, audio_in, video_in, defer_video=False)
         history[-1] = (display_user, history[-1][1])
+        yield _serialize_chat_history(history), ""
     app = get_app()
+    history[-1] = (display_user, _render_text_message("Generating...", ""))
+    yield _serialize_chat_history(history), ""
     # Respect UI mode: Auto uses eval-matched defaults, Custom uses UI values.
     app.force_eval_settings = str(auto_mode).strip().lower() == "auto"
         ):
             response = _render_response(status, reply_html)
             history[-1] = (display_user, response)
+            yield _serialize_chat_history(history), ""
         return
     if mode == "TTS":
         if not message:
             history[-1] = (display_user, _render_text_message("Please type some text.", ""))
+            yield _serialize_chat_history(history), ""
             return
         audio, status = app.run_t2s(
             message,
             t2s_pitch,
         )
         history[-1] = (display_user, _render_audio_message(status, audio))
+        yield _serialize_chat_history(history), ""
         return
     if mode == "ASR":
         if not audio_in:
             history[-1] = (display_user, _render_text_message("Please upload audio.", ""))
+            yield _serialize_chat_history(history), ""
             return
         for text, status in app.run_s2t_stream(
             audio_in,
             update_every=32,
         ):
             history[-1] = (display_user, _render_text_message(status, text))
+            yield _serialize_chat_history(history), ""
         return
     if mode == "MMU (Video → Text)":
         if not video_in:
             history[-1] = (display_user, _render_text_message("Please upload a video.", ""))
+            yield _serialize_chat_history(history), ""
             return
         for text, status in app.run_v2t_stream(
             video_in,
             update_every=32,
         ):
             history[-1] = (display_user, _render_text_message(status, text))
+            yield _serialize_chat_history(history), ""
         return
     if mode == "Image Generation":
         if not message:
             history[-1] = (display_user, _render_text_message("Please provide a prompt.", ""))
+            yield _serialize_chat_history(history), ""
             return
         for image, status in app.run_t2i_stream(
             message,
             update_every=2,
         ):
             history[-1] = (display_user, _render_image_message(status, image))
+            yield _serialize_chat_history(history), ""
         return
     if mode == "Image Editing":
         if not image_in:
             history[-1] = (display_user, _render_text_message("Please upload an image.", ""))
+            yield _serialize_chat_history(history), ""
             return
         if not message:
             history[-1] = (display_user, _render_text_message("Please provide an edit instruction.", ""))
+            yield _serialize_chat_history(history), ""
             return
         for image, status in app.run_i2i_stream(
             message,
             update_every=2,
         ):
             history[-1] = (display_user, _render_image_message(status, image))
+            yield _serialize_chat_history(history), ""
         return
     if mode == "MMU (Image → Text)":
         if not image_in:
             history[-1] = (display_user, _render_text_message("Please upload an image.", ""))
+            yield _serialize_chat_history(history), ""
             return
+        # Keep MMU QA consistent with chat mask-pill UX.
+        try:
+            mmu_mask_count = max(16, min(int(mmu_max_tokens or 128), 256))
+        except Exception:
+            mmu_mask_count = 128
+        mmu_mask_surface = " ".join(["<mdm_mask>"] * mmu_mask_count)
+        history[-1] = (display_user, _render_text_message("Generating...", mmu_mask_surface))
+        yield _serialize_chat_history(history), ""
         reply, status = app.run_mmu(
             images=[image_in],
             message=message,
             temperature=mmu_temperature,
         )
         history[-1] = (display_user, _render_text_message(status, reply))
+        yield _serialize_chat_history(history), ""
         return
     history[-1] = (display_user, _render_text_message("Unsupported mode.", ""))
+    yield _serialize_chat_history(history), ""
 with demo:
     def _hide_intro():
 if __name__ == "__main__":
     _launch_kwargs = {
+        "allowed_paths": [
+            str(PREVIEW_DIR),
+            str(PROJECT_ROOT),
+            str(ASSET_ROOT),
+            "/tmp",
+        ],
     }
     if GRADIO_V6_PLUS:
         _launch_kwargs.update(
             {
                 "css": CUSTOM_CSS + EXTRA_CSS,
                 "theme": theme,
+                "js": FORCE_LIGHT_MODE_JS
             }
         )
     demo.launch(**_launch_kwargs)