jaeikkim commited on
Commit
7b822c3
·
1 Parent(s): afc1fa2

Dynin-Omni

Browse files
.gitignore CHANGED
@@ -1,3 +1,6 @@
1
  __pycache__/
2
  *.pyc
3
  MMaDA/inference/demo/ti2ti/
 
 
 
 
1
  __pycache__/
2
  *.pyc
3
  MMaDA/inference/demo/ti2ti/
4
+ _asset_cache/
5
+ _preview_cache/
6
+ _style_cache/
EMOVA_speech_tokenizer/emova_speech_tokenizer/speech_tokenization/condition_style_centroid ADDED
@@ -0,0 +1 @@
 
 
1
+ /dataset/omada/AIDAS-Omni-Modal-Diffusion/_style_cache
MMaDA/inference/gradio_multimodal_demo_inst.py CHANGED
@@ -25,6 +25,7 @@ import io
25
  import os
26
  import math
27
  import random
 
28
  import sys
29
  import tempfile
30
  import wave
@@ -207,6 +208,14 @@ html, body, body.dark, html.dark {
207
  box-shadow: none;
208
  border: 1px solid var(--omada-border);
209
  background: #ffffff;
 
 
 
 
 
 
 
 
210
  }
211
  .omada-controls {
212
  gap: 16px !important;
@@ -724,7 +733,7 @@ import cv2
724
  import gradio as gr
725
  import numpy as np
726
  import torch
727
- from omegaconf import DictConfig, OmegaConf
728
  from PIL import Image
729
 
730
  from inference.common import (
@@ -745,6 +754,12 @@ def _cfg_get(cfg, key, default=None):
745
 
746
  if cfg is None:
747
  return default
 
 
 
 
 
 
748
  if isinstance(cfg, dict):
749
  return cfg.get(key, default)
750
  try:
@@ -897,6 +912,14 @@ class OmadaDemo:
897
  )
898
  )
899
  self.max_text_len = int(getattr(self.train_cfg.dataset.preprocessing, "max_seq_length", 1024))
 
 
 
 
 
 
 
 
900
 
901
  model_seq_len = getattr(self.model.config, "num_vq_tokens", None)
902
  if model_seq_len is None:
@@ -913,7 +936,55 @@ class OmadaDemo:
913
  self.noise_type = _cfg_get(training_cfg, "noise_type", "mask")
914
  self.predict_all_tokens = bool(_cfg_get(training_cfg, "predict_all_tokens", False))
915
  self.t2i_default_timesteps = int(_cfg_get(training_cfg, "generation_timesteps", 20))
916
- self.i2i_default_timesteps = int(_cfg_get(training_cfg, "i2i_eval_timesteps", 24))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
917
 
918
  self.audio_condition_default = "gender-female_emotion-neutral_speed-normal_pitch-normal"
919
  style_map = getattr(getattr(self.vq_audio, "config", None), "u2s_style2idx", None)
@@ -942,6 +1013,13 @@ class OmadaDemo:
942
  speed_choice: str,
943
  pitch_choice: str,
944
  ) -> Tuple[Optional[Tuple[int, np.ndarray]], str]:
 
 
 
 
 
 
 
945
 
946
  if text is None or not text.strip():
947
  return None, "Please provide text to synthesize."
@@ -1013,6 +1091,110 @@ class OmadaDemo:
1013
  status = f"Speech generated! ({gender}/{emotion}/{speed}/{pitch})."
1014
  return audio, status
1015
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1016
  # ------------------------------------------------------------------
1017
  # Speech-to-Speech
1018
  # ------------------------------------------------------------------
@@ -1134,10 +1316,20 @@ class OmadaDemo:
1134
  max_new_tokens: int,
1135
  remasking: str,
1136
  ) -> Tuple[str, str]:
 
 
 
 
 
 
1137
 
1138
  if not audio_path:
1139
  return "", "Please upload an audio file first."
1140
 
 
 
 
 
1141
  tokens = self.vq_audio.encode(audio_path).to(self.device)
1142
  offset = self.text_vocab_size + self.speech_codebook
1143
  tokens = tokens + offset
@@ -1175,13 +1367,93 @@ class OmadaDemo:
1175
  remasking=str(remasking),
1176
  )
1177
 
1178
- decoded = self.uni_prompting.text_tokenizer.batch_decode(
1179
- output_ids[:, input_ids.shape[1]:],
1180
- skip_special_tokens=True,
1181
- )[0]
1182
-
 
 
1183
  return decoded.strip(), "Transcription generated successfully."
1184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1185
  # ------------------------------------------------------------------
1186
  # Video-to-Text
1187
  # ------------------------------------------------------------------
@@ -1192,6 +1464,11 @@ class OmadaDemo:
1192
  block_length: int,
1193
  max_new_tokens: int,
1194
  ) -> Tuple[str, str]:
 
 
 
 
 
1195
 
1196
  resolved_path, converted = self._prepare_video_path(video_path)
1197
  if not resolved_path:
@@ -1241,14 +1518,80 @@ class OmadaDemo:
1241
  raw_all = self.uni_prompting.text_tokenizer.decode(output_ids[0], skip_special_tokens=False)
1242
  print("[V2T] RAW ALL:", repr(raw_all))
1243
 
1244
- decoded = self.uni_prompting.text_tokenizer.batch_decode(
1245
- output_ids[:, input_ids.shape[1]:],
1246
- skip_special_tokens=True,
1247
- )[0]
1248
  print("[V2T] DECODED SLICE:", repr(decoded))
1249
-
 
 
1250
  return decoded.strip(), "Video caption generated successfully."
1251
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1252
  # ------------------------------------------------------------------
1253
  # Text-to-Image
1254
  # ------------------------------------------------------------------
@@ -1259,6 +1602,11 @@ class OmadaDemo:
1259
  temperature: float,
1260
  guidance_scale: float,
1261
  ) -> Tuple[Optional[Image.Image], str]:
 
 
 
 
 
1262
  if not prompt or not prompt.strip():
1263
  return None, "Please provide a text prompt."
1264
 
@@ -1307,6 +1655,65 @@ class OmadaDemo:
1307
  image = self._decode_image_tokens(gen_tokens[0])
1308
  return image, "Image generated from text prompt."
1309
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1310
  # ------------------------------------------------------------------
1311
  # Image-to-Image Editing
1312
  # ------------------------------------------------------------------
@@ -1318,6 +1725,11 @@ class OmadaDemo:
1318
  temperature: float,
1319
  guidance_scale: float,
1320
  ) -> Tuple[Optional[Image.Image], str]:
 
 
 
 
 
1321
  if source_image is None:
1322
  return None, "Please upload a reference image."
1323
  if not instruction or not instruction.strip():
@@ -1378,6 +1790,80 @@ class OmadaDemo:
1378
  image = self._decode_image_tokens(gen_tokens[0])
1379
  return image, "Edited image generated."
1380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1381
  # ------------------------------------------------------------------
1382
  # Video-to-Speech
1383
  # ------------------------------------------------------------------
@@ -1550,6 +2036,12 @@ class OmadaDemo:
1550
  block_length: int,
1551
  temperature: float,
1552
  ) -> Tuple[str, str]:
 
 
 
 
 
 
1553
  content = (message or "").strip()
1554
  if not content:
1555
  return "", "Type a message to start chatting."
@@ -1592,10 +2084,10 @@ class OmadaDemo:
1592
  else:
1593
  output_ids, step_snapshots = output_result, []
1594
 
1595
- decoded = tokenizer.batch_decode(
1596
- output_ids[:, input_ids.shape[1]:],
1597
- skip_special_tokens=True,
1598
- )[0]
1599
  return decoded.strip(), "Assistant reply generated."
1600
 
1601
  def run_chat_stream(
@@ -1609,6 +2101,12 @@ class OmadaDemo:
1609
  max_tokens_per_step: int = 0,
1610
  update_every: int = 25,
1611
  ):
 
 
 
 
 
 
1612
  content = (message or "").strip()
1613
  if not content:
1614
  yield "", "Type a message to start chatting.", True
@@ -1655,12 +2153,12 @@ class OmadaDemo:
1655
  if len(step_snapshots) > max_step_snapshots:
1656
  step_snapshots = step_snapshots[-max_step_snapshots:]
1657
  step_counter += 1
 
 
1658
  if update_every > 1 and step_counter % update_every != 0:
1659
  continue
1660
- decoded = tokenizer.batch_decode(
1661
- snapshot[:, prompt_len:],
1662
- skip_special_tokens=True,
1663
- )[0].strip()
1664
  steps_html = self._render_diffusion_steps(
1665
  step_snapshots,
1666
  max_tokens_per_step=max_tokens_per_step,
@@ -1671,10 +2169,11 @@ class OmadaDemo:
1671
  yield "", "Assistant reply generated.", True
1672
  return
1673
 
1674
- decoded = tokenizer.batch_decode(
1675
- latest_ids[:, input_ids.shape[1]:],
1676
- skip_special_tokens=True,
1677
- )[0].strip()
 
1678
  step_snapshots = [latest_ids[0, input_ids.shape[1]:].detach().cpu()]
1679
  steps_html = self._render_diffusion_steps(
1680
  step_snapshots,
@@ -1682,6 +2181,36 @@ class OmadaDemo:
1682
  )
1683
  yield self._format_chat_output(decoded, steps_html), "Assistant reply generated.", True
1684
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1685
  # ------------------------------------------------------------------
1686
  # General MMU (N Images → Text)
1687
  # ------------------------------------------------------------------
@@ -1694,6 +2223,12 @@ class OmadaDemo:
1694
  block_length: int,
1695
  temperature: float,
1696
  ) -> Tuple[str, str]:
 
 
 
 
 
 
1697
  """
1698
  MMU demo now consumes exactly one image. If callers pass a list (for
1699
  backwards compatibility), we keep only the first valid image.
@@ -1762,49 +2297,78 @@ class OmadaDemo:
1762
  )
1763
 
1764
  def _format_chat_output(self, text: str, steps_html: str = "") -> str:
1765
- """Wrap <think> blocks in a collapsible section for chat UI."""
1766
- safe_text = text or ""
1767
- start_tag = "<think>"
1768
- end_tag = "</think>"
1769
- out = []
1770
- idx = 0
1771
- injected_steps = False
1772
- while True:
1773
- start = safe_text.find(start_tag, idx)
1774
- if start == -1:
1775
- tail = safe_text[idx:]
1776
- if tail:
1777
- out.append(html.escape(tail).replace("\n", "<br>"))
1778
- break
1779
- prefix = safe_text[idx:start]
 
 
 
 
 
 
 
1780
  if prefix:
1781
- out.append(html.escape(prefix).replace("\n", "<br>"))
1782
- end = safe_text.find(end_tag, start + len(start_tag))
1783
- if end == -1:
1784
- out.append(html.escape(safe_text[start:]).replace("\n", "<br>"))
1785
- break
1786
- think_body = safe_text[start + len(start_tag):end].strip()
1787
- think_block = html.escape(think_body).replace("\n", "<br>")
1788
- if steps_html and not injected_steps:
1789
- think_block = f"{think_block}{steps_html}"
1790
- injected_steps = True
1791
- out.append(
1792
- "\n<details><summary>Show think</summary>\n\n"
1793
- f"{think_block}\n"
1794
- "</details>\n"
1795
- )
1796
- idx = end + len(end_tag)
1797
- if steps_html and not injected_steps:
1798
- out.append(
1799
- "\n<details><summary>Show think</summary>\n\n"
1800
- f"{steps_html}\n"
1801
- "</details>\n"
1802
- )
1803
- body = "".join(out).strip()
1804
  if not body:
1805
  return ""
1806
  return f"<div class='omada-response-block'>{body}</div>"
1807
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1808
  def _render_diffusion_steps(
1809
  self,
1810
  step_snapshots: List[torch.Tensor],
@@ -2096,6 +2660,10 @@ class OmadaDemo:
2096
  mmu_input_ids = mmu_input_ids.to(self.device)
2097
  prompt_masks = prompt_masks.to(self.device)
2098
 
 
 
 
 
2099
  answer_tokens = int((prompt_masks == 0).sum(dim=1).max().item())
2100
  default_budget = max(1, answer_tokens) if answer_tokens > 0 else min(self.max_text_len, 256)
2101
  gen_tokens = int(max_new_tokens or default_budget)
@@ -2108,14 +2676,7 @@ class OmadaDemo:
2108
  )
2109
  temperature = float(temperature if temperature is not None else 0.7)
2110
 
2111
- if gen_tokens > 0:
2112
- mask_block = torch.full(
2113
- (mmu_input_ids.size(0), gen_tokens),
2114
- self.mask_token_id,
2115
- dtype=torch.long,
2116
- device=self.device,
2117
- )
2118
- mmu_input_ids = torch.cat([mmu_input_ids, mask_block], dim=1)
2119
 
2120
  with torch.no_grad():
2121
  output_ids = self.model.mmu_generate(
@@ -2128,14 +2689,57 @@ class OmadaDemo:
2128
  mask_id=self.mask_token_id,
2129
  )
2130
 
2131
- decoded = self.uni_prompting.text_tokenizer.batch_decode(
2132
- output_ids[:, mmu_input_ids.shape[1]:],
2133
- skip_special_tokens=True,
2134
- )[0].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
2135
  if not decoded:
2136
  return "", "MMU response was empty."
2137
  return decoded, "Image understanding succeeded."
2138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2139
  def _generate_text_tokens(
2140
  self,
2141
  prompt_ids: torch.Tensor,
@@ -2213,6 +2817,13 @@ class OmadaDemo:
2213
 
2214
  transfer_index = torch.zeros_like(work, dtype=torch.bool)
2215
  for b in range(batch_size):
 
 
 
 
 
 
 
2216
  k = int(num_transfer_tokens[b, inner_step].item())
2217
  if k <= 0:
2218
  continue
@@ -2223,6 +2834,15 @@ class OmadaDemo:
2223
  if return_steps and batch_size > 0:
2224
  step_snapshots.append(work[0, prompt_len:].detach().cpu())
2225
 
 
 
 
 
 
 
 
 
 
2226
  if return_steps:
2227
  return work, step_snapshots
2228
  return work
@@ -2303,6 +2923,13 @@ class OmadaDemo:
2303
 
2304
  transfer_index = torch.zeros_like(work, dtype=torch.bool)
2305
  for b in range(batch_size):
 
 
 
 
 
 
 
2306
  k = int(num_transfer_tokens[b, inner_step].item())
2307
  if k <= 0:
2308
  continue
@@ -2312,6 +2939,14 @@ class OmadaDemo:
2312
  work[transfer_index] = x0[transfer_index]
2313
  yield work.clone(), prompt_len
2314
 
 
 
 
 
 
 
 
 
2315
  def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optional[int]):
2316
  theme = gr.themes.Soft(primary_hue="blue", neutral_hue="gray")
2317
  with gr.Blocks(title="AIDAS Lab @ SNU", css=CUSTOM_CSS, theme=theme, js=FORCE_LIGHT_MODE_JS) as demo:
@@ -2750,15 +3385,23 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
2750
  response = ""
2751
 
2752
  if mode == "Text":
2753
- reply, status = app.run_chat(
 
 
 
 
 
2754
  message,
2755
  chat_max_tokens,
2756
  chat_steps,
2757
  chat_block,
2758
  chat_temperature,
2759
- )
2760
- response = _render_text_message(status, reply)
2761
- display_user_raw = message or "[Text request]"
 
 
 
2762
  elif mode == "Text → Speech":
2763
  if not message:
2764
  status = "Please type some text for speech synthesis."
 
25
  import os
26
  import math
27
  import random
28
+ import re
29
  import sys
30
  import tempfile
31
  import wave
 
208
  box-shadow: none;
209
  border: 1px solid var(--omada-border);
210
  background: #ffffff;
211
+ overflow-y: auto !important;
212
+ }
213
+ .omada-chat-column .gradio-chatbot .wrap,
214
+ .omada-chat-column .gradio-chatbot .message-wrap {
215
+ overflow-y: auto !important;
216
+ }
217
+ .omada-chat-column .gradio-chatbot .message {
218
+ overflow-wrap: anywhere;
219
  }
220
  .omada-controls {
221
  gap: 16px !important;
 
733
  import gradio as gr
734
  import numpy as np
735
  import torch
736
+ from omegaconf import DictConfig, ListConfig, OmegaConf
737
  from PIL import Image
738
 
739
  from inference.common import (
 
754
 
755
  if cfg is None:
756
  return default
757
+ if isinstance(cfg, (list, tuple, ListConfig)):
758
+ for item in cfg:
759
+ value = _cfg_get(item, key, None)
760
+ if value is not None:
761
+ return value
762
+ return default
763
  if isinstance(cfg, dict):
764
  return cfg.get(key, default)
765
  try:
 
912
  )
913
  )
914
  self.max_text_len = int(getattr(self.train_cfg.dataset.preprocessing, "max_seq_length", 1024))
915
+ self.max_seq_mmu = int(
916
+ getattr(
917
+ self.train_cfg.dataset.preprocessing,
918
+ "max_seq_length_mmu",
919
+ self.max_text_len,
920
+ )
921
+ )
922
+ self.chat_mask_surface_token = "<mdm_mask>"
923
 
924
  model_seq_len = getattr(self.model.config, "num_vq_tokens", None)
925
  if model_seq_len is None:
 
936
  self.noise_type = _cfg_get(training_cfg, "noise_type", "mask")
937
  self.predict_all_tokens = bool(_cfg_get(training_cfg, "predict_all_tokens", False))
938
  self.t2i_default_timesteps = int(_cfg_get(training_cfg, "generation_timesteps", 20))
939
+ # Align i2i defaults with eval (use generation_timesteps unless explicitly set).
940
+ self.i2i_default_timesteps = int(_cfg_get(training_cfg, "generation_timesteps", 20))
941
+
942
+ # Force demo to use eval-matched defaults unless explicitly disabled.
943
+ self.force_eval_settings = str(os.getenv("FORCE_EVAL_SETTINGS", "1")).lower() not in {"0", "false", "no"}
944
+ self.eval_defaults = {
945
+ "t2i": {
946
+ "timesteps": 16,
947
+ "guidance_scale": 2.5,
948
+ "temperature": 0.0,
949
+ },
950
+ "i2i": {
951
+ "timesteps": 64,
952
+ "guidance_scale": 2.5,
953
+ "temperature": 0.0,
954
+ },
955
+ # Match defaults used in inference scripts for eval parity.
956
+ "t2s": {
957
+ "steps": 128,
958
+ "block_length": 128,
959
+ "max_new_tokens": int(self.max_audio_len_short),
960
+ "temperature": 0.0,
961
+ "guidance_scale": float(_cfg_get(training_cfg, "guidance_scale", 3.5)),
962
+ },
963
+ "s2t": {
964
+ "steps": 128,
965
+ "block_length": 16,
966
+ "max_new_tokens": 128,
967
+ "remasking": "low_confidence",
968
+ },
969
+ "v2t": {
970
+ "steps": 256,
971
+ "block_length": 16,
972
+ "max_new_tokens": 256,
973
+ },
974
+ # LLM eval uses gen_length=steps=block_length=16
975
+ "chat": {
976
+ "steps": 512,
977
+ "block_length": 16,
978
+ "max_new_tokens": 512,
979
+ "temperature": 0.0,
980
+ },
981
+ "mmu": {
982
+ "steps": 128,
983
+ "block_length": 16,
984
+ "max_new_tokens": 128,
985
+ "temperature": 0.0,
986
+ },
987
+ }
988
 
989
  self.audio_condition_default = "gender-female_emotion-neutral_speed-normal_pitch-normal"
990
  style_map = getattr(getattr(self.vq_audio, "config", None), "u2s_style2idx", None)
 
1013
  speed_choice: str,
1014
  pitch_choice: str,
1015
  ) -> Tuple[Optional[Tuple[int, np.ndarray]], str]:
1016
+ if self.force_eval_settings:
1017
+ d = self.eval_defaults["t2s"]
1018
+ max_new_tokens = int(d["max_new_tokens"])
1019
+ steps = int(d["steps"])
1020
+ block_length = int(d["block_length"])
1021
+ temperature = float(d["temperature"])
1022
+ cfg_scale = float(d["guidance_scale"])
1023
 
1024
  if text is None or not text.strip():
1025
  return None, "Please provide text to synthesize."
 
1091
  status = f"Speech generated! ({gender}/{emotion}/{speed}/{pitch})."
1092
  return audio, status
1093
 
1094
+ def run_t2s_stream(
1095
+ self,
1096
+ text: str,
1097
+ max_new_tokens: int,
1098
+ steps: int,
1099
+ block_length: int,
1100
+ temperature: float,
1101
+ cfg_scale: float,
1102
+ gender_choice: str,
1103
+ emotion_choice: str,
1104
+ speed_choice: str,
1105
+ pitch_choice: str,
1106
+ update_every: Optional[int] = None,
1107
+ ):
1108
+ if self.force_eval_settings:
1109
+ d = self.eval_defaults["t2s"]
1110
+ max_new_tokens = int(d["max_new_tokens"])
1111
+ steps = int(d["steps"])
1112
+ block_length = int(d["block_length"])
1113
+ temperature = float(d["temperature"])
1114
+ cfg_scale = float(d["guidance_scale"])
1115
+
1116
+ if text is None or not text.strip():
1117
+ yield None, "Please provide text to synthesize."
1118
+ return
1119
+
1120
+ speech_len, steps, block_length = self._prepare_block_schedule(
1121
+ max_new_tokens,
1122
+ steps,
1123
+ block_length,
1124
+ )
1125
+
1126
+ gender = self._resolve_choice(gender_choice, self.genders)
1127
+ emotion = self._resolve_choice(emotion_choice, self.emotions)
1128
+ speed = self._resolve_choice(speed_choice, self.speeds)
1129
+ pitch = self._resolve_choice(pitch_choice, self.pitches)
1130
+
1131
+ text = text.strip().upper()
1132
+ prompt = (
1133
+ "<|start_header_id|>user<|end_header_id|>\n"
1134
+ f"{random.choice(T2S_INSTRUCTION)}\n{text}"
1135
+ "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
1136
+ )
1137
+
1138
+ audio_tokens = torch.full(
1139
+ (1, speech_len),
1140
+ fill_value=self.mask_token_id,
1141
+ dtype=torch.long,
1142
+ device=self.device,
1143
+ )
1144
+
1145
+ input_ids, attention_mask = self.uni_prompting(([prompt], audio_tokens), "t2s_gen")
1146
+ input_ids = input_ids.to(self.device)
1147
+ attention_mask = attention_mask.to(self.device)
1148
+
1149
+ condition = f"gender-{gender}_emotion-{emotion}_speed-{speed}_pitch-{pitch}"
1150
+
1151
+ last_audio = None
1152
+ accumulated = None
1153
+ prev_len = 0
1154
+ for rel_list, step_status in self.model.t2s_generate_mmu_like_stream(
1155
+ input_ids=input_ids,
1156
+ max_new_tokens=int(speech_len),
1157
+ steps=int(steps),
1158
+ block_length=int(block_length),
1159
+ temperature=float(temperature),
1160
+ cfg_scale=float(cfg_scale),
1161
+ mask_token_id=self.mask_token_id,
1162
+ attention_mask=attention_mask,
1163
+ uni_prompting=self.uni_prompting,
1164
+ codebook_size=self.codebook_size,
1165
+ update_every=update_every,
1166
+ ):
1167
+ if not rel_list:
1168
+ continue
1169
+ rel = rel_list[0]
1170
+ if isinstance(rel, torch.Tensor):
1171
+ rel_ids = rel.detach().cpu().tolist()
1172
+ else:
1173
+ rel_ids = list(rel)
1174
+ if not rel_ids:
1175
+ continue
1176
+ if prev_len >= len(rel_ids):
1177
+ continue
1178
+ new_ids = rel_ids[prev_len:]
1179
+ prev_len = len(rel_ids)
1180
+ speech_units = "".join(f"<|speech_{sid}|>" for sid in new_ids)
1181
+ wav = self.vq_audio.decode(
1182
+ speech_units,
1183
+ condition=condition,
1184
+ output_wav_file=os.path.join("/tmp", "omada_t2s_stream.wav"),
1185
+ )
1186
+ chunk = wav.astype(np.float32)
1187
+ if accumulated is None:
1188
+ accumulated = chunk
1189
+ else:
1190
+ accumulated = np.concatenate([accumulated, chunk], axis=0)
1191
+ audio = (self.sample_rate, accumulated)
1192
+ last_audio = audio
1193
+ yield audio, f"{step_status} ({gender}/{emotion}/{speed}/{pitch})"
1194
+
1195
+ if last_audio is not None:
1196
+ yield last_audio, f"Speech generated! ({gender}/{emotion}/{speed}/{pitch})."
1197
+
1198
  # ------------------------------------------------------------------
1199
  # Speech-to-Speech
1200
  # ------------------------------------------------------------------
 
1316
  max_new_tokens: int,
1317
  remasking: str,
1318
  ) -> Tuple[str, str]:
1319
+ if self.force_eval_settings:
1320
+ d = self.eval_defaults["s2t"]
1321
+ steps = int(d["steps"])
1322
+ block_length = int(d["block_length"])
1323
+ max_new_tokens = int(d["max_new_tokens"])
1324
+ remasking = str(d["remasking"])
1325
 
1326
  if not audio_path:
1327
  return "", "Please upload an audio file first."
1328
 
1329
+ remasking = str(remasking).lower()
1330
+ if remasking == "full":
1331
+ remasking = "low_confidence"
1332
+
1333
  tokens = self.vq_audio.encode(audio_path).to(self.device)
1334
  offset = self.text_vocab_size + self.speech_codebook
1335
  tokens = tokens + offset
 
1367
  remasking=str(remasking),
1368
  )
1369
 
1370
+ decoded = self._decode_chat_tokens(
1371
+ output_ids[0, input_ids.shape[1]:],
1372
+ self.uni_prompting.text_tokenizer,
1373
+ ).strip()
1374
+ decoded = self._postprocess_chat_text(decoded)
1375
+ decoded = self._strip_trailing_masks(decoded)
1376
+ decoded = self._remove_mask_artifacts(decoded)
1377
  return decoded.strip(), "Transcription generated successfully."
1378
 
1379
+ def run_s2t_stream(
1380
+ self,
1381
+ audio_path: Optional[str],
1382
+ steps: int,
1383
+ block_length: int,
1384
+ max_new_tokens: int,
1385
+ remasking: str,
1386
+ update_every: int = 32,
1387
+ ):
1388
+ if self.force_eval_settings:
1389
+ d = self.eval_defaults["s2t"]
1390
+ steps = int(d["steps"])
1391
+ block_length = int(d["block_length"])
1392
+ max_new_tokens = int(d["max_new_tokens"])
1393
+ remasking = str(d["remasking"])
1394
+
1395
+ if not audio_path:
1396
+ yield "", "Please upload an audio file first."
1397
+ return
1398
+
1399
+ remasking = str(remasking).lower()
1400
+ if remasking == "full":
1401
+ remasking = "low_confidence"
1402
+
1403
+ tokens = self.vq_audio.encode(audio_path).to(self.device)
1404
+ offset = self.text_vocab_size + self.speech_codebook
1405
+ tokens = tokens + offset
1406
+
1407
+ spt = self.uni_prompting.sptids_dict
1408
+ audio_block = torch.cat(
1409
+ [
1410
+ spt['<|s2t|>'].to(self.device).unsqueeze(0),
1411
+ spt['<|soa|>'].to(self.device).unsqueeze(0),
1412
+ tokens.to(self.device),
1413
+ spt['<|eoa|>'].to(self.device).unsqueeze(0),
1414
+ ],
1415
+ dim=1,
1416
+ )
1417
+
1418
+ prompt_text = random.choice(S2T_INSTRUCTION)
1419
+ chat_prompt = (
1420
+ "<|start_header_id|>user<|end_header_id|>\n"
1421
+ f"{prompt_text}"
1422
+ "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
1423
+ )
1424
+ prompt_tensor = self.uni_prompting.text_tokenizer(
1425
+ chat_prompt,
1426
+ return_tensors="pt",
1427
+ ).input_ids.to(self.device)
1428
+
1429
+ input_ids = torch.cat([audio_block, prompt_tensor], dim=1)
1430
+
1431
+ step_counter = 0
1432
+ latest_decoded = ""
1433
+ for snapshot, prompt_len in self._generate_text_tokens_stream(
1434
+ input_ids,
1435
+ max_new_tokens=int(max_new_tokens),
1436
+ steps=int(steps),
1437
+ block_length=int(block_length),
1438
+ temperature=1.0,
1439
+ cfg_scale=0.0,
1440
+ attention_mask=None,
1441
+ remasking=remasking,
1442
+ ):
1443
+ step_counter += 1
1444
+ if update_every > 1 and step_counter % update_every != 0:
1445
+ continue
1446
+ decoded = self._decode_chat_tokens(
1447
+ snapshot[0, prompt_len:],
1448
+ self.uni_prompting.text_tokenizer,
1449
+ ).strip()
1450
+ decoded = self._postprocess_chat_text(decoded)
1451
+ latest_decoded = decoded
1452
+ yield decoded, "Generating..."
1453
+
1454
+ finalized = self._remove_mask_artifacts(self._strip_trailing_masks(latest_decoded))
1455
+ yield finalized.strip(), "Transcription generated successfully."
1456
+
1457
  # ------------------------------------------------------------------
1458
  # Video-to-Text
1459
  # ------------------------------------------------------------------
 
1464
  block_length: int,
1465
  max_new_tokens: int,
1466
  ) -> Tuple[str, str]:
1467
+ if self.force_eval_settings:
1468
+ d = self.eval_defaults["v2t"]
1469
+ steps = int(d["steps"])
1470
+ block_length = int(d["block_length"])
1471
+ max_new_tokens = int(d["max_new_tokens"])
1472
 
1473
  resolved_path, converted = self._prepare_video_path(video_path)
1474
  if not resolved_path:
 
1518
  raw_all = self.uni_prompting.text_tokenizer.decode(output_ids[0], skip_special_tokens=False)
1519
  print("[V2T] RAW ALL:", repr(raw_all))
1520
 
1521
+ decoded = self._decode_chat_tokens(
1522
+ output_ids[0, input_ids.shape[1]:],
1523
+ self.uni_prompting.text_tokenizer,
1524
+ )
1525
  print("[V2T] DECODED SLICE:", repr(decoded))
1526
+ decoded = self._postprocess_chat_text(decoded)
1527
+ decoded = self._strip_trailing_masks(decoded)
1528
+ decoded = self._remove_mask_artifacts(decoded)
1529
  return decoded.strip(), "Video caption generated successfully."
1530
 
1531
+ def run_v2t_stream(
1532
+ self,
1533
+ video_path: Any,
1534
+ steps: int,
1535
+ block_length: int,
1536
+ max_new_tokens: int,
1537
+ update_every: int = 32,
1538
+ ):
1539
+ if self.force_eval_settings:
1540
+ d = self.eval_defaults["v2t"]
1541
+ steps = int(d["steps"])
1542
+ block_length = int(d["block_length"])
1543
+ max_new_tokens = int(d["max_new_tokens"])
1544
+
1545
+ resolved_path, converted = self._prepare_video_path(video_path)
1546
+ if not resolved_path:
1547
+ yield "", "Please upload or record a video first."
1548
+ return
1549
+
1550
+ try:
1551
+ video_tokens = self._extract_video_tokens(resolved_path, num_frame=self.num_frames_v2t)
1552
+ except Exception as exc:
1553
+ yield "", f"Failed to process video: {exc}"
1554
+ return
1555
+
1556
+ prompt_text = random.choice(V2T_INSTRUCTION)
1557
+ prompt = (
1558
+ "<|start_header_id|>user<|end_header_id|>\n"
1559
+ f"{prompt_text}"
1560
+ "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
1561
+ )
1562
+ prompt_tensor = self.uni_prompting.text_tokenizer(
1563
+ prompt,
1564
+ return_tensors="pt",
1565
+ ).input_ids.to(self.device)
1566
+
1567
+ input_ids = torch.cat([video_tokens, prompt_tensor], dim=1)
1568
+
1569
+ step_counter = 0
1570
+ latest_decoded = ""
1571
+ for snapshot, prompt_len in self._generate_text_tokens_stream(
1572
+ input_ids,
1573
+ max_new_tokens=int(max_new_tokens),
1574
+ steps=int(steps),
1575
+ block_length=int(block_length),
1576
+ temperature=1.0,
1577
+ cfg_scale=0.0,
1578
+ attention_mask=None,
1579
+ remasking="low_confidence",
1580
+ ):
1581
+ step_counter += 1
1582
+ if update_every > 1 and step_counter % update_every != 0:
1583
+ continue
1584
+ decoded = self._decode_chat_tokens(
1585
+ snapshot[0, prompt_len:],
1586
+ self.uni_prompting.text_tokenizer,
1587
+ ).strip()
1588
+ decoded = self._postprocess_chat_text(decoded)
1589
+ latest_decoded = decoded
1590
+ yield decoded, "Generating..."
1591
+
1592
+ finalized = self._remove_mask_artifacts(self._strip_trailing_masks(latest_decoded))
1593
+ yield finalized.strip(), "Video caption generated successfully."
1594
+
1595
  # ------------------------------------------------------------------
1596
  # Text-to-Image
1597
  # ------------------------------------------------------------------
 
1602
  temperature: float,
1603
  guidance_scale: float,
1604
  ) -> Tuple[Optional[Image.Image], str]:
1605
+ if self.force_eval_settings:
1606
+ d = self.eval_defaults["t2i"]
1607
+ timesteps = int(d["timesteps"])
1608
+ temperature = float(d["temperature"])
1609
+ guidance_scale = float(d["guidance_scale"])
1610
  if not prompt or not prompt.strip():
1611
  return None, "Please provide a text prompt."
1612
 
 
1655
  image = self._decode_image_tokens(gen_tokens[0])
1656
  return image, "Image generated from text prompt."
1657
 
1658
+ def run_t2i_stream(
1659
+ self,
1660
+ prompt: str,
1661
+ timesteps: int,
1662
+ temperature: float,
1663
+ guidance_scale: float,
1664
+ update_every: int = 2,
1665
+ ):
1666
+ if self.force_eval_settings:
1667
+ d = self.eval_defaults["t2i"]
1668
+ timesteps = int(d["timesteps"])
1669
+ temperature = float(d["temperature"])
1670
+ guidance_scale = float(d["guidance_scale"])
1671
+ if not prompt or not prompt.strip():
1672
+ yield None, "Please provide a text prompt."
1673
+ return
1674
+
1675
+ image_seq_len = 1024
1676
+ image_tokens = torch.full(
1677
+ (1, image_seq_len),
1678
+ self.mask_token_id,
1679
+ dtype=torch.long,
1680
+ device=self.device,
1681
+ )
1682
+ input_ids, attention_mask = self.uni_prompting(([prompt.strip()], image_tokens), "t2i_gen")
1683
+ input_ids = input_ids.to(self.device)
1684
+ attention_mask = attention_mask.to(self.device)
1685
+
1686
+ if guidance_scale > 0:
1687
+ uncond_ids, uncond_mask = self.uni_prompting(([""], image_tokens.clone()), "t2i_gen")
1688
+ uncond_ids = uncond_ids.to(self.device)
1689
+ uncond_mask = uncond_mask.to(self.device)
1690
+ else:
1691
+ uncond_ids = None
1692
+ uncond_mask = None
1693
+
1694
+ step_count = 0
1695
+ for pil_image, status in self.model.t2i_generate_decoding_stepwise(
1696
+ input_ids=input_ids,
1697
+ uncond_input_ids=uncond_ids,
1698
+ attention_mask=attention_mask,
1699
+ uncond_attention_mask=uncond_mask,
1700
+ guidance_scale=float(guidance_scale),
1701
+ temperature=float(temperature),
1702
+ timesteps=int(timesteps),
1703
+ noise_schedule=self.mask_schedule,
1704
+ noise_type=self.noise_type,
1705
+ predict_all_tokens=self.predict_all_tokens,
1706
+ seq_len=image_seq_len,
1707
+ mask_token_id=self.mask_token_id,
1708
+ codebook_size=self.codebook_size,
1709
+ uni_prompting=self.uni_prompting,
1710
+ config=self.train_cfg,
1711
+ vq_model=self.vq_image,
1712
+ ):
1713
+ step_count += 1
1714
+ if update_every <= 1 or step_count % update_every == 0 or step_count == int(timesteps):
1715
+ yield pil_image, status
1716
+
1717
  # ------------------------------------------------------------------
1718
  # Image-to-Image Editing
1719
  # ------------------------------------------------------------------
 
1725
  temperature: float,
1726
  guidance_scale: float,
1727
  ) -> Tuple[Optional[Image.Image], str]:
1728
+ if self.force_eval_settings:
1729
+ d = self.eval_defaults["i2i"]
1730
+ timesteps = int(d["timesteps"])
1731
+ temperature = float(d["temperature"])
1732
+ guidance_scale = float(d["guidance_scale"])
1733
  if source_image is None:
1734
  return None, "Please upload a reference image."
1735
  if not instruction or not instruction.strip():
 
1790
  image = self._decode_image_tokens(gen_tokens[0])
1791
  return image, "Edited image generated."
1792
 
1793
    def run_i2i_stream(
        self,
        instruction: str,
        source_image: Optional[Image.Image],
        timesteps: int,
        temperature: float,
        guidance_scale: float,
        update_every: int = 2,
    ):
        """Stream image-to-image editing, yielding (image, status) per decode step.

        Yields ``(PIL.Image | None, str)`` tuples: intermediate decoded images
        while diffusion runs, or ``(None, message)`` on input/validation errors.

        Args:
            instruction: Editing instruction text; must be non-empty.
            source_image: Reference image to edit; must not be None.
            timesteps: Number of diffusion decoding steps.
            temperature: Sampling temperature forwarded to the decoder.
            guidance_scale: CFG scale; > 0 enables an unconditional branch.
            update_every: Yield only every N-th step (plus the final step)
                to limit UI refreshes.
        """
        # When locked to evaluation settings, the caller-provided knobs are
        # overridden by the curated defaults for the i2i task.
        if self.force_eval_settings:
            d = self.eval_defaults["i2i"]
            timesteps = int(d["timesteps"])
            temperature = float(d["temperature"])
            guidance_scale = float(d["guidance_scale"])
        if source_image is None:
            yield None, "Please upload a reference image."
            return
        if not instruction or not instruction.strip():
            yield None, "Provide editing instructions for the image."
            return

        try:
            input_tokens = self._prepare_image_tokens(source_image, resolution=self.image_resolution)
        except Exception as exc:
            yield None, f"Failed to encode input image: {exc}"
            return

        # The output region starts fully masked; the model fills it in
        # progressively over `timesteps` denoising steps.
        seq_len = int(input_tokens.shape[-1])
        output_placeholder = torch.full(
            (1, seq_len),
            self.mask_token_id,
            dtype=torch.long,
            device=self.device,
        )

        input_ids, attention_mask = self.uni_prompting(
            ([instruction.strip()], input_tokens, output_placeholder),
            "i2i_gen",
        )
        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)

        # Unconditional branch (empty instruction, same source tokens) is only
        # built when classifier-free guidance is actually requested.
        uncond_ids = None
        uncond_attn = None
        if guidance_scale > 0:
            uncond_ids, uncond_attn = self.uni_prompting(
                ([""], input_tokens.clone(), torch.full_like(output_placeholder, self.mask_token_id)),
                "i2i_gen",
            )
            uncond_ids = uncond_ids.to(self.device)
            uncond_attn = uncond_attn.to(self.device)

        step_count = 0
        for pil_image, status in self.model.i2i_generate_decoding_stepwise(
            input_ids=input_ids,
            uncond_input_ids=uncond_ids,
            attention_mask=attention_mask,
            uncond_attention_mask=uncond_attn,
            temperature=float(temperature),
            timesteps=int(timesteps),
            guidance_scale=float(guidance_scale),
            noise_schedule=self.mask_schedule,
            noise_type=self.noise_type,
            seq_len=seq_len,
            mask_token_id=self.mask_token_id,
            codebook_size=self.codebook_size,
            uni_prompting=self.uni_prompting,
            config=self.train_cfg,
            vq_model=self.vq_image,
        ):
            step_count += 1
            # Throttle UI updates: emit every `update_every` steps and always
            # emit the final step so the finished image is shown.
            if update_every <= 1 or step_count % update_every == 0 or step_count == int(timesteps):
                yield pil_image, status
1866
+
1867
  # ------------------------------------------------------------------
1868
  # Video-to-Speech
1869
  # ------------------------------------------------------------------
 
2036
  block_length: int,
2037
  temperature: float,
2038
  ) -> Tuple[str, str]:
2039
+ if self.force_eval_settings:
2040
+ d = self.eval_defaults["chat"]
2041
+ max_new_tokens = int(d["max_new_tokens"])
2042
+ steps = int(d["steps"])
2043
+ block_length = int(d["block_length"])
2044
+ temperature = float(d["temperature"])
2045
  content = (message or "").strip()
2046
  if not content:
2047
  return "", "Type a message to start chatting."
 
2084
  else:
2085
  output_ids, step_snapshots = output_result, []
2086
 
2087
+ decoded = self._decode_chat_tokens(output_ids[0, input_ids.shape[1]:], tokenizer)
2088
+ decoded = self._postprocess_chat_text(decoded)
2089
+ decoded = self._strip_trailing_masks(decoded)
2090
+ decoded = self._remove_mask_artifacts(decoded)
2091
  return decoded.strip(), "Assistant reply generated."
2092
 
2093
  def run_chat_stream(
 
2101
  max_tokens_per_step: int = 0,
2102
  update_every: int = 25,
2103
  ):
2104
+ if self.force_eval_settings:
2105
+ d = self.eval_defaults["chat"]
2106
+ max_new_tokens = int(d["max_new_tokens"])
2107
+ steps = int(d["steps"])
2108
+ block_length = int(d["block_length"])
2109
+ temperature = float(d["temperature"])
2110
  content = (message or "").strip()
2111
  if not content:
2112
  yield "", "Type a message to start chatting.", True
 
2153
  if len(step_snapshots) > max_step_snapshots:
2154
  step_snapshots = step_snapshots[-max_step_snapshots:]
2155
  step_counter += 1
2156
+ raw_decoded = self._decode_chat_tokens(snapshot[0, prompt_len:], tokenizer)
2157
+ print(f"[CHAT_STREAM][step={step_counter}] raw_decoded={raw_decoded!r}", flush=True)
2158
  if update_every > 1 and step_counter % update_every != 0:
2159
  continue
2160
+ decoded = raw_decoded.strip()
2161
+ decoded = self._postprocess_chat_text(decoded)
 
 
2162
  steps_html = self._render_diffusion_steps(
2163
  step_snapshots,
2164
  max_tokens_per_step=max_tokens_per_step,
 
2169
  yield "", "Assistant reply generated.", True
2170
  return
2171
 
2172
+ decoded = self._decode_chat_tokens(latest_ids[0, input_ids.shape[1]:], tokenizer).strip()
2173
+ print(f"[CHAT_STREAM][final] raw_decoded={decoded!r}", flush=True)
2174
+ decoded = self._postprocess_chat_text(decoded)
2175
+ decoded = self._strip_trailing_masks(decoded)
2176
+ decoded = self._remove_mask_artifacts(decoded)
2177
  step_snapshots = [latest_ids[0, input_ids.shape[1]:].detach().cpu()]
2178
  steps_html = self._render_diffusion_steps(
2179
  step_snapshots,
 
2181
  )
2182
  yield self._format_chat_output(decoded, steps_html), "Assistant reply generated.", True
2183
 
2184
+ def _decode_chat_tokens(self, token_ids: torch.Tensor, tokenizer) -> str:
2185
+ """Decode chat tokens while preserving mask placeholders for UI."""
2186
+ ids = token_ids.detach().cpu().tolist()
2187
+ pieces = []
2188
+ run_ids = []
2189
+
2190
+ def _flush_run():
2191
+ nonlocal run_ids
2192
+ if not run_ids:
2193
+ return
2194
+ try:
2195
+ decoded_run = tokenizer.decode(
2196
+ run_ids,
2197
+ skip_special_tokens=False,
2198
+ clean_up_tokenization_spaces=False,
2199
+ )
2200
+ except Exception:
2201
+ decoded_run = ""
2202
+ pieces.append(decoded_run if decoded_run is not None else "")
2203
+ run_ids = []
2204
+
2205
+ for tid in ids:
2206
+ if int(tid) == int(self.mask_token_id):
2207
+ _flush_run()
2208
+ pieces.append(self.chat_mask_surface_token)
2209
+ else:
2210
+ run_ids.append(int(tid))
2211
+ _flush_run()
2212
+ return "".join(pieces)
2213
+
2214
  # ------------------------------------------------------------------
2215
  # General MMU (N Images → Text)
2216
  # ------------------------------------------------------------------
 
2223
  block_length: int,
2224
  temperature: float,
2225
  ) -> Tuple[str, str]:
2226
+ if self.force_eval_settings:
2227
+ d = self.eval_defaults["mmu"]
2228
+ max_new_tokens = int(d["max_new_tokens"])
2229
+ steps = int(d["steps"])
2230
+ block_length = int(d["block_length"])
2231
+ temperature = float(d["temperature"])
2232
  """
2233
  MMU demo now consumes exactly one image. If callers pass a list (for
2234
  backwards compatibility), we keep only the first valid image.
 
2297
  )
2298
 
2299
  def _format_chat_output(self, text: str, steps_html: str = "") -> str:
2300
+ """Render chat text inline; only mask tokens are shown as pills."""
2301
+ safe_text = (text or "").strip()
2302
+ if not safe_text:
2303
+ return ""
2304
+
2305
+ def _fmt_tokens(segment: str) -> str:
2306
+ mask_pat = r"(<MDM_MASK>|<\|?MDM_MASK[^>\s]*\|?>|\[MASK\]|<MASK>|MASK_TOKEN|<\|?MASK[^>\s]*\|?>)"
2307
+ pieces = re.split(mask_pat, segment, flags=re.IGNORECASE)
2308
+ out = []
2309
+ for p in pieces:
2310
+ if not p:
2311
+ continue
2312
+ if re.fullmatch(mask_pat, p, flags=re.IGNORECASE):
2313
+ out.append("<span class='omada-token omada-token-mask'>MASK</span>")
2314
+ else:
2315
+ out.append(html.escape(p).replace("\n", "<br>"))
2316
+ return "".join(out)
2317
+
2318
+ parts = []
2319
+ cursor = 0
2320
+ for m in re.finditer(r"<think>(.*?)</think>", safe_text, flags=re.DOTALL | re.IGNORECASE):
2321
+ prefix = safe_text[cursor:m.start()]
2322
  if prefix:
2323
+ parts.append(_fmt_tokens(prefix))
2324
+ think_body = m.group(1) or ""
2325
+ parts.append(f"<div class='omada-response-block'><b>Think:</b><br>{_fmt_tokens(think_body)}</div>")
2326
+ cursor = m.end()
2327
+ tail = safe_text[cursor:]
2328
+ if tail:
2329
+ parts.append(_fmt_tokens(tail))
2330
+ body = "".join(parts).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2331
  if not body:
2332
  return ""
2333
  return f"<div class='omada-response-block'>{body}</div>"
2334
 
2335
+ def _postprocess_chat_text(self, text: str) -> str:
2336
+ """Remove special/system tokens while keeping think content."""
2337
+ if not text:
2338
+ return ""
2339
+ cleaned = text
2340
+ # Normalize common malformed boundaries seen in streamed decode.
2341
+ cleaned = cleaned.replace("</thinkboxed", "</think>boxed")
2342
+ cleaned = cleaned.replace("<thinkboxed", "<think>boxed")
2343
+ # Keep think tags/content; only strip protocol-level special tokens.
2344
+ # Strip special tokens like <|eot_id|>, <|start_header_id|>, etc.
2345
+ cleaned = re.sub(r"<\|[^>]*\|>", "", cleaned)
2346
+ # Also remove truncated special tokens without the trailing ">".
2347
+ cleaned = re.sub(r"<\|[^\n]*\|", "", cleaned)
2348
+ cleaned = cleaned.replace("<|endoftext|>", "")
2349
+ cleaned = cleaned.replace("<|endoftext|", "")
2350
+ return cleaned.strip()
2351
+
2352
+ def _strip_trailing_masks(self, text: str) -> str:
2353
+ if not text:
2354
+ return ""
2355
+ mask_tail = (
2356
+ r"(?:\s*(?:\(|\[)?(?:<MDM_MASK>|<\|?MDM_MASK[^>\s]*\|?>|\[MASK\]|<MASK>|MASK_TOKEN)"
2357
+ r"(?:\)|\])?)+\s*$"
2358
+ )
2359
+ return re.sub(mask_tail, "", text, flags=re.IGNORECASE).rstrip()
2360
+
2361
+ def _remove_mask_artifacts(self, text: str) -> str:
2362
+ if not text:
2363
+ return ""
2364
+ mask_pat = r"(<MDM_MASK>|<\|?MDM_MASK[^>\s]*\|?>|\[MASK\]|<MASK>|MASK_TOKEN|<\|?MASK[^>\s]*\|?>)"
2365
+ cleaned = re.sub(mask_pat, " ", text, flags=re.IGNORECASE)
2366
+ # Some tokenizers may emit literal MASK text instead of the special token.
2367
+ cleaned = re.sub(r"MASK", " ", cleaned)
2368
+ cleaned = re.sub(r"\s+([,.;:!?])", r"\1", cleaned)
2369
+ cleaned = re.sub(r"\s{2,}", " ", cleaned).strip()
2370
+ return cleaned
2371
+
2372
  def _render_diffusion_steps(
2373
  self,
2374
  step_snapshots: List[torch.Tensor],
 
2660
  mmu_input_ids = mmu_input_ids.to(self.device)
2661
  prompt_masks = prompt_masks.to(self.device)
2662
 
2663
+ prompt_len = int(prompt_masks.sum(dim=1).max().item())
2664
+ if prompt_len > 0:
2665
+ mmu_input_ids = mmu_input_ids[:, :prompt_len]
2666
+
2667
  answer_tokens = int((prompt_masks == 0).sum(dim=1).max().item())
2668
  default_budget = max(1, answer_tokens) if answer_tokens > 0 else min(self.max_text_len, 256)
2669
  gen_tokens = int(max_new_tokens or default_budget)
 
2676
  )
2677
  temperature = float(temperature if temperature is not None else 0.7)
2678
 
2679
+ input_prompt_len = mmu_input_ids.shape[1]
 
 
 
 
 
 
 
2680
 
2681
  with torch.no_grad():
2682
  output_ids = self.model.mmu_generate(
 
2689
  mask_id=self.mask_token_id,
2690
  )
2691
 
2692
+ gen_slice = output_ids[0, input_prompt_len:]
2693
+ if gen_slice.numel() == 0:
2694
+ # Some checkpoints may return only generated ids (without prepended prompt).
2695
+ gen_slice = output_ids[0]
2696
+ decoded = self._decode_chat_tokens(
2697
+ gen_slice,
2698
+ self.uni_prompting.text_tokenizer,
2699
+ ).strip()
2700
+ print(
2701
+ f"[MMU] input_prompt_len={input_prompt_len} output_len={int(output_ids.shape[1])} "
2702
+ f"gen_len={int(gen_slice.numel())} first_ids={gen_slice[:16].detach().cpu().tolist()}",
2703
+ flush=True,
2704
+ )
2705
+ print(f"[MMU] raw_decoded={decoded!r}", flush=True)
2706
+ decoded = self._postprocess_chat_text(decoded)
2707
+ decoded = self._strip_trailing_masks(decoded)
2708
+ decoded = self._remove_mask_artifacts(decoded)
2709
  if not decoded:
2710
  return "", "MMU response was empty."
2711
  return decoded, "Image understanding succeeded."
2712
 
2713
    def _finalize_generation_masks(
        self,
        work: torch.Tensor,
        prompt_len: int,
        attention_bias: Optional[torch.Tensor] = None,
        cfg_scale: float = 0.0,
    ) -> torch.Tensor:
        """Force-fill any residual masks after scheduled diffusion steps.

        Runs one extra forward pass and replaces every remaining mask token
        (anywhere in `work`) with the argmax prediction. Returns `work`
        unchanged if it is empty or no masks remain past the prompt.

        Args:
            work: Token sequence buffer, shape (batch, seq_len).
            prompt_len: Length of the prompt prefix; only the generated
                region is checked for leftover masks.
            attention_bias: Optional precomputed attention bias; used only
                on the non-CFG path.
            cfg_scale: Classifier-free guidance scale; > 0 enables a
                doubled-batch conditional/unconditional pass.
        """
        if work.numel() == 0:
            return work
        # Fast path: nothing left to fill beyond the prompt.
        if not (work[:, prompt_len:] == self.mask_token_id).any():
            return work

        with torch.no_grad():
            if cfg_scale > 0.0:
                # Build the unconditional branch by masking every currently
                # known (non-mask) token, then blend cond/uncond logits.
                # NOTE(review): attention_bias is not applied on this path —
                # presumably intentional for the doubled batch; confirm.
                prompt_index = work != self.mask_token_id
                unconditional = work.clone()
                unconditional[prompt_index] = self.mask_token_id
                model_input = torch.cat([work, unconditional], dim=0)
                logits = self.model(model_input).logits
                cond_logits, uncond_logits = torch.chunk(logits, 2, dim=0)
                logits = uncond_logits + (cfg_scale + 1.0) * (cond_logits - uncond_logits)
            else:
                logits = self.model(work, attention_bias=attention_bias).logits

            # Greedy fill: take argmax everywhere, but only write it into
            # positions that are still masked.
            greedy = torch.argmax(logits, dim=-1)
            mask_idx = work == self.mask_token_id
            work = torch.where(mask_idx, greedy, work)
        return work
2742
+
2743
  def _generate_text_tokens(
2744
  self,
2745
  prompt_ids: torch.Tensor,
 
2817
 
2818
  transfer_index = torch.zeros_like(work, dtype=torch.bool)
2819
  for b in range(batch_size):
2820
+ block_mask_now = torch.where(work[b, block_slice] == self.mask_token_id)[0]
2821
+ if inner_step == inner_steps - 1:
2822
+ # Guarantee: no masks remain in this block after its last step.
2823
+ if block_mask_now.numel() > 0:
2824
+ transfer_index[b, prompt_len + block_idx * block_length + block_mask_now] = True
2825
+ continue
2826
+
2827
  k = int(num_transfer_tokens[b, inner_step].item())
2828
  if k <= 0:
2829
  continue
 
2834
  if return_steps and batch_size > 0:
2835
  step_snapshots.append(work[0, prompt_len:].detach().cpu())
2836
 
2837
+ work = self._finalize_generation_masks(
2838
+ work,
2839
+ prompt_len=prompt_len,
2840
+ attention_bias=attention_bias,
2841
+ cfg_scale=cfg_scale,
2842
+ )
2843
+ if return_steps and batch_size > 0:
2844
+ step_snapshots.append(work[0, prompt_len:].detach().cpu())
2845
+
2846
  if return_steps:
2847
  return work, step_snapshots
2848
  return work
 
2923
 
2924
  transfer_index = torch.zeros_like(work, dtype=torch.bool)
2925
  for b in range(batch_size):
2926
+ block_mask_now = torch.where(work[b, block_slice] == self.mask_token_id)[0]
2927
+ if inner_step == inner_steps - 1:
2928
+ # Guarantee: no masks remain in this block after its last step.
2929
+ if block_mask_now.numel() > 0:
2930
+ transfer_index[b, prompt_len + block_idx * block_length + block_mask_now] = True
2931
+ continue
2932
+
2933
  k = int(num_transfer_tokens[b, inner_step].item())
2934
  if k <= 0:
2935
  continue
 
2939
  work[transfer_index] = x0[transfer_index]
2940
  yield work.clone(), prompt_len
2941
 
2942
+ work = self._finalize_generation_masks(
2943
+ work,
2944
+ prompt_len=prompt_len,
2945
+ attention_bias=attention_bias,
2946
+ cfg_scale=cfg_scale,
2947
+ )
2948
+ yield work.clone(), prompt_len
2949
+
2950
  def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optional[int]):
2951
  theme = gr.themes.Soft(primary_hue="blue", neutral_hue="gray")
2952
  with gr.Blocks(title="AIDAS Lab @ SNU", css=CUSTOM_CSS, theme=theme, js=FORCE_LIGHT_MODE_JS) as demo:
 
3385
  response = ""
3386
 
3387
  if mode == "Text":
3388
+ display_user_raw = message or "[Text request]"
3389
+ display_user = _format_user_message(display_user_raw)
3390
+ history = history + [(display_user, _render_text_message("Generating...", ""))]
3391
+ yield history, ""
3392
+
3393
+ for reply_html, status, done in app.run_chat_stream(
3394
  message,
3395
  chat_max_tokens,
3396
  chat_steps,
3397
  chat_block,
3398
  chat_temperature,
3399
+ update_every=32,
3400
+ ):
3401
+ response = _render_text_message(status, reply_html)
3402
+ history[-1] = (display_user, response)
3403
+ yield history, ""
3404
+ return
3405
  elif mode == "Text → Speech":
3406
  if not message:
3407
  status = "Please type some text for speech synthesis."
MMaDA/models/modeling_omada.py CHANGED
@@ -597,6 +597,196 @@ class OMadaModelLM(LLaDAModelLM):
597
 
598
  return final_outputs
599
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
600
  @torch.no_grad()
601
  def t2s_fixed_generate(
602
  self,
@@ -2116,8 +2306,8 @@ class OMadaModelLM(LLaDAModelLM):
2116
  uncond_input_ids = torch.cat(
2117
  [uncond_prefix, input_ids[:, resolution + 1:]], dim=1)
2118
  model_input = torch.cat([input_ids, uncond_input_ids])
2119
- attention_mask = torch.cat([attention_mask, uncond_attention_mask], dim=0)
2120
- attention_bias = (attention_mask[:, :, None] & attention_mask[:, None, :]).bool().unsqueeze(1)
2121
  logits = self(model_input, attention_bias=attention_bias).logits
2122
  # print(f"logits.shape: {logits.shape}")
2123
  cond_logits, uncond_logits = torch.chunk(logits, 2, dim=0)
@@ -2178,6 +2368,101 @@ class OMadaModelLM(LLaDAModelLM):
2178
 
2179
 
2180
  return sampled_ids
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2181
 
2182
 
2183
  AutoConfig.register("omada", OMadaConfig)
 
597
 
598
  return final_outputs
599
 
600
    @torch.no_grad()
    def t2s_generate_mmu_like_stream(
        self,
        input_ids: torch.LongTensor,
        max_new_tokens: Optional[int] = None,
        steps: int = 256,
        block_length: int = 128,
        temperature: float = 0.0,
        cfg_scale: float = 0.0,
        mask_token_id: int = 126336,
        attention_mask: Optional[torch.LongTensor] = None,
        uni_prompting=None,
        codebook_size: Optional[int] = None,
        audio_codebook_size: int = 4096,
        update_every: Optional[int] = None,
    ):
        """
        Stream speech token generation. Yields intermediate token lists.

        Block-wise mask-diffusion decoding over the masked speech region of
        `input_ids`. Each yield is a tuple ``(per-batch lists of relative VQ
        token ids, "Step i/N" status string)``. Relative ids are offsets into
        the audio codebook; `audio_codebook_size` and `audio_codebook_size+1`
        stand in for <|eoa|> and EOS respectively, and output is truncated at
        the first <|eoa|>.

        Raises:
            ValueError: if `uni_prompting` is missing, `block_length` <= 0,
                no mask tokens are present, or batch items have differing
                numbers of masked speech tokens.
        """
        if uni_prompting is None:
            raise ValueError("uni_prompting must be provided")
        if block_length <= 0:
            raise ValueError("block_length must be positive")

        batch_size, seq_len = input_ids.shape
        device = input_ids.device

        # The speech region is defined by the masked positions of batch item 0;
        # all items must have identical mask counts (checked below).
        mask_positions_full = (input_ids == mask_token_id)
        if not mask_positions_full.any():
            raise ValueError("No mask tokens detected for T2S generation")

        mask_cols = torch.where(mask_positions_full[0])[0]
        speech_region_start = mask_cols[0].item()
        speech_region_len = mask_cols.numel()

        mask_counts = mask_positions_full.sum(dim=1)
        if not torch.all(mask_counts == mask_counts[0]):
            raise ValueError("All batch items must contain the same number of masked speech tokens for MMU-like generation")

        if max_new_tokens is None:
            max_new_tokens = speech_region_len
        else:
            max_new_tokens = min(max_new_tokens, speech_region_len)

        # Split the region into blocks; `steps` is divided evenly across them.
        block_length = max(1, min(block_length, max_new_tokens))
        num_blocks = math.ceil(max_new_tokens / block_length)
        inner_steps = max(1, steps // num_blocks)

        # Speech VQ ids live after the text vocab and the image codebook.
        codebook_base = codebook_size if codebook_size is not None else getattr(self.config, "codebook_size", 8192)
        speech_vocab_start = len(uni_prompting.text_tokenizer) + codebook_base
        speech_vocab_end = speech_vocab_start + audio_codebook_size

        eoa_token_id = uni_prompting.sptids_dict['<|eoa|>'][0].item()
        eos_token_id = uni_prompting.text_tokenizer.eos_token_id
        # Relative ids for end-of-audio / end-of-sequence appended after the
        # audio codebook range in the restricted sampling vocabulary.
        vq_code_relative_eoa_id = audio_codebook_size
        vq_code_relative_eos_id = audio_codebook_size + 1

        work = input_ids.clone()

        attention_bias = None
        if attention_mask is not None:
            attention_bias = (attention_mask[:, :, None] & attention_mask[:, None, :]).bool().unsqueeze(1)

        speech_indices = mask_cols[:max_new_tokens]
        total_steps = num_blocks * inner_steps
        global_step = 0

        def _extract_relative_tokens(work_tensor: torch.Tensor):
            # Convert the absolute-vocab speech region back to relative VQ ids,
            # truncating at the first <|eoa|>/EOS and dropping unfilled masks.
            audio_slice = slice(speech_region_start, speech_region_start + speech_region_len)
            audio_region = work_tensor[:, audio_slice]
            final_outputs = []
            for seq in audio_region:
                mask_tensor = seq.new_full(seq.shape, mask_token_id)
                rel_eoa = seq.new_full(seq.shape, vq_code_relative_eoa_id)
                rel_eos = seq.new_full(seq.shape, vq_code_relative_eos_id)
                relative = torch.where(
                    seq == mask_token_id,
                    mask_tensor,
                    torch.where(
                        seq == eoa_token_id,
                        rel_eoa,
                        torch.where(
                            seq == eos_token_id,
                            rel_eos,
                            seq - speech_vocab_start
                        )
                    )
                )
                eoa_positions = (relative >= vq_code_relative_eoa_id).nonzero(as_tuple=True)[0]
                if eoa_positions.numel() > 0:
                    relative = relative[:eoa_positions[0]]
                final_outputs.append(relative[relative != mask_token_id])
            return final_outputs

        for block_idx in range(num_blocks):
            block_start = block_idx * block_length
            block_end = min(block_start + block_length, max_new_tokens)
            curr_indices = speech_indices[block_start:block_end]
            if curr_indices.numel() == 0:
                continue

            block_mask = mask_positions_full[:, curr_indices]
            num_transfer_tokens = get_num_transfer_tokens(block_mask, inner_steps)

            for inner_step in range(inner_steps):
                if cfg_scale > 0.0:
                    # Classifier-free guidance: unconditional branch masks the
                    # whole speech region, then logits are blended.
                    un_cond = work.clone()
                    un_cond[:, speech_indices] = mask_token_id
                    stacked = torch.cat([work, un_cond], dim=0)
                    if attention_bias is not None:
                        att_bias = torch.cat([attention_bias, attention_bias], dim=0)
                    else:
                        att_bias = None
                    logits = self(stacked, attention_bias=att_bias).logits
                    cond_logits, uncond_logits = torch.chunk(logits, 2, dim=0)
                    logits = uncond_logits + (cfg_scale + 1.0) * (cond_logits - uncond_logits)
                else:
                    logits = self(work, attention_bias=attention_bias).logits

                # Restrict sampling to speech VQ ids plus <|eoa|> and EOS.
                logits_block = logits.index_select(1, curr_indices.to(device))
                logits_vq = logits_block[:, :, speech_vocab_start:speech_vocab_end]
                logits_eoa = logits_block[:, :, eoa_token_id:eoa_token_id + 1]
                logits_eos = logits_block[:, :, eos_token_id:eos_token_id + 1]

                combined_logits = torch.cat([logits_vq, logits_eoa, logits_eos], dim=-1)
                if temperature > 0.0:
                    combined_logits = combined_logits / max(temperature, 1e-5)
                probs = F.softmax(combined_logits, dim=-1)

                sampled = torch.multinomial(
                    probs.view(-1, probs.size(-1)), 1
                ).view(batch_size, curr_indices.numel())

                selected_probs = torch.gather(probs, -1, sampled.unsqueeze(-1)).squeeze(-1)

                # Map relative sample ids back into the absolute vocabulary.
                eos_tensor = sampled.new_full(sampled.shape, eos_token_id)
                eoa_tensor = sampled.new_full(sampled.shape, eoa_token_id)
                sampled_absolute = torch.where(
                    sampled == vq_code_relative_eos_id,
                    eos_tensor,
                    torch.where(
                        sampled == vq_code_relative_eoa_id,
                        eoa_tensor,
                        sampled + speech_vocab_start
                    )
                )

                current_block_vals = work.index_select(1, curr_indices)
                mask_current = current_block_vals == mask_token_id

                # Confidence drives which positions are committed this step;
                # already-filled positions can never be re-selected (-inf).
                confidence = torch.where(
                    mask_current,
                    selected_probs,
                    torch.full_like(selected_probs, float('-inf'))
                )

                finalize = torch.zeros_like(mask_current, dtype=torch.bool)
                for b in range(batch_size):
                    available = mask_current[b].sum().item()
                    if available == 0:
                        continue
                    transfer = min(int(num_transfer_tokens[b, inner_step].item()), available)
                    if transfer <= 0:
                        continue
                    _, idxs = torch.topk(confidence[b], k=transfer, largest=True)
                    finalize[b, idxs] = True

                # Commit the top-confidence samples; everything else stays masked.
                mask_fill = sampled_absolute.new_full(sampled_absolute.shape, mask_token_id)
                updates = torch.where(finalize, sampled_absolute, mask_fill)
                new_block = torch.where(mask_current, updates, current_block_vals)

                work[:, curr_indices] = new_block
                mask_positions_full[:, curr_indices] = new_block == mask_token_id

                global_step += 1
                # Yield either on an explicit cadence (`update_every`) or at
                # each block's last inner step / the overall last step.
                should_yield = False
                if update_every is not None and update_every > 0:
                    if global_step % update_every == 0 or global_step == total_steps:
                        should_yield = True
                else:
                    if inner_step == inner_steps - 1 or global_step == total_steps:
                        should_yield = True
                if should_yield:
                    yield _extract_relative_tokens(work), f"Step {global_step}/{total_steps}"

            # Early exit once the current block is fully decoded.
            if not mask_positions_full[:, curr_indices].any():
                break

        return
789
+
790
  @torch.no_grad()
791
  def t2s_fixed_generate(
792
  self,
 
2306
  uncond_input_ids = torch.cat(
2307
  [uncond_prefix, input_ids[:, resolution + 1:]], dim=1)
2308
  model_input = torch.cat([input_ids, uncond_input_ids])
2309
+ all_attention_mask = torch.cat([attention_mask, uncond_attention_mask], dim=0)
2310
+ attention_bias = (all_attention_mask[:, :, None] & all_attention_mask[:, None, :]).bool().unsqueeze(1)
2311
  logits = self(model_input, attention_bias=attention_bias).logits
2312
  # print(f"logits.shape: {logits.shape}")
2313
  cond_logits, uncond_logits = torch.chunk(logits, 2, dim=0)
 
2368
 
2369
 
2370
  return sampled_ids
2371
+
2372
    @torch.no_grad()
    def i2i_generate_decoding_stepwise(
        self,
        input_ids: torch.LongTensor = None,
        uncond_input_ids: torch.LongTensor = None,
        attention_mask=None,
        uncond_attention_mask=None,
        temperature=1.0,
        timesteps=18,  # ideal number of steps is 18 in maskgit paper
        guidance_scale=0,
        noise_schedule=cosine_schedule,
        generator: torch.Generator = None,
        config=None,
        seq_len=1024,
        mask_token_id=126336,
        resolution=512,
        codebook_size=8192,
        vq_model=None,
        **kwargs,
    ):
        """
        Stepwise i2i decoding that yields intermediate images per step.

        MaskGit-style iterative decoding over the last `seq_len` VQ token
        positions of `input_ids`. After each step the current token estimate
        is decoded through `vq_model` and yielded as a PIL image together with
        a "Step i/N" status string. Requires `uni_prompting` in **kwargs.

        Raises:
            ValueError: if `vq_model` is None.
        """
        if vq_model is None:
            raise ValueError("vq_model is required for stepwise decoding.")

        # NOTE(review): mask_count is computed but never used.
        mask_count = (input_ids == mask_token_id).sum().item()
        num_vq_tokens = seq_len
        num_new_special_tokens = 0
        uni_prompting = kwargs.get("uni_prompting", None)
        # Shift image tokens from the absolute vocabulary into [0, codebook);
        # masked positions keep the mask id as a sentinel.
        input_ids_minus_lm_vocab_size = input_ids[:, -(num_vq_tokens + 1):-1].clone()
        input_ids_minus_lm_vocab_size = torch.where(
            input_ids_minus_lm_vocab_size == mask_token_id,
            mask_token_id,
            input_ids_minus_lm_vocab_size - len(uni_prompting.text_tokenizer) - num_new_special_tokens,
        )

        if uncond_input_ids is not None:
            # The unconditional prompt prefix stays fixed; the VQ suffix is
            # refreshed from the conditional sequence each step.
            uncond_prefix = uncond_input_ids[:, :resolution + 1]

        for step in range(timesteps):
            if uncond_input_ids is not None and guidance_scale > 0:
                # Classifier-free guidance: run cond and uncond in one batch.
                uncond_input_ids = torch.cat(
                    [uncond_prefix, input_ids[:, resolution + 1:]], dim=1)
                model_input = torch.cat([input_ids, uncond_input_ids])
                all_attention_mask = torch.cat([attention_mask, uncond_attention_mask], dim=0)
                attention_bias = (all_attention_mask[:, :, None] & all_attention_mask[:, None, :]).bool().unsqueeze(1)
                logits = self(model_input, attention_bias=attention_bias).logits
                cond_logits, uncond_logits = torch.chunk(logits, 2, dim=0)
                logits = (1 + guidance_scale) * cond_logits - guidance_scale * uncond_logits
                # Keep only the image-token positions and the image codebook
                # slice of the vocabulary.
                logits = logits[:, -(num_vq_tokens + 1):-1,
                                len(uni_prompting.text_tokenizer) + num_new_special_tokens:
                                len(uni_prompting.text_tokenizer) + num_new_special_tokens + codebook_size]
            else:
                attention_bias = (attention_mask[:, :, None] & attention_mask[:, None, :]).bool().unsqueeze(1)
                logits = self(input_ids, attention_bias=attention_bias).logits
                logits = logits[:, -(num_vq_tokens + 1):-1,
                                len(uni_prompting.text_tokenizer) + num_new_special_tokens:
                                len(uni_prompting.text_tokenizer) + num_new_special_tokens + codebook_size]

            probs = logits.softmax(dim=-1)
            sampled = probs.reshape(-1, logits.size(-1))
            sampled_ids = torch.multinomial(sampled, 1, generator=generator)[:, 0].view(*logits.shape[:-1])

            # Only still-masked positions take new samples; known tokens stay.
            unknown_map = input_ids_minus_lm_vocab_size == mask_token_id
            sampled_ids = torch.where(unknown_map, sampled_ids, input_ids_minus_lm_vocab_size)

            # Decode the current estimate to a preview image (clamp guards
            # the mask sentinel, which lies outside the codebook range).
            current_image_vq_indices = torch.clamp(sampled_ids.clone(), 0, codebook_size - 1)
            current_image = vq_model.decode_code(current_image_vq_indices)
            images = torch.clamp((current_image + 1.0) / 2.0, min=0.0, max=1.0)
            images *= 255.0
            images = images.permute(0, 2, 3, 1).cpu().numpy().astype(np.uint8)
            pil_images = Image.fromarray(images[0])
            yield pil_images, f"Step {step + 1}/{timesteps}"

            # Re-mask the least confident tokens per the noise schedule.
            ratio = 1.0 * (step + 1) / timesteps
            mask_ratio = noise_schedule(torch.tensor(ratio))
            selected_probs = torch.gather(probs, -1, sampled_ids.long()[..., None]).squeeze(-1)
            # Known tokens get max confidence so they are never re-masked.
            selected_probs = torch.where(unknown_map, selected_probs, torch.finfo(selected_probs.dtype).max)
            mask_len = (num_vq_tokens * mask_ratio).floor().unsqueeze(0).to(logits.device)
            mask_len = torch.max(
                torch.tensor([1], device=logits.device),
                torch.min(unknown_map.sum(dim=-1, keepdim=True) - 1, mask_len),
            )
            # NOTE(review): reassigning `temperature` compounds the decay across
            # steps (each step multiplies the already-decayed value) — confirm
            # against the reference sampler whether this is intended.
            temperature = temperature * (1.0 - ratio)
            masking = mask_by_random_topk(mask_len, selected_probs, temperature, generator=generator)
            input_ids[:, -(num_vq_tokens + 1):-1] = torch.where(
                masking,
                mask_token_id,
                sampled_ids + len(uni_prompting.text_tokenizer) + num_new_special_tokens,
            )
            input_ids_minus_lm_vocab_size = torch.where(masking, mask_token_id, sampled_ids)

        return sampled_ids
2466
 
2467
 
2468
  AutoConfig.register("omada", OMadaConfig)
MMaDA/models/speech_tokenization/condition_style_centroid ADDED
@@ -0,0 +1 @@
 
 
1
+ /dataset/omada/AIDAS-Omni-Modal-Diffusion/_style_cache
app.py CHANGED
@@ -11,8 +11,18 @@ import os
11
  import sys
12
  import subprocess
13
  import importlib
 
 
 
 
 
 
 
 
14
  from pathlib import Path
15
  from typing import List
 
 
16
 
17
  import gradio as gr
18
  import spaces
@@ -23,6 +33,9 @@ from packaging.version import parse as parse_version
23
  # ---------------------------
24
 
25
  PROJECT_ROOT = Path(__file__).resolve().parent
 
 
 
26
  MMADA_ROOT = PROJECT_ROOT / "MMaDA"
27
  if str(MMADA_ROOT) not in sys.path:
28
  sys.path.insert(0, str(MMADA_ROOT))
@@ -135,7 +148,7 @@ def download_checkpoint() -> Path:
135
  raise FileNotFoundError(f"MODEL_CHECKPOINT_PATH does not exist: {override_path}")
136
  return override_path
137
 
138
- repo_id = os.getenv("MODEL_REPO_ID", "jaeikkim/AIDAS-Omni-Modal-Diffusion")
139
  revision = os.getenv("MODEL_REVISION", "main")
140
  token = os.getenv("HF_TOKEN")
141
  cache_dir = PROJECT_ROOT / "_ckpt_cache"
@@ -226,6 +239,323 @@ CHAT_EXAMPLES = _load_text_examples(ASSET_ROOT / "chat" / "text.txt")
226
  T2I_EXAMPLES = _load_text_examples(ASSET_ROOT / "t2i" / "text.txt")
227
  I2I_EXAMPLES = _load_i2i_examples()
228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  # audio / video / image examples
230
  S2T_EXAMPLES = _load_media_examples("s2t", {".wav", ".mp3", ".flac", ".ogg"})
231
  V2T_EXAMPLES = _load_media_examples("v2t", {".mp4", ".mov", ".avi", ".webm"})
@@ -277,15 +607,27 @@ def get_app() -> OmadaDemo:
277
 
278
  default_cfg = PROJECT_ROOT / "MMaDA" / "inference" / "demo" / "demo.yaml"
279
  legacy_cfg = PROJECT_ROOT / "MMaDA" / "configs" / "mmada_demo.yaml"
 
280
  train_config = os.getenv("TRAIN_CONFIG_PATH")
281
  if not train_config:
282
- train_config = str(default_cfg if default_cfg.exists() else legacy_cfg)
 
 
 
283
 
284
  device = os.getenv("DEVICE", "cuda")
285
  APP = OmadaDemo(train_config=train_config, checkpoint=str(ckpt_dir), device=device)
286
  return APP
287
 
288
 
 
 
 
 
 
 
 
 
289
  # ---------------------------
290
  # ZeroGPU-wrapped handlers
291
  # ---------------------------
@@ -310,37 +652,40 @@ def t2s_handler(text, max_tokens, steps, block_len, temperature, cfg_scale, gend
310
  @spaces.GPU
311
  def s2t_handler(audio_path, steps, block_len, max_tokens, remasking):
312
  app = get_app()
313
- text, status = app.run_s2t(
314
  audio_path=audio_path,
315
  steps=int(steps),
316
  block_length=int(block_len),
317
  max_new_tokens=int(max_tokens),
318
  remasking=str(remasking),
319
- )
320
- return text, status
 
321
 
322
  @spaces.GPU
323
  def v2t_handler(video, steps, block_len, max_tokens):
324
  app = get_app()
325
- text, status = app.run_v2t(
326
  video_path=video,
327
  steps=int(steps),
328
  block_length=int(block_len),
329
  max_new_tokens=int(max_tokens),
330
- )
331
- return text, status
 
332
 
333
  @spaces.GPU
334
  def chat_handler(message, max_tokens, steps, block_len, temperature):
335
  app = get_app()
336
- text, status = app.run_chat(
337
  message=message,
338
  max_new_tokens=int(max_tokens),
339
  steps=int(steps),
340
  block_length=int(block_len),
341
  temperature=float(temperature),
342
- )
343
- return text, status
 
344
 
345
  @spaces.GPU
346
  def mmu_handler(image, question, max_tokens, steps, block_len, temperature):
@@ -358,25 +703,27 @@ def mmu_handler(image, question, max_tokens, steps, block_len, temperature):
358
  @spaces.GPU
359
  def t2i_handler(prompt, timesteps, temperature, guidance):
360
  app = get_app()
361
- image, status = app.run_t2i(
362
  prompt=prompt,
363
  timesteps=int(timesteps),
364
  temperature=float(temperature),
365
  guidance_scale=float(guidance),
366
- )
367
- return image, status
 
368
 
369
  @spaces.GPU
370
  def i2i_handler(instruction, image, timesteps, temperature, guidance):
371
  app = get_app()
372
- image_out, status = app.run_i2i(
373
  instruction=instruction,
374
  source_image=image,
375
  timesteps=int(timesteps),
376
  temperature=float(temperature),
377
  guidance_scale=float(guidance),
378
- )
379
- return image_out, status
 
380
 
381
 
382
  # ---------------------------
@@ -384,255 +731,898 @@ def i2i_handler(instruction, image, timesteps, temperature, guidance):
384
  # ---------------------------
385
 
386
  theme = gr.themes.Soft(primary_hue="blue", neutral_hue="gray")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
387
 
388
  with gr.Blocks(
389
  title="AIDAS Lab @ SNU - Omni-modal Diffusion",
390
- css=CUSTOM_CSS,
391
  theme=theme,
392
  js=FORCE_LIGHT_MODE_JS,
393
  ) as demo:
394
- with gr.Row():
395
- if LOGO_PATH.exists():
396
- gr.Image(
397
- value=str(LOGO_PATH),
398
- show_label=False,
399
- height=80,
400
- interactive=False,
401
- )
402
- gr.Markdown(
403
- "## Omni-modal Diffusion Foundation Model\n"
404
- "### AIDAS Lab @ SNU"
405
- )
 
406
 
407
- # ---- T2S ----
408
- with gr.Tab("Text → Speech (T2S)"):
409
- with gr.Row():
410
- t2s_text = gr.Textbox(
411
- label="Input text",
412
- lines=4,
413
- placeholder="Type the speech you want to synthesize...",
414
- )
415
- t2s_audio = gr.Audio(label="Generated speech", type="numpy")
416
- t2s_status = gr.Textbox(label="Status", interactive=False)
417
- with gr.Accordion("Advanced settings", open=False):
418
- t2s_max_tokens = gr.Slider(2, 512, value=384, step=2, label="Speech token length")
419
- t2s_steps = gr.Slider(2, 512, value=128, step=2, label="Total refinement steps")
420
- t2s_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
421
- t2s_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
422
- t2s_cfg = gr.Slider(0.0, 6.0, value=3.5, step=0.1, label="CFG scale")
423
- with gr.Row():
424
- t2s_gender = gr.Dropdown(["random", "female", "male"], value="random", label="Gender")
425
- t2s_emotion = gr.Dropdown(["random", "angry", "happy", "neutral", "sad"], value="random", label="Emotion")
426
- with gr.Row():
427
- t2s_speed = gr.Dropdown(["random", "normal", "fast", "slow"], value="random", label="Speed")
428
- t2s_pitch = gr.Dropdown(["random", "normal", "high", "low"], value="random", label="Pitch")
429
- if T2S_EXAMPLES:
430
- with gr.Accordion("Sample prompts", open=False):
431
- gr.Examples(
432
- examples=T2S_EXAMPLES,
433
- inputs=[t2s_text],
434
- examples_per_page=6,
435
- )
436
- t2s_btn = gr.Button("Generate speech", variant="primary")
437
- t2s_btn.click(
438
- t2s_handler,
439
- inputs=[
440
- t2s_text,
441
- t2s_max_tokens,
442
- t2s_steps,
443
- t2s_block,
444
- t2s_temperature,
445
- t2s_cfg,
446
- t2s_gender,
447
- t2s_emotion,
448
- t2s_speed,
449
- t2s_pitch,
450
- ],
451
- outputs=[t2s_audio, t2s_status],
452
  )
453
 
454
- # ---- S2T ----
455
- with gr.Tab("Speech Text (S2T)"):
456
- s2t_audio_in = gr.Audio(type="filepath", label="Speech input", sources=["microphone", "upload"])
457
- s2t_text_out = gr.Textbox(label="Transcription", lines=4)
458
- s2t_status = gr.Textbox(label="Status", interactive=False)
459
- with gr.Accordion("Advanced settings", open=False):
460
- s2t_steps = gr.Slider(2, 512, value=128, step=2, label="Denoising steps")
461
- s2t_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
462
- s2t_max_tokens = gr.Slider(2, 512, value=128, step=2, label="Max new tokens")
463
- s2t_remasking = gr.Dropdown(
464
- ["low_confidence", "random"],
465
- value="low_confidence",
466
- label="Remasking strategy",
467
- )
468
- if S2T_EXAMPLES:
469
- with gr.Accordion("Sample clips", open=False):
470
- gr.Examples(
471
- examples=S2T_EXAMPLES,
472
- inputs=[s2t_audio_in],
473
- examples_per_page=4,
474
- )
475
- s2t_btn = gr.Button("Transcribe", variant="primary")
476
- s2t_btn.click(
477
- s2t_handler,
478
- inputs=[s2t_audio_in, s2t_steps, s2t_block, s2t_max_tokens, s2t_remasking],
479
- outputs=[s2t_text_out, s2t_status],
480
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
481
 
482
- # ---- V2T ----
483
- with gr.Tab("Video Text (V2T)"):
484
- v2t_video_in = gr.Video(
485
- label="Upload or record video",
486
- height=256,
487
- sources=["upload", "webcam"],
488
- )
489
- v2t_text_out = gr.Textbox(label="Caption / answer", lines=4)
490
- v2t_status = gr.Textbox(label="Status", interactive=False)
491
- with gr.Accordion("Advanced settings", open=False):
492
- v2t_steps = gr.Slider(2, 512, value=64, step=2, label="Denoising steps")
493
- v2t_block = gr.Slider(2, 512, value=64, step=2, label="Block length")
494
- v2t_max_tokens = gr.Slider(2, 512, value=64, step=2, label="Max new tokens")
495
- if V2T_EXAMPLES:
496
- with gr.Accordion("Sample videos", open=False):
497
- gr.Examples(
498
- examples=V2T_EXAMPLES,
499
- inputs=[v2t_video_in],
500
- examples_per_page=4,
501
- )
502
- v2t_btn = gr.Button("Generate caption", variant="primary")
503
- v2t_btn.click(
504
- v2t_handler,
505
- inputs=[v2t_video_in, v2t_steps, v2t_block, v2t_max_tokens],
506
- outputs=[v2t_text_out, v2t_status],
507
  )
508
 
509
- # ---- T2I ----
510
- with gr.Tab("Text Image (T2I)"):
511
- t2i_prompt = gr.Textbox(
512
- label="Prompt",
513
- lines=4,
514
- placeholder="Describe the image you want to generate...",
515
- )
516
- t2i_image_out = gr.Image(label="Generated image")
517
- t2i_status = gr.Textbox(label="Status", interactive=False)
518
- with gr.Accordion("Advanced settings", open=False):
519
- t2i_timesteps = gr.Slider(4, 128, value=32, step=2, label="Timesteps")
520
- t2i_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
521
- t2i_guidance = gr.Slider(0.0, 8.0, value=3.5, step=0.1, label="CFG scale")
522
- if T2I_EXAMPLES:
523
- with gr.Accordion("Sample prompts", open=False):
524
- gr.Examples(
525
- examples=T2I_EXAMPLES,
526
- inputs=[t2i_prompt],
527
- examples_per_page=6,
528
- )
529
- t2i_btn = gr.Button("Generate image", variant="primary")
530
- t2i_btn.click(
531
- t2i_handler,
532
- inputs=[t2i_prompt, t2i_timesteps, t2i_temperature, t2i_guidance],
533
- outputs=[t2i_image_out, t2i_status],
534
- )
535
 
536
- # ---- I2I ----
537
- with gr.Tab("Image Editing (I2I)"):
538
- i2i_image_in = gr.Image(type="pil", label="Reference image", sources=["upload"])
539
- i2i_instr = gr.Textbox(
540
- label="Editing instruction",
541
- lines=4,
542
- placeholder="Describe how you want to edit the image...",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
543
  )
544
- i2i_image_out = gr.Image(label="Edited image")
545
- i2i_status = gr.Textbox(label="Status", interactive=False)
546
- with gr.Accordion("Advanced settings", open=False):
547
- i2i_timesteps = gr.Slider(4, 128, value=32, step=2, label="Timesteps")
548
- i2i_temperature = gr.Slider(0.0, 2.0, value=0.3, step=0.05, label="Sampling temperature")
549
- i2i_guidance = gr.Slider(0.0, 8.0, value=3.5, step=0.1, label="CFG scale")
550
-
551
- if I2I_EXAMPLES:
552
- with gr.Accordion("Sample edits", open=False):
553
- gr.Examples(
554
- examples=I2I_EXAMPLES,
555
- inputs=[i2i_image_in, i2i_instr],
556
- examples_per_page=4,
557
- )
558
- i2i_btn = gr.Button("Apply edit", variant="primary")
559
- i2i_btn.click(
560
- i2i_handler,
561
- inputs=[i2i_instr, i2i_image_in, i2i_timesteps, i2i_temperature, i2i_guidance],
562
- outputs=[i2i_image_out, i2i_status],
563
  )
564
 
565
- # ---- Chat ----
566
- with gr.Tab("Text Chat"):
567
- chat_in = gr.Textbox(
568
- label="Message",
569
- lines=4,
570
- placeholder="Ask anything. The model will reply in text.",
 
 
571
  )
572
- chat_out = gr.Textbox(label="Assistant reply", lines=6)
573
- chat_status = gr.Textbox(label="Status", interactive=False)
574
- with gr.Accordion("Advanced settings", open=False):
575
- chat_max_tokens = gr.Slider(2, 512, value=64, step=2, label="Reply max tokens")
576
- chat_steps = gr.Slider(2, 512, value=64, step=2, label="Refinement steps")
577
- chat_block = gr.Slider(2, 512, value=64, step=2, label="Block length")
578
- chat_temperature_slider = gr.Slider(0.0, 2.0, value=0.8, step=0.05, label="Sampling temperature")
579
- if CHAT_EXAMPLES:
580
- with gr.Accordion("Sample prompts", open=False):
581
- gr.Examples(
582
- examples=CHAT_EXAMPLES,
583
- inputs=[chat_in],
584
- examples_per_page=6,
585
- )
586
- chat_btn = gr.Button("Send", variant="primary")
587
- chat_btn.click(
588
- chat_handler,
589
- inputs=[
590
- chat_in,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
591
  chat_max_tokens,
592
  chat_steps,
593
  chat_block,
594
- chat_temperature_slider,
595
- ],
596
- outputs=[chat_out, chat_status],
597
- )
598
-
599
- # ---- MMU ----
600
- with gr.Tab("MMU (Image → Text)"):
601
- mmu_img = gr.Image(type="pil", label="Input image", sources=["upload"])
602
- mmu_question = gr.Textbox(
603
- label="Question",
604
- lines=3,
605
- placeholder="Ask about the scene, objects, or context of the image.",
606
- )
607
- mmu_answer = gr.Textbox(label="Answer", lines=6)
608
- mmu_status = gr.Textbox(label="Status", interactive=False)
609
- with gr.Accordion("Advanced settings", open=False):
610
- mmu_max_tokens = gr.Slider(2, 512, value=256, step=2, label="Answer max tokens")
611
- mmu_steps = gr.Slider(2, 512, value=256, step=2, label="Refinement steps")
612
- mmu_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
613
- mmu_temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Sampling temperature")
614
- if MMU_EXAMPLES:
615
- with gr.Accordion("Sample MMU prompts", open=False):
616
- gr.Examples(
617
- examples=MMU_EXAMPLES,
618
- inputs=[mmu_img, mmu_question],
619
- examples_per_page=1,
620
- )
621
- mmu_btn = gr.Button("Answer about the image", variant="primary")
622
- mmu_btn.click(
623
- mmu_handler,
624
- inputs=[
625
- mmu_img,
626
- mmu_question,
627
- mmu_max_tokens,
628
- mmu_steps,
629
- mmu_block,
630
- mmu_temperature,
631
- ],
632
- outputs=[mmu_answer, mmu_status],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
633
  )
634
 
635
 
636
 
637
  if __name__ == "__main__":
638
- demo.launch()
 
11
  import sys
12
  import subprocess
13
  import importlib
14
+ import base64
15
+ import html
16
+ import io
17
+ import re
18
+ import wave
19
+ import tempfile
20
+ import shutil
21
+ from urllib.parse import quote
22
  from pathlib import Path
23
  from typing import List
24
+ import numpy as np
25
+ from PIL import Image
26
 
27
  import gradio as gr
28
  import spaces
 
33
  # ---------------------------
34
 
35
  PROJECT_ROOT = Path(__file__).resolve().parent
36
+ os.environ.setdefault("FORCE_EVAL_SETTINGS", "0")
37
+ PREVIEW_DIR = PROJECT_ROOT / "_preview_cache"
38
+ PREVIEW_DIR.mkdir(parents=True, exist_ok=True)
39
  MMADA_ROOT = PROJECT_ROOT / "MMaDA"
40
  if str(MMADA_ROOT) not in sys.path:
41
  sys.path.insert(0, str(MMADA_ROOT))
 
148
  raise FileNotFoundError(f"MODEL_CHECKPOINT_PATH does not exist: {override_path}")
149
  return override_path
150
 
151
+ repo_id = os.getenv("MODEL_REPO_ID", "snu-aidas/Dynin-Omni")
152
  revision = os.getenv("MODEL_REVISION", "main")
153
  token = os.getenv("HF_TOKEN")
154
  cache_dir = PROJECT_ROOT / "_ckpt_cache"
 
239
  T2I_EXAMPLES = _load_text_examples(ASSET_ROOT / "t2i" / "text.txt")
240
  I2I_EXAMPLES = _load_i2i_examples()
241
 
242
+
243
+ def _render_response(status: str, body_html: str = "") -> str:
244
+ safe_status = html.escape(status or "")
245
+ parts = []
246
+ if safe_status:
247
+ parts.append(f"<p class='omada-response-status'>{safe_status}</p>")
248
+ if body_html:
249
+ parts.append(body_html)
250
+ content = "".join(parts)
251
+ return f"<div class='omada-response-container'>{content}</div>"
252
+
253
+
254
+ def _render_text_message(status: str, content: str) -> str:
255
+ content = (content or "").strip()
256
+ if not content:
257
+ return _render_response(status)
258
+ safe_content = _format_tokenized_text(content)
259
+ body = f"<div class='omada-response-block'>{safe_content}</div>"
260
+ return _render_response(status, body)
261
+
262
+
263
+ def _is_mask_like_token(token: str) -> bool:
264
+ t = token.strip()
265
+ if not t:
266
+ return False
267
+ upper = t.upper()
268
+ return (
269
+ upper in {"[MASK]", "<MASK>", "<|MASK|>", "<MASK_TOKEN>", "<|MASK_TOKEN|>"}
270
+ or upper in {"<MDM_MASK>", "MDM_MASK", "<|MDM_MASK|>"}
271
+ or "MASK" in upper
272
+ )
273
+
274
+
275
+ def _is_special_token(token: str) -> bool:
276
+ t = token.strip()
277
+ return bool(t) and t.startswith("<|") and t.endswith("|>")
278
+
279
+
280
+ def _format_tokenized_text(text: str) -> str:
281
+ if not text:
282
+ return ""
283
+ # Handle both complete and partially-streamed mask tokens.
284
+ mask_pat = r"(<[^>\n]*MASK[^>\n]*>?|\[MASK\]|MASK_TOKEN)"
285
+ chunks = re.split(mask_pat, text, flags=re.IGNORECASE)
286
+ out = []
287
+ for chunk in chunks:
288
+ if not chunk:
289
+ continue
290
+ if re.fullmatch(mask_pat, chunk, flags=re.IGNORECASE) or _is_mask_like_token(chunk):
291
+ out.append("<span class='omada-token-pill omada-token-mask'>MASK</span>")
292
+ continue
293
+ if chunk.isspace():
294
+ out.append(chunk.replace("\n", "<br>"))
295
+ continue
296
+ safe = html.escape(chunk)
297
+ if _is_special_token(chunk):
298
+ out.append(f"<span class='omada-token-pill omada-token-special'>{safe}</span>")
299
+ else:
300
+ out.append(safe)
301
+ return "".join(out).replace("\n", "<br>")
302
+
303
+
304
+ def _render_audio_message(status: str, audio):
305
+ if not audio:
306
+ return _render_response(status)
307
+
308
+ sample_rate, data = audio
309
+ if data is None:
310
+ return _render_response(status)
311
+
312
+ waveform = np.asarray(data, dtype=np.float32)
313
+ if waveform.size == 0:
314
+ return _render_response(status)
315
+
316
+ if waveform.ndim == 1:
317
+ waveform = waveform[:, None]
318
+
319
+ channels = waveform.shape[1]
320
+ clipped = np.clip(waveform, -1.0, 1.0)
321
+ pcm16 = (clipped * 32767.0).astype(np.int16)
322
+
323
+ buffer = io.BytesIO()
324
+ with wave.open(buffer, "wb") as wav_writer:
325
+ wav_writer.setnchannels(channels)
326
+ wav_writer.setsampwidth(2)
327
+ wav_writer.setframerate(int(sample_rate))
328
+ wav_writer.writeframes(pcm16.tobytes())
329
+
330
+ encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
331
+ audio_tag = (
332
+ "<div class='omada-audio-block'>"
333
+ "<audio controls preload='auto' playsinline>"
334
+ f"<source src='data:audio/wav;base64,{encoded}' type='audio/wav' /></audio>"
335
+ "</div>"
336
+ )
337
+ return _render_response(status, audio_tag)
338
+
339
+
340
+ def _render_image_message(status: str, image: Image.Image):
341
+ if image is None:
342
+ return _render_response(status)
343
+
344
+ buffer = io.BytesIO()
345
+ try:
346
+ image.save(buffer, format="PNG")
347
+ except Exception:
348
+ return _render_response(status)
349
+
350
+ encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
351
+ image_html = (
352
+ "<div class='omada-response-block'>"
353
+ "<img src='data:image/png;base64,"
354
+ f"{encoded}"
355
+ "' alt='Generated image' style='max-width:100%;border-radius:12px;' />"
356
+ "</div>"
357
+ )
358
+ return _render_response(status, image_html)
359
+
360
+
361
+ def _render_user_message(mode: str, message: str, image_in, audio_in, video_in, defer_video: bool = False) -> str:
362
+ def _cache_media_copy(src_path: str) -> str:
363
+ path = str(src_path or "")
364
+ if not path or not os.path.exists(path):
365
+ return path
366
+ try:
367
+ suffix = Path(path).suffix or ""
368
+ fd, dst = tempfile.mkstemp(prefix="omada_media_", suffix=suffix, dir=str(PREVIEW_DIR))
369
+ os.close(fd)
370
+ shutil.copy2(path, dst)
371
+ return dst
372
+ except Exception:
373
+ return path
374
+
375
+ def _to_browser_mp4(video_path: str) -> str:
376
+ path = str(video_path or "")
377
+ if not path:
378
+ return path
379
+ try:
380
+ fd, out_path = tempfile.mkstemp(prefix="omada_preview_", suffix=".mp4", dir=str(PREVIEW_DIR))
381
+ os.close(fd)
382
+ cmd = [
383
+ "ffmpeg",
384
+ "-y",
385
+ "-i",
386
+ path,
387
+ "-an",
388
+ "-c:v",
389
+ "libx264",
390
+ "-pix_fmt",
391
+ "yuv420p",
392
+ "-movflags",
393
+ "+faststart",
394
+ out_path,
395
+ ]
396
+ proc = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
397
+ if proc.returncode == 0 and os.path.exists(out_path):
398
+ return out_path
399
+ if os.path.exists(out_path):
400
+ os.remove(out_path)
401
+ except Exception:
402
+ pass
403
+ return path
404
+
405
+ def _video_data_uri(video_path: str, mime: str, max_bytes: int = 25 * 1024 * 1024) -> str:
406
+ try:
407
+ size = os.path.getsize(video_path)
408
+ if size <= 0 or size > max_bytes:
409
+ return ""
410
+ with open(video_path, "rb") as f:
411
+ encoded = base64.b64encode(f.read()).decode("ascii")
412
+ return f"data:{mime};base64,{encoded}"
413
+ except Exception:
414
+ return ""
415
+
416
+ def _video_poster_data_uri(video_path: str) -> str:
417
+ try:
418
+ import cv2 # type: ignore
419
+
420
+ cap = cv2.VideoCapture(video_path)
421
+ ok, frame = cap.read()
422
+ cap.release()
423
+ if not ok or frame is None:
424
+ return ""
425
+ ok, buf = cv2.imencode(".jpg", frame)
426
+ if not ok:
427
+ return ""
428
+ encoded = base64.b64encode(buf.tobytes()).decode("ascii")
429
+ return f"data:image/jpeg;base64,{encoded}"
430
+ except Exception:
431
+ return ""
432
+
433
+ parts = []
434
+ text = (message or "").strip()
435
+ if image_in is not None:
436
+ try:
437
+ if isinstance(image_in, Image.Image):
438
+ buffer = io.BytesIO()
439
+ image_in.save(buffer, format="PNG")
440
+ encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
441
+ parts.append(
442
+ "<div class='omada-user-media'>"
443
+ f"<img src='data:image/png;base64,{encoded}' alt='Input image' />"
444
+ "</div>"
445
+ )
446
+ elif isinstance(image_in, str) and image_in:
447
+ try:
448
+ with Image.open(image_in).convert("RGB") as pil_img:
449
+ buf = io.BytesIO()
450
+ pil_img.save(buf, format="PNG")
451
+ encoded = base64.b64encode(buf.getvalue()).decode("ascii")
452
+ parts.append(
453
+ "<div class='omada-user-media'>"
454
+ f"<img src='data:image/png;base64,{encoded}' alt='Input image' />"
455
+ "</div>"
456
+ )
457
+ except Exception:
458
+ image_path = _cache_media_copy(image_in)
459
+ parts.append(
460
+ "<div class='omada-user-media'>"
461
+ f"<img src='/file={quote(image_path)}' alt='Input image' />"
462
+ "</div>"
463
+ )
464
+ except Exception:
465
+ pass
466
+
467
+ if mode == "MMU (Video → Text)" and video_in:
468
+ if defer_video:
469
+ parts.append("<div class='omada-user-media'><div class='omada-video-loading'>Video loading...</div></div>")
470
+ if text:
471
+ parts.append(f"<div>{html.escape(text)}</div>")
472
+ return "".join(parts)
473
+ video_path = None
474
+ if isinstance(video_in, str):
475
+ video_path = video_in
476
+ elif isinstance(video_in, dict):
477
+ video_path = video_in.get("path") or video_in.get("name")
478
+ if video_path:
479
+ cached_original = _cache_media_copy(video_path)
480
+ preview_path = _to_browser_mp4(cached_original)
481
+ poster = _video_poster_data_uri(cached_original)
482
+ poster_attr = f" poster='{poster}'" if poster else ""
483
+ source_path = str(preview_path or cached_original)
484
+ fallback_path = str(cached_original)
485
+ def _video_mime(path: str) -> str:
486
+ ext = os.path.splitext(path.lower())[1]
487
+ return {
488
+ ".mp4": "video/mp4",
489
+ ".webm": "video/webm",
490
+ ".mov": "video/quicktime",
491
+ ".m4v": "video/mp4",
492
+ ".avi": "video/x-msvideo",
493
+ ".mkv": "video/x-matroska",
494
+ }.get(ext, "video/mp4")
495
+ parts.append(
496
+ "<div class='omada-user-media'>"
497
+ f"<video class='omada-user-video' controls playsinline preload='metadata'{poster_attr}>"
498
+ f"<source src='{(_video_data_uri(source_path, _video_mime(source_path)) or f'/file={quote(source_path)}')}' type='{_video_mime(source_path)}' />"
499
+ f"<source src='/file={quote(fallback_path)}' type='{_video_mime(fallback_path)}' />"
500
+ f"<a href='/file={quote(fallback_path)}' target='_blank' rel='noopener'>Open video</a>"
501
+ "</video>"
502
+ "</div>"
503
+ )
504
+
505
+ if audio_in is not None:
506
+ audio_path = ""
507
+ if isinstance(audio_in, str):
508
+ audio_path = audio_in
509
+ elif isinstance(audio_in, dict):
510
+ audio_path = audio_in.get("path") or audio_in.get("name") or ""
511
+ elif isinstance(audio_in, (tuple, list)) and len(audio_in) == 2:
512
+ try:
513
+ sample_rate, data = audio_in
514
+ waveform = np.asarray(data, dtype=np.float32)
515
+ if waveform.ndim == 1:
516
+ waveform = waveform[:, None]
517
+ waveform = np.clip(waveform, -1.0, 1.0)
518
+ pcm16 = (waveform * 32767.0).astype(np.int16)
519
+ fd, temp_audio = tempfile.mkstemp(prefix="omada_user_audio_", suffix=".wav", dir=str(PREVIEW_DIR))
520
+ os.close(fd)
521
+ with wave.open(temp_audio, "wb") as wav_writer:
522
+ wav_writer.setnchannels(pcm16.shape[1])
523
+ wav_writer.setsampwidth(2)
524
+ wav_writer.setframerate(int(sample_rate))
525
+ wav_writer.writeframes(pcm16.tobytes())
526
+ audio_path = temp_audio
527
+ except Exception:
528
+ audio_path = ""
529
+ if audio_path:
530
+ ext = os.path.splitext(audio_path.lower())[1]
531
+ mime = {
532
+ ".wav": "audio/wav",
533
+ ".mp3": "audio/mpeg",
534
+ ".flac": "audio/flac",
535
+ ".ogg": "audio/ogg",
536
+ ".m4a": "audio/mp4",
537
+ }.get(ext, "audio/wav")
538
+ src = ""
539
+ try:
540
+ with open(audio_path, "rb") as f:
541
+ encoded_audio = base64.b64encode(f.read()).decode("ascii")
542
+ src = f"data:{mime};base64,{encoded_audio}"
543
+ except Exception:
544
+ audio_path = _cache_media_copy(audio_path)
545
+ src = f"/file={quote(audio_path)}"
546
+ parts.append(
547
+ "<div class='omada-user-media'>"
548
+ f"<audio controls preload='metadata'><source src='{src}' type='{mime}' /></audio>"
549
+ f"<div><a href='{src}' target='_blank' rel='noopener'>Open audio</a></div>"
550
+ "</div>"
551
+ )
552
+
553
+ if text:
554
+ parts.append(f"<div>{html.escape(text)}</div>")
555
+ if not parts:
556
+ parts.append(f"<div>[{html.escape(mode)}]</div>")
557
+ return "".join(parts)
558
+
559
  # audio / video / image examples
560
  S2T_EXAMPLES = _load_media_examples("s2t", {".wav", ".mp3", ".flac", ".ogg"})
561
  V2T_EXAMPLES = _load_media_examples("v2t", {".mp4", ".mov", ".avi", ".webm"})
 
607
 
608
  default_cfg = PROJECT_ROOT / "MMaDA" / "inference" / "demo" / "demo.yaml"
609
  legacy_cfg = PROJECT_ROOT / "MMaDA" / "configs" / "mmada_demo.yaml"
610
+ eval_cfg = Path("/dataset/omada/OMaDA/MMaDA/configs/omada_instruction_tuning2.yaml")
611
  train_config = os.getenv("TRAIN_CONFIG_PATH")
612
  if not train_config:
613
+ if eval_cfg.exists():
614
+ train_config = str(eval_cfg)
615
+ else:
616
+ train_config = str(default_cfg if default_cfg.exists() else legacy_cfg)
617
 
618
  device = os.getenv("DEVICE", "cuda")
619
  APP = OmadaDemo(train_config=train_config, checkpoint=str(ckpt_dir), device=device)
620
  return APP
621
 
622
 
623
+ def warmup_model_status() -> str:
624
+ try:
625
+ get_app()
626
+ return "Model status: Loaded. Inference is ready."
627
+ except Exception as exc:
628
+ return f"Model status: Load failed ({exc})."
629
+
630
+
631
  # ---------------------------
632
  # ZeroGPU-wrapped handlers
633
  # ---------------------------
 
652
  @spaces.GPU
653
  def s2t_handler(audio_path, steps, block_len, max_tokens, remasking):
654
  app = get_app()
655
+ for text, status in app.run_s2t_stream(
656
  audio_path=audio_path,
657
  steps=int(steps),
658
  block_length=int(block_len),
659
  max_new_tokens=int(max_tokens),
660
  remasking=str(remasking),
661
+ update_every=32,
662
+ ):
663
+ yield text, status
664
 
665
  @spaces.GPU
666
  def v2t_handler(video, steps, block_len, max_tokens):
667
  app = get_app()
668
+ for text, status in app.run_v2t_stream(
669
  video_path=video,
670
  steps=int(steps),
671
  block_length=int(block_len),
672
  max_new_tokens=int(max_tokens),
673
+ update_every=32,
674
+ ):
675
+ yield text, status
676
 
677
  @spaces.GPU
678
  def chat_handler(message, max_tokens, steps, block_len, temperature):
679
  app = get_app()
680
+ for reply_html, status, done in app.run_chat_stream(
681
  message=message,
682
  max_new_tokens=int(max_tokens),
683
  steps=int(steps),
684
  block_length=int(block_len),
685
  temperature=float(temperature),
686
+ update_every=32,
687
+ ):
688
+ yield reply_html, status
689
 
690
  @spaces.GPU
691
  def mmu_handler(image, question, max_tokens, steps, block_len, temperature):
 
703
  @spaces.GPU
704
  def t2i_handler(prompt, timesteps, temperature, guidance):
705
  app = get_app()
706
+ for image, status in app.run_t2i_stream(
707
  prompt=prompt,
708
  timesteps=int(timesteps),
709
  temperature=float(temperature),
710
  guidance_scale=float(guidance),
711
+ update_every=2,
712
+ ):
713
+ yield image, status
714
 
715
  @spaces.GPU
716
  def i2i_handler(instruction, image, timesteps, temperature, guidance):
717
  app = get_app()
718
+ for image_out, status in app.run_i2i_stream(
719
  instruction=instruction,
720
  source_image=image,
721
  timesteps=int(timesteps),
722
  temperature=float(temperature),
723
  guidance_scale=float(guidance),
724
+ update_every=2,
725
+ ):
726
+ yield image_out, status
727
 
728
 
729
  # ---------------------------
 
731
  # ---------------------------
732
 
733
  theme = gr.themes.Soft(primary_hue="blue", neutral_hue="gray")
734
+ EXTRA_CSS = """
735
+ html, body, .gradio-container {
736
+ background: var(--omada-surface) !important;
737
+ color: var(--omada-text-primary) !important;
738
+ }
739
+ .omada-shell {
740
+ min-height: 0;
741
+ display: flex;
742
+ flex-direction: column;
743
+ padding-bottom: 6px;
744
+ }
745
+ .omada-sample-row {
746
+ gap: 10px !important;
747
+ justify-content: center !important;
748
+ margin-bottom: 6px;
749
+ }
750
+ .omada-sample-row .gradio-button {
751
+ max-width: 280px !important;
752
+ }
753
+ .omada-hero {
754
+ text-align: center;
755
+ margin: 40px 0 24px 0;
756
+ }
757
+ .omada-hero h2 {
758
+ font-size: 2.2rem;
759
+ margin: 0;
760
+ color: var(--omada-dark-text);
761
+ }
762
+ .omada-hero p {
763
+ margin: 10px 0 0 0;
764
+ color: var(--omada-dark-muted);
765
+ }
766
+ .omada-input-row {
767
+ gap: 6px !important;
768
+ align-items: center !important;
769
+ display: flex !important;
770
+ flex-direction: row !important;
771
+ justify-content: center !important;
772
+ position: relative !important;
773
+ inset: auto !important;
774
+ top: auto !important;
775
+ right: auto !important;
776
+ bottom: auto !important;
777
+ left: auto !important;
778
+ transform: none !important;
779
+ background: var(--omada-surface-alt);
780
+ padding: 6px 14px;
781
+ border-radius: 999px;
782
+ z-index: 5;
783
+ width: min(980px, calc(100vw - 24px));
784
+ margin: 4px auto 8px;
785
+ box-shadow: 0 8px 24px rgba(0,0,0,0.08);
786
+ box-sizing: border-box;
787
+ }
788
+ .omada-input-row > * {
789
+ min-width: 0 !important;
790
+ margin: 0 !important;
791
+ align-self: center !important;
792
+ background: transparent !important;
793
+ box-shadow: none !important;
794
+ border: none !important;
795
+ }
796
+ .omada-input-row .gradio-textbox textarea {
797
+ background: var(--omada-surface) !important;
798
+ color: var(--omada-text-primary) !important;
799
+ border-radius: 999px !important;
800
+ border: 1px solid var(--omada-border) !important;
801
+ padding: 6px 10px !important;
802
+ min-height: 36px !important;
803
+ }
804
+ .omada-plus-btn button,
805
+ .omada-send-btn button {
806
+ border-radius: 999px !important;
807
+ width: 36px !important;
808
+ min-width: 36px !important;
809
+ height: 36px !important;
810
+ background: var(--omada-surface) !important;
811
+ color: var(--omada-text-primary) !important;
812
+ border: 1px solid var(--omada-border) !important;
813
+ padding: 0 !important;
814
+ font-size: 1.2rem !important;
815
+ line-height: 1 !important;
816
+ }
817
+ .omada-plus-btn,
818
+ .omada-send-btn {
819
+ flex: 0 0 36px !important;
820
+ display: flex !important;
821
+ align-items: center !important;
822
+ justify-content: center !important;
823
+ }
824
+ .omada-auto {
825
+ width: 110px !important;
826
+ flex: 0 0 110px !important;
827
+ display: flex !important;
828
+ align-items: center !important;
829
+ }
830
+ .omada-auto select {
831
+ height: 36px !important;
832
+ min-height: 36px !important;
833
+ font-size: 0.95rem !important;
834
+ padding: 0 12px !important;
835
+ background: var(--omada-surface) !important;
836
+ border: 1px solid var(--omada-border) !important;
837
+ color: var(--omada-text-primary) !important;
838
+ border-radius: 999px !important;
839
+ appearance: none !important;
840
+ -webkit-appearance: none !important;
841
+ -moz-appearance: none !important;
842
+ background-image: none !important;
843
+ }
844
+ .omada-auto svg,
845
+ .omada-auto .wrap > svg,
846
+ .omada-auto .dropdown-arrow {
847
+ display: none !important;
848
+ }
849
+ .omada-plus-btn button,
850
+ .omada-send-btn button {
851
+ flex: 0 0 auto !important;
852
+ }
853
+ .omada-input-row .gradio-textbox {
854
+ width: 100% !important;
855
+ flex: 1 1 auto !important;
856
+ min-width: 0 !important;
857
+ opacity: 1 !important;
858
+ pointer-events: auto !important;
859
+ background: transparent !important;
860
+ border: none !important;
861
+ box-shadow: none !important;
862
+ }
863
+ .omada-input-row .gradio-textbox > div,
864
+ .omada-input-row .gradio-dropdown,
865
+ .omada-input-row .gradio-dropdown > div,
866
+ .omada-plus-btn,
867
+ .omada-send-btn,
868
+ .omada-auto {
869
+ background: transparent !important;
870
+ border: none !important;
871
+ box-shadow: none !important;
872
+ }
873
+ .omada-send-btn {
874
+ margin-left: -2px !important;
875
+ }
876
+ .omada-input-row .gradio-textbox textarea {
877
+ width: 100% !important;
878
+ display: block !important;
879
+ pointer-events: auto !important;
880
+ opacity: 1 !important;
881
+ cursor: text !important;
882
+ }
883
+ .omada-panel-backdrop {
884
+ display: none !important;
885
+ }
886
+ .omada-panel {
887
+ position: relative !important;
888
+ top: auto !important;
889
+ left: auto !important;
890
+ transform: none !important;
891
+ max-height: none !important;
892
+ overflow: visible !important;
893
+ width: min(980px, calc(100vw - 24px));
894
+ margin: 0 auto 14px auto;
895
+ box-shadow: 0 20px 60px rgba(0,0,0,0.12);
896
+ z-index: 9999;
897
+ pointer-events: auto !important;
898
+ isolation: isolate;
899
+ }
900
+ .omada-controls-safe {
901
+ width: min(980px, calc(100vw - 24px));
902
+ margin: 0 auto 6px auto;
903
+ }
904
+ .omada-panel * {
905
+ pointer-events: auto;
906
+ }
907
+ .omada-panel input,
908
+ .omada-panel select,
909
+ .omada-panel textarea,
910
+ .omada-panel button,
911
+ .omada-panel .gradio-slider,
912
+ .omada-panel .gradio-slider * {
913
+ pointer-events: auto !important;
914
+ }
915
+ .omada-panel .gradio-radio,
916
+ .omada-panel .gradio-radio label,
917
+ .omada-panel .gradio-radio input {
918
+ pointer-events: auto !important;
919
+ cursor: pointer !important;
920
+ }
921
+ .omada-panel .gradio-radio {
922
+ position: relative !important;
923
+ z-index: 300 !important;
924
+ }
925
+ .omada-panel .gradio-slider,
926
+ .omada-panel .gradio-slider .wrap,
927
+ .omada-panel .gradio-slider .wrap-inner,
928
+ .omada-panel .gradio-slider input[type="range"],
929
+ .omada-panel .gradio-slider input[type="number"],
930
+ .omada-panel .gradio-dropdown,
931
+ .omada-panel .gradio-dropdown select,
932
+ .omada-panel .gradio-textbox textarea {
933
+ pointer-events: auto !important;
934
+ position: relative !important;
935
+ z-index: 400 !important;
936
+ }
937
+ .omada-panel .gradio-slider input[type="range"] {
938
+ touch-action: pan-x !important;
939
+ }
940
+ .omada-panel .gradio-dropdown,
941
+ .omada-panel .gradio-dropdown .wrap {
942
+ z-index: 1000 !important;
943
+ }
944
+ .gradio-dropdown .options,
945
+ .gradio-dropdown .wrap .options {
946
+ z-index: 2000 !important;
947
+ }
948
+ .gradio-container .input-status,
949
+ .gradio-container .status,
950
+ .gradio-container .status-dot,
951
+ .gradio-container .status-indicator,
952
+ .gradio-container .label-wrap .status,
953
+ .gradio-container .label-wrap .status-dot {
954
+ display: none !important;
955
+ }
956
+ .omada-chatbot {
957
+ background: transparent !important;
958
+ border: none !important;
959
+ }
960
+ .gradio-chatbot .message {
961
+ border-radius: 18px !important;
962
+ }
963
+ .gradio-chatbot .message.user {
964
+ margin-left: auto !important;
965
+ background: #2e3037 !important;
966
+ color: var(--omada-text-primary) !important;
967
+ pointer-events: auto !important;
968
+ }
969
+ .gradio-chatbot .message.bot {
970
+ margin-right: auto !important;
971
+ background: #22242a !important;
972
+ color: var(--omada-text-primary) !important;
973
+ pointer-events: auto !important;
974
+ }
975
+ .gradio-chatbot .message.user *,
976
+ .gradio-chatbot .message.bot * {
977
+ pointer-events: auto !important;
978
+ }
979
+ .omada-panel {
980
+ background: var(--omada-dark-panel);
981
+ border: 1px solid var(--omada-dark-border);
982
+ border-radius: 16px;
983
+ padding: 16px;
984
+ }
985
+ .omada-chip button {
986
+ border-radius: 999px !important;
987
+ background: linear-gradient(160deg, rgba(255,255,255,0.62), rgba(255,255,255,0.36)) !important;
988
+ color: #22324a !important;
989
+ border: 1px solid rgba(255,255,255,0.72) !important;
990
+ font-size: 0.68rem !important;
991
+ line-height: 1.2 !important;
992
+ padding: 6px 10px !important;
993
+ backdrop-filter: blur(14px) saturate(165%);
994
+ -webkit-backdrop-filter: blur(14px) saturate(165%);
995
+ box-shadow: 0 8px 20px rgba(36, 56, 92, 0.16) !important;
996
+ }
997
+ .omada-sample-row .gradio-button,
998
+ .omada-sample-row .gradio-button > div,
999
+ .omada-sample-row .gradio-button > button {
1000
+ background: transparent !important;
1001
+ }
1002
+ .omada-chip button:hover {
1003
+ transform: translateY(-1px);
1004
+ background: linear-gradient(160deg, rgba(255,255,255,0.74), rgba(255,255,255,0.44)) !important;
1005
+ }
1006
+ .omada-video-loading {
1007
+ width: 360px;
1008
+ max-width: min(42vw, 360px);
1009
+ min-height: 64px;
1010
+ border-radius: 12px;
1011
+ border: 1px solid var(--omada-glass-border);
1012
+ background: rgba(255,255,255,0.35);
1013
+ display: flex;
1014
+ align-items: center;
1015
+ justify-content: center;
1016
+ font-size: 0.9rem;
1017
+ color: #304463;
1018
+ backdrop-filter: blur(10px) saturate(150%);
1019
+ -webkit-backdrop-filter: blur(10px) saturate(150%);
1020
+ }
1021
+ .omada-user-media {
1022
+ margin-bottom: 6px;
1023
+ }
1024
+ .omada-user-media img,
1025
+ .omada-user-media video {
1026
+ max-width: 240px;
1027
+ width: 240px;
1028
+ max-height: 180px;
1029
+ object-fit: contain;
1030
+ border-radius: 10px;
1031
+ border: 1px solid var(--omada-border);
1032
+ display: block;
1033
+ }
1034
+ .omada-user-media .omada-user-video {
1035
+ width: 360px;
1036
+ max-width: min(42vw, 360px);
1037
+ max-height: 240px;
1038
+ }
1039
+ .omada-user-media audio {
1040
+ width: 360px;
1041
+ max-width: min(42vw, 360px);
1042
+ display: block;
1043
+ }
1044
+ .omada-response-status {
1045
+ color: var(--omada-dark-muted) !important;
1046
+ }
1047
+ .omada-token-pill {
1048
+ display: inline-block;
1049
+ padding: 1px 8px;
1050
+ margin: 1px 2px;
1051
+ border-radius: 999px;
1052
+ border: 1px solid var(--omada-border);
1053
+ font-size: 0.82em;
1054
+ line-height: 1.6;
1055
+ vertical-align: baseline;
1056
+ background: #f7f8fa;
1057
+ }
1058
+ .omada-token-mask {
1059
+ border-color: #8da2c6;
1060
+ background: #eef3ff;
1061
+ color: #1f3d7a;
1062
+ font-weight: 600;
1063
+ }
1064
+ .omada-token-special {
1065
+ border-color: #c5ccd8;
1066
+ background: #f3f4f7;
1067
+ color: #4b5563;
1068
+ }
1069
+ /* Apple-like glass look */
1070
+ :root {
1071
+ --omada-glass-bg: rgba(255, 255, 255, 0.22);
1072
+ --omada-glass-strong: rgba(255, 255, 255, 0.32);
1073
+ --omada-glass-border: rgba(255, 255, 255, 0.58);
1074
+ --omada-glass-shadow: 0 20px 52px rgba(31, 38, 70, 0.14);
1075
+ }
1076
+ html, body, .gradio-container {
1077
+ background:
1078
+ radial-gradient(1200px 500px at 10% -10%, rgba(255,255,255,0.80), rgba(255,255,255,0.30) 45%, rgba(245,247,251,0.92) 100%),
1079
+ linear-gradient(135deg, #edf1f7 0%, #e7ecf3 45%, #eff3f8 100%) !important;
1080
+ }
1081
+ .omada-input-row,
1082
+ .omada-controls-safe,
1083
+ .omada-panel,
1084
+ .gradio-chatbot .message,
1085
+ .omada-chip button,
1086
+ .omada-input-row .gradio-textbox textarea,
1087
+ .omada-plus-btn button,
1088
+ .omada-send-btn button,
1089
+ .omada-auto select {
1090
+ background: var(--omada-glass-bg) !important;
1091
+ border: 1px solid var(--omada-glass-border) !important;
1092
+ box-shadow: var(--omada-glass-shadow) !important;
1093
+ backdrop-filter: blur(22px) saturate(175%);
1094
+ -webkit-backdrop-filter: blur(22px) saturate(175%);
1095
+ }
1096
+ .omada-controls-safe {
1097
+ padding: 14px 16px !important;
1098
+ border-radius: 28px !important;
1099
+ margin: 10px auto 10px auto !important;
1100
+ }
1101
+ .omada-controls-safe > div {
1102
+ padding: 10px 12px !important;
1103
+ border-radius: 22px !important;
1104
+ }
1105
+ .omada-controls-safe .gradio-button,
1106
+ .omada-controls-safe button,
1107
+ .omada-controls-safe .gradio-dropdown,
1108
+ .omada-controls-safe .gradio-textbox,
1109
+ .omada-controls-safe .gradio-slider {
1110
+ border-radius: 16px !important;
1111
+ }
1112
+ .omada-controls-safe .gradio-button {
1113
+ border: 1px solid var(--omada-glass-border) !important;
1114
+ }
1115
+ .gradio-chatbot .message.user {
1116
+ background: var(--omada-glass-strong) !important;
1117
+ color: #1f2937 !important;
1118
+ }
1119
+ .gradio-chatbot .message.bot {
1120
+ background: rgba(255, 255, 255, 0.50) !important;
1121
+ color: #1f2937 !important;
1122
+ }
1123
+ .omada-chip button {
1124
+ color: #273247 !important;
1125
+ }
1126
+ .omada-panel {
1127
+ border-radius: 28px !important;
1128
+ padding: 20px !important;
1129
+ }
1130
+ .omada-input-row {
1131
+ border-radius: 999px !important;
1132
+ }
1133
+ """
1134
 
1135
# Build the Gradio UI. Styling comes from the two concatenated CSS strings; the
# js snippet (FORCE_LIGHT_MODE_JS) is presumably forcing a light theme — TODO confirm.
with gr.Blocks(
    title="AIDAS Lab @ SNU - Omni-modal Diffusion",
    css=CUSTOM_CSS + EXTRA_CSS,
    theme=theme,
    js=FORCE_LIGHT_MODE_JS,
) as demo:
    # Hidden status line; demo.load triggers model warmup once the page opens
    # and writes the resulting status text into it.
    model_status = gr.Markdown("Model status: Loading model...", visible=False)
    demo.load(warmup_model_status, outputs=[model_status])

    # Task labels rendered as selector buttons in the settings panel. These
    # exact strings are compared against elsewhere (mode dispatch), so do not
    # edit them without updating every comparison site.
    MODE_OPTIONS = [
        "Chat",
        "MMU (Image → Text)",
        "MMU (Video → Text)",
        "Image Generation",
        "Image Editing",
        "ASR",
        "TTS",
    ]
1154
 
1155
+ with gr.Column(elem_classes=["omada-shell"]):
1156
+ chatbox = gr.Chatbot(
1157
+ label=None,
1158
+ height=850,
1159
+ sanitize_html=False,
1160
+ bubble_full_width=False,
1161
+ elem_classes=["omada-chatbot"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1162
  )
1163
 
1164
+ sample_prompts = [ex[0] for ex in CHAT_EXAMPLES[:3]] if CHAT_EXAMPLES else []
1165
+ sample_state = gr.State((sample_prompts + ["", "", ""])[:3])
1166
+ sample_payloads = gr.State(
1167
+ ([{"text": p, "image": None, "audio": None, "video": None} for p in sample_prompts] + [{"text": "", "image": None, "audio": None, "video": None}] * 3)[:3]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1168
  )
1169
# Row of up to three "sample prompt" chips. Buttons with an empty label are
# created hidden; _update_mode later swaps their labels/visibility per task.
with gr.Row(elem_classes=["omada-sample-row"]):
    sample_buttons = []
    for i in range(3):
        label = sample_prompts[i] if i < len(sample_prompts) else ""
        sample_buttons.append(gr.Button(label, elem_classes=["omada-chip"], visible=bool(label)))

# Pill-shaped composer row: "+" opens the settings panel, the dropdown toggles
# Auto vs Custom generation settings, "↑" submits the message.
with gr.Row(elem_classes=["omada-input-row"]):
    plus_btn = gr.Button("+", elem_classes=["omada-plus-btn"], scale=1, min_width=36)
    chat_input = gr.Textbox(
        show_label=False,
        placeholder="How can I help you today?",
        lines=1,
        scale=12,
        min_width=0,
    )
    auto_dropdown = gr.Dropdown(
        ["Auto", "Custom"],
        value="Auto",
        show_label=False,
        elem_classes=["omada-auto"],
        scale=2,
        min_width=0,
    )
    send_button = gr.Button("↑", elem_classes=["omada-send-btn"], scale=1, min_width=36)
1193
+
1194
# Collapsible "Task Settings" panel. controls_visible mirrors the panel's
# visibility so handlers can return a consistent (panel, flag) pair.
controls_visible = gr.State(False)
# Backdrop element exists for CSS targeting only; it is hidden by the stylesheet.
backdrop = gr.HTML("<div></div>", visible=False, elem_classes=["omada-panel-backdrop"])
controls_panel = gr.Column(visible=False, elem_classes=["omada-controls-safe"])
with controls_panel:
    gr.Markdown("**Task Settings**")
    # mode_selector is a State (not a visible component); task buttons write into it.
    mode_selector = gr.State("Chat")
    selected_task_label = gr.Markdown("Selected task: `Chat`")
    with gr.Row():
        task_buttons = [gr.Button(option, size="sm") for option in MODE_OPTIONS]
    # Media inputs are created hidden; _update_mode reveals the one(s) the
    # selected task needs.
    media_image = gr.Image(type="pil", label="Image", sources=["upload"], visible=False)
    media_audio = gr.Audio(type="filepath", label="Audio", sources=["microphone", "upload"], visible=False)
    media_video = gr.Video(label="Video", sources=["upload", "webcam"], visible=False)

    # Alias kept for readability at call sites; same component as the dropdown.
    auto_mode = auto_dropdown

    # One advanced-settings column per task; all start hidden and are toggled
    # by _update_advanced when the dropdown is set to "Custom".
    adv_chat = gr.Column(visible=False)
    with adv_chat:
        chat_max_tokens = gr.Slider(2, 512, value=512, step=2, label="Chat max tokens", interactive=True)
        chat_steps = gr.Slider(2, 512, value=512, step=2, label="Chat steps", interactive=True)
        chat_block = gr.Slider(2, 512, value=16, step=2, label="Chat block length", interactive=True)
        chat_temperature_slider = gr.Slider(0.0, 2.0, value=0.0, step=0.05, label="Chat temperature", interactive=True)

    adv_t2s = gr.Column(visible=False)
    with adv_t2s:
        t2s_max_tokens = gr.Slider(2, 512, value=384, step=2, label="Speech token length", interactive=True)
        t2s_steps = gr.Slider(2, 512, value=128, step=2, label="T2S refinement steps", interactive=True)
        t2s_block = gr.Slider(2, 512, value=128, step=2, label="T2S block length", interactive=True)
        t2s_temperature = gr.Slider(0.0, 2.0, value=0.0, step=0.05, label="T2S temperature", interactive=True)
        t2s_cfg = gr.Slider(0.0, 6.0, value=3.5, step=0.1, label="T2S CFG scale", interactive=True)
        t2s_gender = gr.Dropdown(["random", "female", "male"], value="random", label="T2S gender", interactive=True)
        t2s_emotion = gr.Dropdown(["random", "angry", "happy", "neutral", "sad"], value="random", label="T2S emotion", interactive=True)
        t2s_speed = gr.Dropdown(["random", "normal", "fast", "slow"], value="random", label="T2S speed", interactive=True)
        t2s_pitch = gr.Dropdown(["random", "normal", "high", "low"], value="random", label="T2S pitch", interactive=True)

    adv_s2t = gr.Column(visible=False)
    with adv_s2t:
        s2t_steps = gr.Slider(2, 512, value=128, step=2, label="S2T steps", interactive=True)
        s2t_block = gr.Slider(2, 512, value=16, step=2, label="S2T block length", interactive=True)
        s2t_max_tokens = gr.Slider(2, 512, value=128, step=2, label="S2T max tokens", interactive=True)
        s2t_remasking = gr.Dropdown(["low_confidence", "random"], value="low_confidence", label="S2T remasking", interactive=True)

    adv_v2t = gr.Column(visible=False)
    with adv_v2t:
        v2t_steps = gr.Slider(2, 512, value=256, step=2, label="V2T steps", interactive=True)
        v2t_block = gr.Slider(2, 512, value=16, step=2, label="V2T block length", interactive=True)
        v2t_max_tokens = gr.Slider(2, 512, value=256, step=2, label="V2T max tokens", interactive=True)

    adv_t2i = gr.Column(visible=False)
    with adv_t2i:
        t2i_timesteps = gr.Slider(4, 128, value=16, step=2, label="T2I timesteps", interactive=True)
        t2i_temperature = gr.Slider(0.0, 2.0, value=0.0, step=0.05, label="T2I temperature", interactive=True)
        t2i_guidance = gr.Slider(0.0, 8.0, value=2.5, step=0.1, label="T2I CFG scale", interactive=True)

    adv_i2i = gr.Column(visible=False)
    with adv_i2i:
        i2i_timesteps = gr.Slider(4, 128, value=64, step=2, label="I2I timesteps", interactive=True)
        i2i_temperature = gr.Slider(0.0, 2.0, value=0.0, step=0.05, label="I2I temperature", interactive=True)
        i2i_guidance = gr.Slider(0.0, 8.0, value=2.5, step=0.1, label="I2I CFG scale", interactive=True)

    adv_mmu = gr.Column(visible=False)
    with adv_mmu:
        mmu_max_tokens = gr.Slider(2, 512, value=128, step=2, label="MMU max tokens", interactive=True)
        mmu_steps = gr.Slider(2, 512, value=128, step=2, label="MMU steps", interactive=True)
        mmu_block = gr.Slider(2, 512, value=16, step=2, label="MMU block length", interactive=True)
        mmu_temperature = gr.Slider(0.0, 2.0, value=0.0, step=0.05, label="MMU temperature", interactive=True)

    # Applies the panel's selections and closes it (see _save_controls).
    save_btn = gr.Button("Save", variant="primary")
1261
+
1262
def _open_controls(auto_mode, mode):
    """Open the settings panel and sync the advanced sections to the current task."""
    panel_updates = _update_advanced(mode, auto_mode)
    return (gr.update(visible=True), True) + tuple(panel_updates)

plus_btn.click(
    _open_controls,
    inputs=[auto_dropdown, mode_selector],
    outputs=[controls_panel, controls_visible, adv_chat, adv_t2s, adv_s2t, adv_v2t, adv_t2i, adv_i2i, adv_mmu],
)
1271
 
1272
def _update_advanced(mode, auto_mode):
    """Return visibility updates for the seven advanced-settings columns.

    Panels are only ever shown in "Custom" mode, and at most the one matching
    the selected task is visible. The tuple order must match the outputs wiring
    (adv_chat, adv_t2s, adv_s2t, adv_v2t, adv_t2i, adv_i2i, adv_mmu).
    NOTE(review): reconstructed from a diff with elided context; the seven-element
    count matches the seven adv_* outputs at every call site — confirm against the
    original file.
    """
    show = auto_mode == "Custom"
    return (
        gr.update(visible=show and mode == "Chat"),
        gr.update(visible=show and mode == "TTS"),
        gr.update(visible=show and mode == "ASR"),
        gr.update(visible=show and mode == "MMU (Video → Text)"),
        gr.update(visible=show and mode == "Image Generation"),
        gr.update(visible=show and mode == "Image Editing"),
        gr.update(visible=show and mode == "MMU (Image → Text)"),
    )
1283
 
1284
def _handle_custom(auto_mode, current_visible, mode):
    """React to the Auto/Custom toggle: keep the panel open, toggle advanced sections.

    ``current_visible`` is wired in but not consulted; the panel is always shown.
    """
    if auto_mode == "Custom":
        section_updates = _update_advanced(mode, auto_mode)
    else:
        # Auto -> show only task selector, hide advanced panels
        section_updates = tuple(gr.update(visible=False) for _ in range(7))
    return (gr.update(visible=True), True, *section_updates)

auto_dropdown.change(
    _handle_custom,
    inputs=[auto_dropdown, controls_visible, mode_selector],
    outputs=[controls_panel, controls_visible, adv_chat, adv_t2s, adv_s2t, adv_v2t, adv_t2i, adv_i2i, adv_mmu],
)
 
 
 
 
 
 
 
 
 
 
 
 
 
1297
 
1298
def _update_mode(mode):
    """Build the component updates that configure the UI for a given task.

    Returns a 9-tuple, in the order expected by _save_controls' outputs:
    (image visibility, audio visibility, video visibility, textbox placeholder,
    sample label list, sample payload list, three sample-button updates).
    """
    # Which media upload widgets the task needs.
    show_image = mode in {"Image Editing", "MMU (Image → Text)"}
    show_audio = mode in {"ASR"}
    show_video = mode in {"MMU (Video → Text)"}
    placeholders = {
        "Chat": "How can I help you today?",
        "TTS": "Type the speech you want to synthesize...",
        "ASR": "Upload audio, then add notes here...",
        "MMU (Video → Text)": "Upload video, then add notes here...",
        "Image Generation": "Describe the image you want to generate...",
        "Image Editing": "Describe how you want to edit the image...",
        "MMU (Image → Text)": "Ask about the uploaded image...",
    }
    # Per-task sample payloads drawn from the module-level example lists.
    # Each payload carries whatever the task consumes (text and/or a media path).
    payloads = []
    if mode == "Chat":
        payloads = [{"text": ex[0], "image": None, "audio": None, "video": None} for ex in CHAT_EXAMPLES[:3]]
    elif mode == "TTS":
        payloads = [{"text": ex[0], "image": None, "audio": None, "video": None} for ex in T2S_EXAMPLES[:3]]
    elif mode == "Image Generation":
        payloads = [{"text": ex[0], "image": None, "audio": None, "video": None} for ex in T2I_EXAMPLES[:3]]
    elif mode == "Image Editing":
        # I2I examples are (image, instruction) pairs.
        payloads = [{"text": ex[1], "image": ex[0], "audio": None, "video": None} for ex in I2I_EXAMPLES[:3]]
    elif mode == "MMU (Video → Text)":
        payloads = [{"text": "", "image": None, "audio": None, "video": ex[0]} for ex in V2T_EXAMPLES[:3]]
    elif mode == "ASR":
        payloads = [{"text": "", "image": None, "audio": ex[0], "video": None} for ex in S2T_EXAMPLES[:3]]
    elif mode == "MMU (Image → Text)":
        # MMU examples are (image, question) pairs.
        payloads = [{"text": ex[1], "image": ex[0], "audio": None, "video": None} for ex in MMU_EXAMPLES[:3]]
    # Pad to exactly three slots so the three sample buttons always have data.
    payloads = (payloads + [{"text": "", "image": None, "audio": None, "video": None}] * 3)[:3]
    # Chip label: the text prompt, or the media file's basename when text is empty.
    samples = [p.get("text", "") or os.path.basename(p.get("video") or p.get("audio") or p.get("image") or "") for p in payloads]
    return (
        gr.update(visible=show_image),
        gr.update(visible=show_audio),
        gr.update(visible=show_video),
        gr.update(placeholder=placeholders.get(mode, "How can I help you today?")),
        [s for s in samples],
        payloads,
        gr.update(value=samples[0], visible=bool(samples[0])),
        gr.update(value=samples[1], visible=bool(samples[1])),
        gr.update(value=samples[2], visible=bool(samples[2])),
    )
1339
# NOTE(review): the return value is discarded; _update_mode only builds
# gr.update objects, which take effect when returned from an event handler,
# so this call looks like a no-op — confirm intent.
_update_mode("Chat")
def _pick_mode(choice, auto_mode):
    """Record the chosen task, refresh the label and advanced-panel visibility."""
    adv_updates = _update_advanced(choice, auto_mode)
    return choice, f"Selected task: `{choice}`", *adv_updates

for idx, task_btn in enumerate(task_buttons):
    task_btn.click(
        # choice is bound at definition time via a default argument so each
        # button keeps its own option (avoids the late-binding closure pitfall).
        lambda auto_mode, choice=MODE_OPTIONS[idx]: _pick_mode(choice, auto_mode),
        inputs=[auto_dropdown],
        outputs=[mode_selector, selected_task_label, adv_chat, adv_t2s, adv_s2t, adv_v2t, adv_t2i, adv_i2i, adv_mmu],
    )
1350
 
1351
def _save_controls(mode, auto_mode):
    """Apply the panel's selections and close it.

    Returns (panel hidden, visibility flag False) followed by the nine
    _update_mode values and the seven _update_advanced values, matching the
    outputs list of save_btn.click below.
    """
    mode_updates = _update_mode(mode)
    adv_updates = _update_advanced(mode, auto_mode)
    return (
        gr.update(visible=False),
        False,
        *mode_updates,
        *adv_updates,
    )

save_btn.click(
    _save_controls,
    inputs=[mode_selector, auto_dropdown],
    outputs=[
        controls_panel,
        controls_visible,
        media_image,
        media_audio,
        media_video,
        chat_input,
        sample_state,
        sample_payloads,
        *sample_buttons,
        adv_chat,
        adv_t2s,
        adv_s2t,
        adv_v2t,
        adv_t2i,
        adv_i2i,
        adv_mmu,
    ],
)
1383
+
1384
+
1385
+ def _format_user_message(msg: str) -> str:
1386
+ return msg.strip() if msg else " "
1387
+
1388
def _chat_handler(
    history,
    message,
    mode,
    auto_mode,
    image_in,
    audio_in,
    video_in,
    chat_max_tokens,
    chat_steps,
    chat_block,
    chat_temperature,
    t2s_max_tokens,
    t2s_steps,
    t2s_block,
    t2s_temperature,
    t2s_cfg,
    t2s_gender,
    t2s_emotion,
    t2s_speed,
    t2s_pitch,
    s2t_steps,
    s2t_block,
    s2t_max_tokens,
    s2t_remasking,
    v2t_steps,
    v2t_block,
    v2t_max_tokens,
    t2i_timesteps,
    t2i_temperature,
    t2i_guidance,
    i2i_timesteps,
    i2i_temperature,
    i2i_guidance,
    mmu_max_tokens,
    mmu_steps,
    mmu_block,
    mmu_temperature,
):
    """Generator handler for submit: dispatch on `mode`, streaming chat history.

    Yields (history, "") pairs; the empty string clears the input textbox.
    Each branch validates its required inputs, then either streams partial
    results from the app or renders a single final message.
    """
    history = history or []
    message = (message or "").strip()
    # For video MMU, render the user bubble without the (potentially slow to
    # embed) video first, then re-render it with the video on the next yield.
    defer_video = mode == "MMU (Video → Text)" and bool(video_in)
    display_user = _render_user_message(mode, message, image_in, audio_in, video_in, defer_video=defer_video)
    history.append((display_user, _render_text_message("Generating...", "")))
    yield history, ""

    if defer_video:
        display_user = _render_user_message(mode, message, image_in, audio_in, video_in, defer_video=False)
        history[-1] = (display_user, history[-1][1])
        yield history, ""

    app = get_app()
    # Respect UI mode: Auto uses eval-matched defaults, Custom uses UI values.
    app.force_eval_settings = str(auto_mode).strip().lower() == "auto"

    if mode == "Chat":
        # Streaming text chat; `done` flag from the stream is unused here.
        for reply_html, status, done in app.run_chat_stream(
            message,
            chat_max_tokens,
            chat_steps,
            chat_block,
            chat_temperature,
            update_every=64,
        ):
            response = _render_response(status, reply_html)
            history[-1] = (display_user, response)
            yield history, ""
        return

    if mode == "TTS":
        if not message:
            history[-1] = (display_user, _render_text_message("Please type some text.", ""))
            yield history, ""
            return
        # Text-to-speech is non-streaming: one synthesis call, one final bubble.
        audio, status = app.run_t2s(
            message,
            t2s_max_tokens,
            t2s_steps,
            t2s_block,
            t2s_temperature,
            t2s_cfg,
            t2s_gender,
            t2s_emotion,
            t2s_speed,
            t2s_pitch,
        )
        history[-1] = (display_user, _render_audio_message(status, audio))
        yield history, ""
        return

    if mode == "ASR":
        if not audio_in:
            history[-1] = (display_user, _render_text_message("Please upload audio.", ""))
            yield history, ""
            return
        # Speech-to-text, streamed as partial transcripts.
        for text, status in app.run_s2t_stream(
            audio_in,
            s2t_steps,
            s2t_block,
            s2t_max_tokens,
            s2t_remasking,
            update_every=32,
        ):
            history[-1] = (display_user, _render_text_message(status, text))
            yield history, ""
        return

    if mode == "MMU (Video → Text)":
        if not video_in:
            history[-1] = (display_user, _render_text_message("Please upload a video.", ""))
            yield history, ""
            return
        for text, status in app.run_v2t_stream(
            video_in,
            v2t_steps,
            v2t_block,
            v2t_max_tokens,
            update_every=32,
        ):
            history[-1] = (display_user, _render_text_message(status, text))
            yield history, ""
        return

    if mode == "Image Generation":
        if not message:
            history[-1] = (display_user, _render_text_message("Please provide a prompt.", ""))
            yield history, ""
            return
        # Text-to-image, streaming intermediate decodes every 2 timesteps.
        for image, status in app.run_t2i_stream(
            message,
            t2i_timesteps,
            t2i_temperature,
            t2i_guidance,
            update_every=2,
        ):
            history[-1] = (display_user, _render_image_message(status, image))
            yield history, ""
        return

    if mode == "Image Editing":
        if not image_in:
            history[-1] = (display_user, _render_text_message("Please upload an image.", ""))
            yield history, ""
            return
        if not message:
            history[-1] = (display_user, _render_text_message("Please provide an edit instruction.", ""))
            yield history, ""
            return
        for image, status in app.run_i2i_stream(
            message,
            image_in,
            i2i_timesteps,
            i2i_temperature,
            i2i_guidance,
            update_every=2,
        ):
            history[-1] = (display_user, _render_image_message(status, image))
            yield history, ""
        return

    if mode == "MMU (Image → Text)":
        if not image_in:
            history[-1] = (display_user, _render_text_message("Please upload an image.", ""))
            yield history, ""
            return
        # Image understanding is non-streaming: single call, single final bubble.
        reply, status = app.run_mmu(
            images=[image_in],
            message=message,
            max_new_tokens=mmu_max_tokens,
            steps=mmu_steps,
            block_length=mmu_block,
            temperature=mmu_temperature,
        )
        history[-1] = (display_user, _render_text_message(status, reply))
        yield history, ""
        return

    # Fallback for any mode string not covered above.
    history[-1] = (display_user, _render_text_message("Unsupported mode.", ""))
    yield history, ""
1567
+
1568
# Shared input list for both submit paths (Enter in the textbox and the send
# button). The order must match _chat_handler's parameter order exactly.
submit_inputs = [
    chatbox,
    chat_input,
    mode_selector,
    auto_dropdown,
    media_image,
    media_audio,
    media_video,
    chat_max_tokens,
    chat_steps,
    chat_block,
    chat_temperature_slider,
    t2s_max_tokens,
    t2s_steps,
    t2s_block,
    t2s_temperature,
    t2s_cfg,
    t2s_gender,
    t2s_emotion,
    t2s_speed,
    t2s_pitch,
    s2t_steps,
    s2t_block,
    s2t_max_tokens,
    s2t_remasking,
    v2t_steps,
    v2t_block,
    v2t_max_tokens,
    t2i_timesteps,
    t2i_temperature,
    t2i_guidance,
    i2i_timesteps,
    i2i_temperature,
    i2i_guidance,
    mmu_max_tokens,
    mmu_steps,
    mmu_block,
    mmu_temperature,
]
submit_outputs = [chatbox, chat_input]

chat_input.submit(_chat_handler, inputs=submit_inputs, outputs=submit_outputs)
send_button.click(_chat_handler, inputs=submit_inputs, outputs=submit_outputs)
1611
+
1612
+ def _use_sample(payload_list, idx):
1613
+ if not payload_list or idx >= len(payload_list):
1614
+ return "", None, None, None
1615
+ item = payload_list[idx] or {}
1616
+ return item.get("text", ""), item.get("image"), item.get("audio"), item.get("video")
1617
+
1618
# Clicking a sample chip loads its payload into the composer + media widgets.
for i, btn in enumerate(sample_buttons):
    btn.click(
        # idx is bound at definition time via a default argument so each button
        # keeps its own slot (avoids the late-binding closure pitfall).
        lambda payloads, idx=i: _use_sample(payloads, idx),
        inputs=[sample_payloads],
        outputs=[chat_input, media_image, media_audio, media_video],
    )
1624
 
1625
 
1626
 
1627
if __name__ == "__main__":
    # allowed_paths lets Gradio serve files from the preview cache and /tmp
    # (where generated media is written) to the browser.
    demo.launch(allowed_paths=[str(PREVIEW_DIR), "/tmp"])