jaeikkim committed on
Commit
f9c9d68
·
1 Parent(s): 19f1a97

Dynin-Omni

Browse files
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
MMaDA/inference/common.py CHANGED
@@ -42,9 +42,21 @@ def get_vq_model_audio(cfg, device):
42
 
43
  def build_uni_prompting(cfg) -> Tuple[UniversalPrompting, AutoTokenizer]:
44
  tokenizer = AutoTokenizer.from_pretrained(cfg.model.omada.tokenizer_path, padding_side="left")
 
 
 
 
 
 
 
 
 
 
 
45
  uni_prompting = UniversalPrompting(
46
  tokenizer,
47
  max_text_len=cfg.dataset.preprocessing.max_seq_length,
 
48
  max_audio_len=cfg.dataset.preprocessing.max_aud_length,
49
  special_tokens=(
50
  "<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>",
 
42
 
43
  def build_uni_prompting(cfg) -> Tuple[UniversalPrompting, AutoTokenizer]:
44
  tokenizer = AutoTokenizer.from_pretrained(cfg.model.omada.tokenizer_path, padding_side="left")
45
+ dataset_cfg = getattr(cfg, "dataset", None)
46
+ dataset_params = getattr(dataset_cfg, "params", None) if dataset_cfg else None
47
+ preproc_cfg = getattr(dataset_cfg, "preprocessing", None) if dataset_cfg else None
48
+ # MMU image tokens at 480 resolution are typically ~900 tokens (with patch size 16),
49
+ # so 512 can silently drop the whole image in mmu_mult_prompt.
50
+ max_image_len = int(
51
+ getattr(dataset_params, "max_image_len", 0)
52
+ or getattr(preproc_cfg, "max_image_len", 0)
53
+ or getattr(preproc_cfg, "max_seq_length_image", 0)
54
+ or 1024
55
+ )
56
  uni_prompting = UniversalPrompting(
57
  tokenizer,
58
  max_text_len=cfg.dataset.preprocessing.max_seq_length,
59
+ max_image_len=max_image_len,
60
  max_audio_len=cfg.dataset.preprocessing.max_aud_length,
61
  special_tokens=(
62
  "<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>",
MMaDA/inference/gradio_multimodal_demo_inst.py CHANGED
@@ -948,15 +948,15 @@ class OmadaDemo:
948
  "temperature": 0.0,
949
  },
950
  "i2i": {
951
- "timesteps": 64,
952
  "guidance_scale": 2.5,
953
  "temperature": 0.0,
954
  },
955
  # Match defaults used in inference scripts for eval parity.
956
  "t2s": {
957
- "steps": 128,
958
- "block_length": 128,
959
- "max_new_tokens": int(self.max_audio_len_short),
960
  "temperature": 0.0,
961
  "guidance_scale": float(_cfg_get(training_cfg, "guidance_scale", 3.5)),
962
  },
@@ -2639,7 +2639,7 @@ class OmadaDemo:
2639
  if not encoded_images:
2640
  return "", "Failed to encode the provided image(s)."
2641
 
2642
- question = (question or "").strip() or "Describe the visual content."
2643
  if "<|start_header_id|>" in question:
2644
  prompt = question
2645
  else:
@@ -2706,7 +2706,9 @@ class OmadaDemo:
2706
  ).strip()
2707
  print(
2708
  f"[MMU] input_prompt_len={input_prompt_len} output_len={int(output_ids.shape[1])} "
2709
- f"gen_len={int(gen_slice.numel())} first_ids={gen_slice[:16].detach().cpu().tolist()}",
 
 
2710
  flush=True,
2711
  )
2712
  try:
 
948
  "temperature": 0.0,
949
  },
950
  "i2i": {
951
+ "timesteps": 32,
952
  "guidance_scale": 2.5,
953
  "temperature": 0.0,
954
  },
955
  # Match defaults used in inference scripts for eval parity.
956
  "t2s": {
957
+ "steps": 256,
958
+ "block_length": 256,
959
+ "max_new_tokens": 512,
960
  "temperature": 0.0,
961
  "guidance_scale": float(_cfg_get(training_cfg, "guidance_scale", 3.5)),
962
  },
 
2639
  if not encoded_images:
2640
  return "", "Failed to encode the provided image(s)."
2641
 
2642
+ question = (question or "").strip() or "Describe the given image in detail."
2643
  if "<|start_header_id|>" in question:
2644
  prompt = question
2645
  else:
 
2706
  ).strip()
2707
  print(
2708
  f"[MMU] input_prompt_len={input_prompt_len} output_len={int(output_ids.shape[1])} "
2709
+ f"gen_len={int(gen_slice.numel())} image_tok_len={int(encoded_images[0].numel()) if encoded_images else -1} "
2710
+ f"max_image_len={int(getattr(self.uni_prompting, 'max_image_len', -1))} "
2711
+ f"first_ids={gen_slice[:16].detach().cpu().tolist()}",
2712
  flush=True,
2713
  )
2714
  try:
app.py CHANGED
@@ -104,7 +104,7 @@ from inference.gradio_multimodal_demo_inst import ( # noqa: E402
104
 
105
  def download_assets() -> Path:
106
  """Download demo assets (logo + sample prompts/media) and return the root path."""
107
- repo_id = os.getenv("ASSET_REPO_ID", "jaeikkim/AIDAS-Omni-Modal-Diffusion-assets")
108
  revision = os.getenv("ASSET_REVISION", "main")
109
  token = os.getenv("HF_TOKEN")
110
  cache_dir = PROJECT_ROOT / "_asset_cache"
@@ -247,8 +247,10 @@ def _load_i2i_examples():
247
  )
248
 
249
  n = min(len(image_files), len(text_files))
 
 
250
  examples = []
251
- for i in range(2):
252
  img_path = image_files[i]
253
  txt_path = text_files[i]
254
  instruction = txt_path.read_text(encoding="utf-8").strip()
@@ -373,14 +375,16 @@ def _render_image_message(status: str, image: Image.Image):
373
  return _render_response(status)
374
 
375
  encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
376
- image_html = (
377
- "<div class='omada-response-block'>"
 
 
378
  "<img src='data:image/png;base64,"
379
  f"{encoded}"
380
- "' alt='Generated image' style='max-width:100%;border-radius:12px;' />"
 
381
  "</div>"
382
  )
383
- return _render_response(status, image_html)
384
 
385
 
386
  def _render_user_message(mode: str, message: str, image_in, audio_in, video_in, defer_video: bool = False) -> str:
@@ -588,6 +592,7 @@ V2T_EXAMPLES = _load_media_examples("v2t", {".mp4", ".mov", ".avi", ".webm"})
588
  # MMU images
589
  MMU_DIR = ASSET_ROOT / "mmu"
590
  MMU_EXAMPLES: List[List[str]] = []
 
591
  if MMU_DIR.exists():
592
  for path in sorted(
593
  [
@@ -598,7 +603,7 @@ if MMU_DIR.exists():
598
  ):
599
  MMU_EXAMPLES.append([
600
  str(path),
601
- "Describe the important objects and their relationships in this image.",
602
  ])
603
 
604
 
@@ -1203,6 +1208,41 @@ html, body, .gradio-container {
1203
  background: rgba(255, 255, 255, 0.50) !important;
1204
  color: #1f2937 !important;
1205
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1206
  .omada-chip button {
1207
  color: #273247 !important;
1208
  }
@@ -1619,7 +1659,7 @@ with gr.Blocks(**_blocks_kwargs) as demo:
1619
  {
1620
  "label": "🖼️ Image QA",
1621
  "mode": "MMU (Image → Text)",
1622
- "text": MMU_EXAMPLES[0][1] if MMU_EXAMPLES else "Describe the important objects and their relationships in this image.",
1623
  "image": MMU_EXAMPLES[0][0] if MMU_EXAMPLES else None,
1624
  "audio": None,
1625
  "video": None,
@@ -1630,7 +1670,7 @@ with gr.Blocks(**_blocks_kwargs) as demo:
1630
  "text": "",
1631
  "image": None,
1632
  "audio": None,
1633
- "video": V2T_EXAMPLES[0][0] if V2T_EXAMPLES else None,
1634
  },
1635
  {
1636
  "label": "🎨 Image Generation",
@@ -1720,9 +1760,9 @@ with gr.Blocks(**_blocks_kwargs) as demo:
1720
 
1721
  adv_t2s = gr.Column(visible=False)
1722
  with adv_t2s:
1723
- t2s_max_tokens = gr.Slider(2, 512, value=384, step=2, label="Speech token length", interactive=True)
1724
- t2s_steps = gr.Slider(2, 512, value=128, step=2, label="T2S refinement steps", interactive=True)
1725
- t2s_block = gr.Slider(2, 512, value=128, step=2, label="T2S block length", interactive=True)
1726
  t2s_temperature = gr.Slider(0.0, 2.0, value=0.0, step=0.05, label="T2S temperature", interactive=True)
1727
  t2s_cfg = gr.Slider(0.0, 6.0, value=3.5, step=0.1, label="T2S CFG scale", interactive=True)
1728
  t2s_gender = gr.Dropdown(["random", "female", "male"], value="random", label="T2S gender", interactive=True)
@@ -1751,7 +1791,7 @@ with gr.Blocks(**_blocks_kwargs) as demo:
1751
 
1752
  adv_i2i = gr.Column(visible=False)
1753
  with adv_i2i:
1754
- i2i_timesteps = gr.Slider(4, 128, value=64, step=2, label="I2I timesteps", interactive=True)
1755
  i2i_temperature = gr.Slider(0.0, 2.0, value=0.0, step=0.05, label="I2I temperature", interactive=True)
1756
  i2i_guidance = gr.Slider(0.0, 8.0, value=2.5, step=0.1, label="I2I CFG scale", interactive=True)
1757
 
@@ -1885,6 +1925,41 @@ def _format_user_message(msg: str) -> str:
1885
  return msg.strip() if msg else " "
1886
 
1887
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1888
  def _is_identity_query(message: str) -> bool:
1889
  q = re.sub(r"[^a-z0-9\s]", " ", (message or "").lower())
1890
  q = re.sub(r"\s+", " ", q).strip()
@@ -1942,12 +2017,12 @@ def _chat_handler(
1942
  mmu_temperature,
1943
  ):
1944
  _set_global_seed()
1945
- history = history or []
1946
  message = (message or "").strip()
1947
  defer_video = mode == "MMU (Video → Text)" and bool(video_in)
1948
  display_user = _render_user_message(mode, message, image_in, audio_in, video_in, defer_video=defer_video)
1949
- history.append((display_user, _render_text_message("Generating...", "")))
1950
- yield history, ""
1951
 
1952
  if mode == "Chat" and _is_identity_query(message):
1953
  fixed = (
@@ -1955,15 +2030,17 @@ def _chat_handler(
1955
  "I can understand and generate text, images, speech, and video within a single architecture."
1956
  )
1957
  history[-1] = (display_user, _render_text_message("Assistant reply generated.", fixed))
1958
- yield history, ""
1959
  return
1960
 
1961
  if defer_video:
1962
  display_user = _render_user_message(mode, message, image_in, audio_in, video_in, defer_video=False)
1963
  history[-1] = (display_user, history[-1][1])
1964
- yield history, ""
1965
 
1966
  app = get_app()
 
 
1967
  # Respect UI mode: Auto uses eval-matched defaults, Custom uses UI values.
1968
  app.force_eval_settings = str(auto_mode).strip().lower() == "auto"
1969
 
@@ -1978,13 +2055,13 @@ def _chat_handler(
1978
  ):
1979
  response = _render_response(status, reply_html)
1980
  history[-1] = (display_user, response)
1981
- yield history, ""
1982
  return
1983
 
1984
  if mode == "TTS":
1985
  if not message:
1986
  history[-1] = (display_user, _render_text_message("Please type some text.", ""))
1987
- yield history, ""
1988
  return
1989
  audio, status = app.run_t2s(
1990
  message,
@@ -1999,13 +2076,13 @@ def _chat_handler(
1999
  t2s_pitch,
2000
  )
2001
  history[-1] = (display_user, _render_audio_message(status, audio))
2002
- yield history, ""
2003
  return
2004
 
2005
  if mode == "ASR":
2006
  if not audio_in:
2007
  history[-1] = (display_user, _render_text_message("Please upload audio.", ""))
2008
- yield history, ""
2009
  return
2010
  for text, status in app.run_s2t_stream(
2011
  audio_in,
@@ -2016,13 +2093,13 @@ def _chat_handler(
2016
  update_every=32,
2017
  ):
2018
  history[-1] = (display_user, _render_text_message(status, text))
2019
- yield history, ""
2020
  return
2021
 
2022
  if mode == "MMU (Video → Text)":
2023
  if not video_in:
2024
  history[-1] = (display_user, _render_text_message("Please upload a video.", ""))
2025
- yield history, ""
2026
  return
2027
  for text, status in app.run_v2t_stream(
2028
  video_in,
@@ -2032,13 +2109,13 @@ def _chat_handler(
2032
  update_every=32,
2033
  ):
2034
  history[-1] = (display_user, _render_text_message(status, text))
2035
- yield history, ""
2036
  return
2037
 
2038
  if mode == "Image Generation":
2039
  if not message:
2040
  history[-1] = (display_user, _render_text_message("Please provide a prompt.", ""))
2041
- yield history, ""
2042
  return
2043
  for image, status in app.run_t2i_stream(
2044
  message,
@@ -2048,17 +2125,17 @@ def _chat_handler(
2048
  update_every=2,
2049
  ):
2050
  history[-1] = (display_user, _render_image_message(status, image))
2051
- yield history, ""
2052
  return
2053
 
2054
  if mode == "Image Editing":
2055
  if not image_in:
2056
  history[-1] = (display_user, _render_text_message("Please upload an image.", ""))
2057
- yield history, ""
2058
  return
2059
  if not message:
2060
  history[-1] = (display_user, _render_text_message("Please provide an edit instruction.", ""))
2061
- yield history, ""
2062
  return
2063
  for image, status in app.run_i2i_stream(
2064
  message,
@@ -2069,14 +2146,22 @@ def _chat_handler(
2069
  update_every=2,
2070
  ):
2071
  history[-1] = (display_user, _render_image_message(status, image))
2072
- yield history, ""
2073
  return
2074
 
2075
  if mode == "MMU (Image → Text)":
2076
  if not image_in:
2077
  history[-1] = (display_user, _render_text_message("Please upload an image.", ""))
2078
- yield history, ""
2079
  return
 
 
 
 
 
 
 
 
2080
  reply, status = app.run_mmu(
2081
  images=[image_in],
2082
  message=message,
@@ -2086,11 +2171,11 @@ def _chat_handler(
2086
  temperature=mmu_temperature,
2087
  )
2088
  history[-1] = (display_user, _render_text_message(status, reply))
2089
- yield history, ""
2090
  return
2091
 
2092
  history[-1] = (display_user, _render_text_message("Unsupported mode.", ""))
2093
- yield history, ""
2094
 
2095
  with demo:
2096
  def _hide_intro():
@@ -2178,14 +2263,19 @@ with demo:
2178
 
2179
  if __name__ == "__main__":
2180
  _launch_kwargs = {
2181
- "allowed_paths": [str(PREVIEW_DIR), "/tmp"],
 
 
 
 
 
2182
  }
2183
  if GRADIO_V6_PLUS:
2184
  _launch_kwargs.update(
2185
  {
2186
  "css": CUSTOM_CSS + EXTRA_CSS,
2187
  "theme": theme,
2188
- "js": FORCE_LIGHT_MODE_JS,
2189
  }
2190
  )
2191
  demo.launch(**_launch_kwargs)
 
104
 
105
  def download_assets() -> Path:
106
  """Download demo assets (logo + sample prompts/media) and return the root path."""
107
+ repo_id = os.getenv("ASSET_REPO_ID", "jaeikkim/Dynin-Omni-Demo-Assets")
108
  revision = os.getenv("ASSET_REVISION", "main")
109
  token = os.getenv("HF_TOKEN")
110
  cache_dir = PROJECT_ROOT / "_asset_cache"
 
247
  )
248
 
249
  n = min(len(image_files), len(text_files))
250
+ if n == 0:
251
+ return []
252
  examples = []
253
+ for i in range(n):
254
  img_path = image_files[i]
255
  txt_path = text_files[i]
256
  instruction = txt_path.read_text(encoding="utf-8").strip()
 
375
  return _render_response(status)
376
 
377
  encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
378
+ safe_status = html.escape(status or "")
379
+ return (
380
+ "<div class='omada-image-only'>"
381
+ f"<p class='omada-image-status'>{safe_status}</p>"
382
  "<img src='data:image/png;base64,"
383
  f"{encoded}"
384
+ "' alt='Generated image' style='display:block;width:auto;height:auto;max-width:min(100%,720px);"
385
+ "border-radius:0;image-rendering:crisp-edges;image-rendering:-webkit-optimize-contrast;filter:none;opacity:1;' />"
386
  "</div>"
387
  )
 
388
 
389
 
390
  def _render_user_message(mode: str, message: str, image_in, audio_in, video_in, defer_video: bool = False) -> str:
 
592
  # MMU images
593
  MMU_DIR = ASSET_ROOT / "mmu"
594
  MMU_EXAMPLES: List[List[str]] = []
595
+ DEFAULT_MMU_PROMPT = "Describe the given image in detail."
596
  if MMU_DIR.exists():
597
  for path in sorted(
598
  [
 
603
  ):
604
  MMU_EXAMPLES.append([
605
  str(path),
606
+ DEFAULT_MMU_PROMPT,
607
  ])
608
 
609
 
 
1208
  background: rgba(255, 255, 255, 0.50) !important;
1209
  color: #1f2937 !important;
1210
  }
1211
+ /* Keep generated images crisp (no frosted overlay on image replies) */
1212
+ .gradio-chatbot .message {
1213
+ backdrop-filter: none !important;
1214
+ -webkit-backdrop-filter: none !important;
1215
+ }
1216
+ .gradio-chatbot .message.bot:has(.omada-image-only) {
1217
+ background: transparent !important;
1218
+ border: none !important;
1219
+ box-shadow: none !important;
1220
+ padding: 0 !important;
1221
+ margin: 0 !important;
1222
+ }
1223
+ .omada-image-only {
1224
+ display: inline-block;
1225
+ background: transparent !important;
1226
+ border: 0 !important;
1227
+ box-shadow: none !important;
1228
+ padding: 0 !important;
1229
+ margin: 0 !important;
1230
+ opacity: 1 !important;
1231
+ filter: none !important;
1232
+ }
1233
+ .gradio-chatbot .message.bot:has(.omada-image-only) *,
1234
+ .omada-image-only * {
1235
+ background: transparent !important;
1236
+ box-shadow: none !important;
1237
+ filter: none !important;
1238
+ opacity: 1 !important;
1239
+ }
1240
+ .omada-image-status {
1241
+ margin: 0 0 6px 0 !important;
1242
+ font-size: 0.85rem !important;
1243
+ color: #42526b !important;
1244
+ font-weight: 600 !important;
1245
+ }
1246
  .omada-chip button {
1247
  color: #273247 !important;
1248
  }
 
1659
  {
1660
  "label": "🖼️ Image QA",
1661
  "mode": "MMU (Image → Text)",
1662
+ "text": MMU_EXAMPLES[0][1] if MMU_EXAMPLES else DEFAULT_MMU_PROMPT,
1663
  "image": MMU_EXAMPLES[0][0] if MMU_EXAMPLES else None,
1664
  "audio": None,
1665
  "video": None,
 
1670
  "text": "",
1671
  "image": None,
1672
  "audio": None,
1673
+ "video": V2T_EXAMPLES[-1][0] if V2T_EXAMPLES else None,
1674
  },
1675
  {
1676
  "label": "🎨 Image Generation",
 
1760
 
1761
  adv_t2s = gr.Column(visible=False)
1762
  with adv_t2s:
1763
+ t2s_max_tokens = gr.Slider(2, 512, value=512, step=2, label="Speech token length", interactive=True)
1764
+ t2s_steps = gr.Slider(2, 512, value=256, step=2, label="T2S refinement steps", interactive=True)
1765
+ t2s_block = gr.Slider(2, 512, value=256, step=2, label="T2S block length", interactive=True)
1766
  t2s_temperature = gr.Slider(0.0, 2.0, value=0.0, step=0.05, label="T2S temperature", interactive=True)
1767
  t2s_cfg = gr.Slider(0.0, 6.0, value=3.5, step=0.1, label="T2S CFG scale", interactive=True)
1768
  t2s_gender = gr.Dropdown(["random", "female", "male"], value="random", label="T2S gender", interactive=True)
 
1791
 
1792
  adv_i2i = gr.Column(visible=False)
1793
  with adv_i2i:
1794
+ i2i_timesteps = gr.Slider(4, 128, value=32, step=2, label="I2I timesteps", interactive=True)
1795
  i2i_temperature = gr.Slider(0.0, 2.0, value=0.0, step=0.05, label="I2I temperature", interactive=True)
1796
  i2i_guidance = gr.Slider(0.0, 8.0, value=2.5, step=0.1, label="I2I CFG scale", interactive=True)
1797
 
 
1925
  return msg.strip() if msg else " "
1926
 
1927
 
1928
+ def _normalize_chat_history(history):
1929
+ if not history:
1930
+ return []
1931
+ if isinstance(history, list) and history and isinstance(history[0], dict):
1932
+ pairs = []
1933
+ pending_user = None
1934
+ for msg in history:
1935
+ role = msg.get("role")
1936
+ content = msg.get("content", "")
1937
+ if role == "user":
1938
+ if pending_user is not None:
1939
+ pairs.append((pending_user, ""))
1940
+ pending_user = content
1941
+ elif role == "assistant":
1942
+ if pending_user is None:
1943
+ pairs.append((" ", content))
1944
+ else:
1945
+ pairs.append((pending_user, content))
1946
+ pending_user = None
1947
+ if pending_user is not None:
1948
+ pairs.append((pending_user, ""))
1949
+ return pairs
1950
+ return list(history)
1951
+
1952
+
1953
+ def _serialize_chat_history(pairs):
1954
+ if not GRADIO_V6_PLUS:
1955
+ return pairs
1956
+ messages = []
1957
+ for user_msg, assistant_msg in pairs:
1958
+ messages.append({"role": "user", "content": user_msg if user_msg is not None else " "})
1959
+ messages.append({"role": "assistant", "content": assistant_msg if assistant_msg is not None else ""})
1960
+ return messages
1961
+
1962
+
1963
  def _is_identity_query(message: str) -> bool:
1964
  q = re.sub(r"[^a-z0-9\s]", " ", (message or "").lower())
1965
  q = re.sub(r"\s+", " ", q).strip()
 
2017
  mmu_temperature,
2018
  ):
2019
  _set_global_seed()
2020
+ history = _normalize_chat_history(history)
2021
  message = (message or "").strip()
2022
  defer_video = mode == "MMU (Video → Text)" and bool(video_in)
2023
  display_user = _render_user_message(mode, message, image_in, audio_in, video_in, defer_video=defer_video)
2024
+ history.append((display_user, _render_text_message("Model loading...", "")))
2025
+ yield _serialize_chat_history(history), ""
2026
 
2027
  if mode == "Chat" and _is_identity_query(message):
2028
  fixed = (
 
2030
  "I can understand and generate text, images, speech, and video within a single architecture."
2031
  )
2032
  history[-1] = (display_user, _render_text_message("Assistant reply generated.", fixed))
2033
+ yield _serialize_chat_history(history), ""
2034
  return
2035
 
2036
  if defer_video:
2037
  display_user = _render_user_message(mode, message, image_in, audio_in, video_in, defer_video=False)
2038
  history[-1] = (display_user, history[-1][1])
2039
+ yield _serialize_chat_history(history), ""
2040
 
2041
  app = get_app()
2042
+ history[-1] = (display_user, _render_text_message("Generating...", ""))
2043
+ yield _serialize_chat_history(history), ""
2044
  # Respect UI mode: Auto uses eval-matched defaults, Custom uses UI values.
2045
  app.force_eval_settings = str(auto_mode).strip().lower() == "auto"
2046
 
 
2055
  ):
2056
  response = _render_response(status, reply_html)
2057
  history[-1] = (display_user, response)
2058
+ yield _serialize_chat_history(history), ""
2059
  return
2060
 
2061
  if mode == "TTS":
2062
  if not message:
2063
  history[-1] = (display_user, _render_text_message("Please type some text.", ""))
2064
+ yield _serialize_chat_history(history), ""
2065
  return
2066
  audio, status = app.run_t2s(
2067
  message,
 
2076
  t2s_pitch,
2077
  )
2078
  history[-1] = (display_user, _render_audio_message(status, audio))
2079
+ yield _serialize_chat_history(history), ""
2080
  return
2081
 
2082
  if mode == "ASR":
2083
  if not audio_in:
2084
  history[-1] = (display_user, _render_text_message("Please upload audio.", ""))
2085
+ yield _serialize_chat_history(history), ""
2086
  return
2087
  for text, status in app.run_s2t_stream(
2088
  audio_in,
 
2093
  update_every=32,
2094
  ):
2095
  history[-1] = (display_user, _render_text_message(status, text))
2096
+ yield _serialize_chat_history(history), ""
2097
  return
2098
 
2099
  if mode == "MMU (Video → Text)":
2100
  if not video_in:
2101
  history[-1] = (display_user, _render_text_message("Please upload a video.", ""))
2102
+ yield _serialize_chat_history(history), ""
2103
  return
2104
  for text, status in app.run_v2t_stream(
2105
  video_in,
 
2109
  update_every=32,
2110
  ):
2111
  history[-1] = (display_user, _render_text_message(status, text))
2112
+ yield _serialize_chat_history(history), ""
2113
  return
2114
 
2115
  if mode == "Image Generation":
2116
  if not message:
2117
  history[-1] = (display_user, _render_text_message("Please provide a prompt.", ""))
2118
+ yield _serialize_chat_history(history), ""
2119
  return
2120
  for image, status in app.run_t2i_stream(
2121
  message,
 
2125
  update_every=2,
2126
  ):
2127
  history[-1] = (display_user, _render_image_message(status, image))
2128
+ yield _serialize_chat_history(history), ""
2129
  return
2130
 
2131
  if mode == "Image Editing":
2132
  if not image_in:
2133
  history[-1] = (display_user, _render_text_message("Please upload an image.", ""))
2134
+ yield _serialize_chat_history(history), ""
2135
  return
2136
  if not message:
2137
  history[-1] = (display_user, _render_text_message("Please provide an edit instruction.", ""))
2138
+ yield _serialize_chat_history(history), ""
2139
  return
2140
  for image, status in app.run_i2i_stream(
2141
  message,
 
2146
  update_every=2,
2147
  ):
2148
  history[-1] = (display_user, _render_image_message(status, image))
2149
+ yield _serialize_chat_history(history), ""
2150
  return
2151
 
2152
  if mode == "MMU (Image → Text)":
2153
  if not image_in:
2154
  history[-1] = (display_user, _render_text_message("Please upload an image.", ""))
2155
+ yield _serialize_chat_history(history), ""
2156
  return
2157
+ # Keep MMU QA consistent with chat mask-pill UX.
2158
+ try:
2159
+ mmu_mask_count = max(16, min(int(mmu_max_tokens or 128), 256))
2160
+ except Exception:
2161
+ mmu_mask_count = 128
2162
+ mmu_mask_surface = " ".join(["<mdm_mask>"] * mmu_mask_count)
2163
+ history[-1] = (display_user, _render_text_message("Generating...", mmu_mask_surface))
2164
+ yield _serialize_chat_history(history), ""
2165
  reply, status = app.run_mmu(
2166
  images=[image_in],
2167
  message=message,
 
2171
  temperature=mmu_temperature,
2172
  )
2173
  history[-1] = (display_user, _render_text_message(status, reply))
2174
+ yield _serialize_chat_history(history), ""
2175
  return
2176
 
2177
  history[-1] = (display_user, _render_text_message("Unsupported mode.", ""))
2178
+ yield _serialize_chat_history(history), ""
2179
 
2180
  with demo:
2181
  def _hide_intro():
 
2263
 
2264
  if __name__ == "__main__":
2265
  _launch_kwargs = {
2266
+ "allowed_paths": [
2267
+ str(PREVIEW_DIR),
2268
+ str(PROJECT_ROOT),
2269
+ str(ASSET_ROOT),
2270
+ "/tmp",
2271
+ ],
2272
  }
2273
  if GRADIO_V6_PLUS:
2274
  _launch_kwargs.update(
2275
  {
2276
  "css": CUSTOM_CSS + EXTRA_CSS,
2277
  "theme": theme,
2278
+ "js": FORCE_LIGHT_MODE_JS
2279
  }
2280
  )
2281
  demo.launch(**_launch_kwargs)