Dynin-Omni

Files changed:
- .gradio/certificate.pem +31 -0
- MMaDA/inference/common.py +12 -0
- MMaDA/inference/gradio_multimodal_demo_inst.py +8 -6
- app.py +125 -35
.gradio/certificate.pem ADDED

@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
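For the record, this PEM payload decodes to the ISRG Root X1 certificate (the Let's Encrypt root CA), which Gradio typically caches under .gradio/ for share-link TLS; that reading comes from the blob itself, not from anything else in this repo. A quick check with the third-party cryptography package:

    from cryptography import x509

    with open(".gradio/certificate.pem", "rb") as f:
        cert = x509.load_pem_x509_certificate(f.read())

    print(cert.subject.rfc4514_string())
    # CN=ISRG Root X1,O=Internet Security Research Group,C=US
    print(cert.not_valid_after)  # 2035-06-04 11:04:38, per the encoded validity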
MMaDA/inference/common.py CHANGED

@@ -42,9 +42,21 @@ def get_vq_model_audio(cfg, device):
 
 def build_uni_prompting(cfg) -> Tuple[UniversalPrompting, AutoTokenizer]:
     tokenizer = AutoTokenizer.from_pretrained(cfg.model.omada.tokenizer_path, padding_side="left")
+    dataset_cfg = getattr(cfg, "dataset", None)
+    dataset_params = getattr(dataset_cfg, "params", None) if dataset_cfg else None
+    preproc_cfg = getattr(dataset_cfg, "preprocessing", None) if dataset_cfg else None
+    # MMU image tokens at 480 resolution are typically ~900 tokens (with patch size 16),
+    # so 512 can silently drop the whole image in mmu_mult_prompt.
+    max_image_len = int(
+        getattr(dataset_params, "max_image_len", 0)
+        or getattr(preproc_cfg, "max_image_len", 0)
+        or getattr(preproc_cfg, "max_seq_length_image", 0)
+        or 1024
+    )
     uni_prompting = UniversalPrompting(
         tokenizer,
         max_text_len=cfg.dataset.preprocessing.max_seq_length,
+        max_image_len=max_image_len,
         max_audio_len=cfg.dataset.preprocessing.max_aud_length,
         special_tokens=(
             "<|soi|>", "<|eoi|>", "<|sov|>", "<|eov|>", "<|t2i|>",
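The fallback chain added above only touches optional config attributes, so it degrades gracefully when a field is missing. A minimal sketch of how it resolves, with SimpleNamespace standing in for the real OmegaConf config (the values here are illustrative, not from the repo):

    from types import SimpleNamespace

    cfg = SimpleNamespace(
        dataset=SimpleNamespace(
            params=SimpleNamespace(),  # no max_image_len set
            preprocessing=SimpleNamespace(max_seq_length_image=900),
        )
    )

    dataset_cfg = getattr(cfg, "dataset", None)
    dataset_params = getattr(dataset_cfg, "params", None) if dataset_cfg else None
    preproc_cfg = getattr(dataset_cfg, "preprocessing", None) if dataset_cfg else None
    max_image_len = int(
        getattr(dataset_params, "max_image_len", 0)
        or getattr(preproc_cfg, "max_image_len", 0)
        or getattr(preproc_cfg, "max_seq_length_image", 0)
        or 1024
    )
    print(max_image_len)  # 900: the third branch wins; with no config at all, the floor is 1024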
MMaDA/inference/gradio_multimodal_demo_inst.py CHANGED

@@ -948,15 +948,15 @@ class OmadaDemo:
             "temperature": 0.0,
         },
         "i2i": {
-            "timesteps":
+            "timesteps": 32,
             "guidance_scale": 2.5,
             "temperature": 0.0,
         },
         # Match defaults used in inference scripts for eval parity.
         "t2s": {
-            "steps":
-            "block_length":
-            "max_new_tokens":
+            "steps": 256,
+            "block_length": 256,
+            "max_new_tokens": 512,
             "temperature": 0.0,
             "guidance_scale": float(_cfg_get(training_cfg, "guidance_scale", 3.5)),
         },

@@ -2639,7 +2639,7 @@ class OmadaDemo:
         if not encoded_images:
             return "", "Failed to encode the provided image(s)."
 
-        question = (question or "").strip() or "Describe the
+        question = (question or "").strip() or "Describe the given image in detail."
        if "<|start_header_id|>" in question:
             prompt = question
         else:

@@ -2706,7 +2706,9 @@ class OmadaDemo:
         ).strip()
         print(
             f"[MMU] input_prompt_len={input_prompt_len} output_len={int(output_ids.shape[1])} "
-            f"gen_len={int(gen_slice.numel())}
+            f"gen_len={int(gen_slice.numel())} image_tok_len={int(encoded_images[0].numel()) if encoded_images else -1} "
+            f"max_image_len={int(getattr(self.uni_prompting, 'max_image_len', -1))} "
+            f"first_ids={gen_slice[:16].detach().cpu().tolist()}",
             flush=True,
         )
         try:
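The new t2s defaults (steps=256, block_length=256, max_new_tokens=512) look like the LLaDA-style semi-autoregressive schedule that MMaDA builds on, where the masked generation span is cut into blocks and the step budget is split evenly across them. A hedged sketch of that relationship, assumed from the LLaDA sampler convention rather than read out of this repo's sampler:

    def block_schedule(max_new_tokens: int, block_length: int, steps: int):
        # Assumed LLaDA-style convention: decode block by block, with the
        # refinement steps divided evenly across blocks.
        assert max_new_tokens % block_length == 0
        num_blocks = max_new_tokens // block_length
        assert steps % num_blocks == 0
        return num_blocks, steps // num_blocks

    print(block_schedule(512, 256, 256))  # (2, 128): two blocks, 128 denoising steps each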
app.py CHANGED

@@ -104,7 +104,7 @@ from inference.gradio_multimodal_demo_inst import ( # noqa: E402
 
 def download_assets() -> Path:
     """Download demo assets (logo + sample prompts/media) and return the root path."""
-    repo_id = os.getenv("ASSET_REPO_ID", "jaeikkim/
+    repo_id = os.getenv("ASSET_REPO_ID", "jaeikkim/Dynin-Omni-Demo-Assets")
     revision = os.getenv("ASSET_REVISION", "main")
     token = os.getenv("HF_TOKEN")
     cache_dir = PROJECT_ROOT / "_asset_cache"
@@ -247,8 +247,10 @@ def _load_i2i_examples():
     )
 
     n = min(len(image_files), len(text_files))
+    if n == 0:
+        return []
     examples = []
-    for i in range(
+    for i in range(n):
         img_path = image_files[i]
         txt_path = text_files[i]
         instruction = txt_path.read_text(encoding="utf-8").strip()
@@ -373,14 +375,16 @@ def _render_image_message(status: str, image: Image.Image):
         return _render_response(status)
 
     encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
-
-
+    safe_status = html.escape(status or "")
+    return (
+        "<div class='omada-image-only'>"
+        f"<p class='omada-image-status'>{safe_status}</p>"
         "<img src='data:image/png;base64,"
         f"{encoded}"
-        "' alt='Generated image' style='max-width:100%;
+        "' alt='Generated image' style='display:block;width:auto;height:auto;max-width:min(100%,720px);"
+        "border-radius:0;image-rendering:crisp-edges;image-rendering:-webkit-optimize-contrast;filter:none;opacity:1;' />"
         "</div>"
     )
-    return _render_response(status, image_html)
 
 
 def _render_user_message(mode: str, message: str, image_in, audio_in, video_in, defer_video: bool = False) -> str:
@@ -588,6 +592,7 @@ V2T_EXAMPLES = _load_media_examples("v2t", {".mp4", ".mov", ".avi", ".webm"})
 # MMU images
 MMU_DIR = ASSET_ROOT / "mmu"
 MMU_EXAMPLES: List[List[str]] = []
+DEFAULT_MMU_PROMPT = "Describe the given image in detail."
 if MMU_DIR.exists():
     for path in sorted(
         [

@@ -598,7 +603,7 @@ if MMU_DIR.exists():
     ):
         MMU_EXAMPLES.append([
             str(path),
-
+            DEFAULT_MMU_PROMPT,
         ])
 
 
@@ -1203,6 +1208,41 @@ html, body, .gradio-container {
     background: rgba(255, 255, 255, 0.50) !important;
     color: #1f2937 !important;
 }
+/* Keep generated images crisp (no frosted overlay on image replies) */
+.gradio-chatbot .message {
+    backdrop-filter: none !important;
+    -webkit-backdrop-filter: none !important;
+}
+.gradio-chatbot .message.bot:has(.omada-image-only) {
+    background: transparent !important;
+    border: none !important;
+    box-shadow: none !important;
+    padding: 0 !important;
+    margin: 0 !important;
+}
+.omada-image-only {
+    display: inline-block;
+    background: transparent !important;
+    border: 0 !important;
+    box-shadow: none !important;
+    padding: 0 !important;
+    margin: 0 !important;
+    opacity: 1 !important;
+    filter: none !important;
+}
+.gradio-chatbot .message.bot:has(.omada-image-only) *,
+.omada-image-only * {
+    background: transparent !important;
+    box-shadow: none !important;
+    filter: none !important;
+    opacity: 1 !important;
+}
+.omada-image-status {
+    margin: 0 0 6px 0 !important;
+    font-size: 0.85rem !important;
+    color: #42526b !important;
+    font-weight: 600 !important;
+}
 .omada-chip button {
     color: #273247 !important;
 }
@@ -1619,7 +1659,7 @@ with gr.Blocks(**_blocks_kwargs) as demo:
         {
             "label": "🖼️ Image QA",
             "mode": "MMU (Image → Text)",
-            "text": MMU_EXAMPLES[0][1] if MMU_EXAMPLES else
+            "text": MMU_EXAMPLES[0][1] if MMU_EXAMPLES else DEFAULT_MMU_PROMPT,
             "image": MMU_EXAMPLES[0][0] if MMU_EXAMPLES else None,
             "audio": None,
             "video": None,

@@ -1630,7 +1670,7 @@ with gr.Blocks(**_blocks_kwargs) as demo:
             "text": "",
             "image": None,
             "audio": None,
-            "video": V2T_EXAMPLES[
+            "video": V2T_EXAMPLES[-1][0] if V2T_EXAMPLES else None,
         },
         {
             "label": "🎨 Image Generation",
@@ -1720,9 +1760,9 @@ with gr.Blocks(**_blocks_kwargs) as demo:
 
                 adv_t2s = gr.Column(visible=False)
                 with adv_t2s:
-                    t2s_max_tokens = gr.Slider(2, 512, value=
-                    t2s_steps = gr.Slider(2, 512, value=
-                    t2s_block = gr.Slider(2, 512, value=
+                    t2s_max_tokens = gr.Slider(2, 512, value=512, step=2, label="Speech token length", interactive=True)
+                    t2s_steps = gr.Slider(2, 512, value=256, step=2, label="T2S refinement steps", interactive=True)
+                    t2s_block = gr.Slider(2, 512, value=256, step=2, label="T2S block length", interactive=True)
                     t2s_temperature = gr.Slider(0.0, 2.0, value=0.0, step=0.05, label="T2S temperature", interactive=True)
                     t2s_cfg = gr.Slider(0.0, 6.0, value=3.5, step=0.1, label="T2S CFG scale", interactive=True)
                     t2s_gender = gr.Dropdown(["random", "female", "male"], value="random", label="T2S gender", interactive=True)

@@ -1751,7 +1791,7 @@ with gr.Blocks(**_blocks_kwargs) as demo:
 
                 adv_i2i = gr.Column(visible=False)
                 with adv_i2i:
-                    i2i_timesteps = gr.Slider(4, 128, value=
+                    i2i_timesteps = gr.Slider(4, 128, value=32, step=2, label="I2I timesteps", interactive=True)
                     i2i_temperature = gr.Slider(0.0, 2.0, value=0.0, step=0.05, label="I2I temperature", interactive=True)
                     i2i_guidance = gr.Slider(0.0, 8.0, value=2.5, step=0.1, label="I2I CFG scale", interactive=True)
 
@@ -1885,6 +1925,41 @@ def _format_user_message(msg: str) -> str:
     return msg.strip() if msg else " "
 
 
+def _normalize_chat_history(history):
+    if not history:
+        return []
+    if isinstance(history, list) and history and isinstance(history[0], dict):
+        pairs = []
+        pending_user = None
+        for msg in history:
+            role = msg.get("role")
+            content = msg.get("content", "")
+            if role == "user":
+                if pending_user is not None:
+                    pairs.append((pending_user, ""))
+                pending_user = content
+            elif role == "assistant":
+                if pending_user is None:
+                    pairs.append((" ", content))
+                else:
+                    pairs.append((pending_user, content))
+                pending_user = None
+        if pending_user is not None:
+            pairs.append((pending_user, ""))
+        return pairs
+    return list(history)
+
+
+def _serialize_chat_history(pairs):
+    if not GRADIO_V6_PLUS:
+        return pairs
+    messages = []
+    for user_msg, assistant_msg in pairs:
+        messages.append({"role": "user", "content": user_msg if user_msg is not None else " "})
+        messages.append({"role": "assistant", "content": assistant_msg if assistant_msg is not None else ""})
+    return messages
+
+
 def _is_identity_query(message: str) -> bool:
     q = re.sub(r"[^a-z0-9\s]", " ", (message or "").lower())
     q = re.sub(r"\s+", " ", q).strip()
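A quick round trip through the two helpers above (assuming both are in scope): Gradio v6 "messages"-style history is flattened into the (user, assistant) tuples the rest of the handler expects, and an unanswered user turn is padded with an empty assistant slot.

    messages_in = [
        {"role": "user", "content": "hi"},
        {"role": "assistant", "content": "hello"},
        {"role": "user", "content": "draw a cat"},  # no reply yet
    ]
    pairs = _normalize_chat_history(messages_in)
    assert pairs == [("hi", "hello"), ("draw a cat", "")]
    # With GRADIO_V6_PLUS set, _serialize_chat_history(pairs) re-emits the
    # dict form, adding an empty assistant message for the pending turn.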
@@ -1942,12 +2017,12 @@ def _chat_handler(
     mmu_temperature,
 ):
     _set_global_seed()
-    history = history
+    history = _normalize_chat_history(history)
     message = (message or "").strip()
     defer_video = mode == "MMU (Video → Text)" and bool(video_in)
     display_user = _render_user_message(mode, message, image_in, audio_in, video_in, defer_video=defer_video)
-    history.append((display_user, _render_text_message("
-    yield history, ""
+    history.append((display_user, _render_text_message("Model loading...", "")))
+    yield _serialize_chat_history(history), ""
 
     if mode == "Chat" and _is_identity_query(message):
         fixed = (

@@ -1955,15 +2030,17 @@ def _chat_handler(
             "I can understand and generate text, images, speech, and video within a single architecture."
         )
         history[-1] = (display_user, _render_text_message("Assistant reply generated.", fixed))
-        yield history, ""
+        yield _serialize_chat_history(history), ""
         return
 
     if defer_video:
         display_user = _render_user_message(mode, message, image_in, audio_in, video_in, defer_video=False)
         history[-1] = (display_user, history[-1][1])
-        yield history, ""
+        yield _serialize_chat_history(history), ""
 
     app = get_app()
+    history[-1] = (display_user, _render_text_message("Generating...", ""))
+    yield _serialize_chat_history(history), ""
     # Respect UI mode: Auto uses eval-matched defaults, Custom uses UI values.
     app.force_eval_settings = str(auto_mode).strip().lower() == "auto"
 

@@ -1978,13 +2055,13 @@ def _chat_handler(
     ):
         response = _render_response(status, reply_html)
         history[-1] = (display_user, response)
-        yield history, ""
+        yield _serialize_chat_history(history), ""
         return
 
     if mode == "TTS":
         if not message:
             history[-1] = (display_user, _render_text_message("Please type some text.", ""))
-            yield history, ""
+            yield _serialize_chat_history(history), ""
             return
         audio, status = app.run_t2s(
             message,

@@ -1999,13 +2076,13 @@ def _chat_handler(
             t2s_pitch,
         )
         history[-1] = (display_user, _render_audio_message(status, audio))
-        yield history, ""
+        yield _serialize_chat_history(history), ""
         return
 
     if mode == "ASR":
         if not audio_in:
             history[-1] = (display_user, _render_text_message("Please upload audio.", ""))
-            yield history, ""
+            yield _serialize_chat_history(history), ""
             return
         for text, status in app.run_s2t_stream(
             audio_in,

@@ -2016,13 +2093,13 @@ def _chat_handler(
             update_every=32,
         ):
             history[-1] = (display_user, _render_text_message(status, text))
-            yield history, ""
+            yield _serialize_chat_history(history), ""
         return
 
     if mode == "MMU (Video → Text)":
         if not video_in:
             history[-1] = (display_user, _render_text_message("Please upload a video.", ""))
-            yield history, ""
+            yield _serialize_chat_history(history), ""
             return
         for text, status in app.run_v2t_stream(
             video_in,

@@ -2032,13 +2109,13 @@ def _chat_handler(
             update_every=32,
         ):
             history[-1] = (display_user, _render_text_message(status, text))
-            yield history, ""
+            yield _serialize_chat_history(history), ""
         return
 
     if mode == "Image Generation":
         if not message:
             history[-1] = (display_user, _render_text_message("Please provide a prompt.", ""))
-            yield history, ""
+            yield _serialize_chat_history(history), ""
             return
         for image, status in app.run_t2i_stream(
             message,

@@ -2048,17 +2125,17 @@ def _chat_handler(
             update_every=2,
         ):
             history[-1] = (display_user, _render_image_message(status, image))
-            yield history, ""
+            yield _serialize_chat_history(history), ""
         return
 
     if mode == "Image Editing":
         if not image_in:
             history[-1] = (display_user, _render_text_message("Please upload an image.", ""))
-            yield history, ""
+            yield _serialize_chat_history(history), ""
             return
         if not message:
             history[-1] = (display_user, _render_text_message("Please provide an edit instruction.", ""))
-            yield history, ""
+            yield _serialize_chat_history(history), ""
             return
         for image, status in app.run_i2i_stream(
             message,

@@ -2069,14 +2146,22 @@ def _chat_handler(
             update_every=2,
         ):
             history[-1] = (display_user, _render_image_message(status, image))
-            yield history, ""
+            yield _serialize_chat_history(history), ""
         return
 
     if mode == "MMU (Image → Text)":
         if not image_in:
             history[-1] = (display_user, _render_text_message("Please upload an image.", ""))
-            yield history, ""
+            yield _serialize_chat_history(history), ""
             return
+        # Keep MMU QA consistent with chat mask-pill UX.
+        try:
+            mmu_mask_count = max(16, min(int(mmu_max_tokens or 128), 256))
+        except Exception:
+            mmu_mask_count = 128
+        mmu_mask_surface = " ".join(["<mdm_mask>"] * mmu_mask_count)
+        history[-1] = (display_user, _render_text_message("Generating...", mmu_mask_surface))
+        yield _serialize_chat_history(history), ""
         reply, status = app.run_mmu(
             images=[image_in],
             message=message,

@@ -2086,11 +2171,11 @@ def _chat_handler(
             temperature=mmu_temperature,
         )
         history[-1] = (display_user, _render_text_message(status, reply))
-        yield history, ""
+        yield _serialize_chat_history(history), ""
         return
 
     history[-1] = (display_user, _render_text_message("Unsupported mode.", ""))
-    yield history, ""
+    yield _serialize_chat_history(history), ""
 
 with demo:
     def _hide_intro():
@@ -2178,14 +2263,19 @@ with demo:
 
 if __name__ == "__main__":
     _launch_kwargs = {
-        "allowed_paths": [
+        "allowed_paths": [
+            str(PREVIEW_DIR),
+            str(PROJECT_ROOT),
+            str(ASSET_ROOT),
+            "/tmp",
+        ],
     }
     if GRADIO_V6_PLUS:
         _launch_kwargs.update(
             {
                 "css": CUSTOM_CSS + EXTRA_CSS,
                 "theme": theme,
-                "js": FORCE_LIGHT_MODE_JS
+                "js": FORCE_LIGHT_MODE_JS
             }
         )
     demo.launch(**_launch_kwargs)
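Gradio only serves files whose paths fall under an explicit allow-list, so widening allowed_paths is what lets the chat surface previews written to PREVIEW_DIR, the asset cache, and /tmp. A minimal standalone illustration with hypothetical paths (not this app's wiring):

    import gradio as gr

    demo = gr.Interface(lambda s: s, "text", "text")
    # Files under these roots may be served to the browser; requests for
    # anything outside them are rejected.
    demo.launch(allowed_paths=["/tmp", "./generated_previews"])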