Spaces:

mispeech
/

Dasheng-AudioGen

Running

File size: 28,181 Bytes

d46b5d9
 
 
 
d8d5fca
d46b5d9
 
f1efebb
 
 
f112acd
f1efebb
 
 
 
 
 
 
 
 
d46b5d9
f112acd
 
 
 
 
f1efebb
d46b5d9
f112acd
d8d5fca
d46b5d9
 
 
 
f112acd
 
 
 
 
d46b5d9
 
 
f112acd
 
 
d46b5d9
 
 
 
f112acd
 
 
 
 
d46b5d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8d5fca
 
 
 
 
 
 
 
 
f112acd
d8d5fca
 
 
 
f1efebb
d8d5fca
 
 
 
 
 
 
 
 
 
f112acd
 
 
d8d5fca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d46b5d9
d8d5fca
f112acd
 
d8d5fca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f112acd
 
 
d8d5fca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f112acd
f1efebb
 
 
 
 
d8d5fca
f1efebb
 
 
 
d8d5fca
 
 
 
 
 
f1efebb
 
 
f112acd
 
f1efebb
 
 
 
 
 
 
f112acd
 
 
 
 
 
 
 
f1efebb
 
 
 
 
 
d8d5fca
 
 
 
 
f1efebb
 
d8d5fca
f1efebb
 
f112acd
 
 
 
f1efebb
 
 
 
d8d5fca
f1efebb
 
 
 
f112acd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8d5fca
 
 
 
f112acd
 
d8d5fca
 
 
f112acd
 
 
d8d5fca
 
 
 
 
 
f112acd
d8d5fca
 
 
f1efebb
 
 
 
 
f112acd
 
 
 
 
f1efebb
d46b5d9
 
3b2e714
 
 
 
 
 
 
 
 
 
 
d46b5d9
 
 
 
 
 
 
 
f1efebb
3b2e714
f1efebb
3b2e714
d46b5d9
3b2e714
 
 
d46b5d9
 
f1efebb
d46b5d9
 
 
 
 
 
 
 
f112acd
 
 
d46b5d9
 
 
 
 
 
f112acd
 
 
 
 
 
d46b5d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a4fc2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d46b5d9
 
f112acd
d46b5d9
 
 
 
 
 
 
 
2a4fc2a
d46b5d9
 
 
2a4fc2a
f112acd
9df66ca
 
 
 
 
 
 
 
 
 
 
 
 
 
d46b5d9
b9beba5
 
d46b5d9
 
 
 
 
 
 
 
 
 
 
 
 
3b2e714
 
 
 
f1efebb
2a4fc2a
e2b2b1c
 
0ffd937
2a4fc2a
0ffd937
2a4fc2a
 
0b875b1
 
 
33a63c2
f1efebb
 
5943dd3
2a4fc2a
 
 
d46b5d9
 
 
 
2a4fc2a
d46b5d9
f236a22
 
 
 
d46b5d9
 
 
2a4fc2a
d46b5d9
f112acd
 
 
d46b5d9
2a4fc2a
d46b5d9
 
 
 
2a4fc2a
d46b5d9
f236a22
 
 
 
 
 
 
 
 
 
 
f112acd
 
 
 
 
 
 
f236a22
 
 
 
 
d46b5d9
 
 
 
 
 
 
 
3b2e714
 
 
 
f1efebb
2a4fc2a
e2b2b1c
 
2a4fc2a
 
 
0ffd937
0b875b1
 
 
33a63c2
f1efebb
 
d46b5d9
2a4fc2a
 
d46b5d9
 
 
 
2a4fc2a
d46b5d9
 
 
 
 
 
 
 
 
2a4fc2a
d46b5d9
 
 
 
2a4fc2a
d46b5d9
 
 
 
 
 
 
2a4fc2a
d46b5d9
 
 
 
2a4fc2a
d46b5d9
 
 
 
2a4fc2a
d46b5d9
 
 
 
 
f112acd
 
 
d46b5d9
2a4fc2a
d46b5d9
 
 
 
2a4fc2a
d46b5d9
 
 
f112acd
 
 
d46b5d9
 
 
 
 
 
f1efebb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d46b5d9
 
 
 
 
 
 
f112acd
 
 
d46b5d9
 
 
f112acd
 
d46b5d9
2a4fc2a
 
 
 
 
 
 
 
 
f112acd
 
d46b5d9
f236a22
 
 
 
 
 
 
d46b5d9

#!/usr/bin/env python3

import os
import json
import gzip
import requests
import gradio as gr
from openai import OpenAI
from prompt_refiner_prompt import prompt_refiner_prompt as _PROMPT_REFINER_SYSTEM


# API Endpoints — required environment variables, no defaults
def _require_env(name: str) -> str:
    value = os.environ.get(name)
    if not value:
        raise EnvironmentError(
            f"Required environment variable '{name}' is not set. "
            "Please set it before launching the app."
        )
    return value


AUDIOGEN_API_URL = _require_env("AUDIOGEN_API_URL")
LLM_BASE_URL = os.environ.get("LLM_BASE_URL", "")
CLAW_API_URL = os.environ.get("CLAW_API_URL", "")
XI_API_BASE_URL = "https://api.xi-ai.cn/v1"
PROMPT_REFINER_MAX_RETRIES = 3

PROMPT_REFINER_MODE = os.environ.get("PROMPT_REFINER_MODE", "xi_api")

# Special token order and mapping
SPECIAL_TOKEN_ORDER = ["caption", "speech", "sfx", "music", "env", "asr"]
SPECIAL_TOKEN_MAP = {
    "caption": "<|caption|>",
    "speech": "<|speech|>",
    "sfx": "<|sfx|>",
    "music": "<|music|>",
    "env": "<|env|>",
    "asr": "<|asr|>",
}


def build_structured_prompt(
    caption="", speech="", sfx="", music="", env="", asr=""
):
    """Assemble a structured prompt string from individual token fields.
    Only tokens with non-empty values are included."""
    fields = {
        "caption": caption,
        "speech": speech,
        "sfx": sfx,
        "music": music,
        "env": env,
        "asr": asr,
    }
    parts = []
    for token in SPECIAL_TOKEN_ORDER:
        value = (fields[token] or "").strip()
        if value:
            parts.append(f"{SPECIAL_TOKEN_MAP[token]} {value}")
    return " ".join(parts)


def call_audiogen(structured_prompt):
    """POST the structured prompt to AudioGen API, save and return the WAV path."""
    if not structured_prompt.strip():
        return None, "Error: Prompt is empty"

    try:
        response = requests.post(
            AUDIOGEN_API_URL,
            headers={"Content-Type": "application/json"},
            json={"text": structured_prompt},
            timeout=120,
        )
        response.raise_for_status()

        os.makedirs("./outputs", exist_ok=True)
        output_path = "./outputs/audiogen_output.wav"
        with open(output_path, "wb") as f:
            f.write(response.content)

        return output_path, "Generation successful!"
    except requests.exceptions.ConnectionError:
        return None, "Error: Cannot connect to AudioGen API. Please check the service."
    except requests.exceptions.HTTPError as e:
        return None, f"Error: HTTP {e.response.status_code} - {e.response.reason}"
    except requests.exceptions.Timeout:
        return None, "Error: Request timed out."
    except Exception as e:
        return None, f"Error: {str(e)}"


def _parse_and_validate(raw_content: str, attempt: int):
    """Parse JSON and validate required 'caption' field. Returns (dict|None, error_str)."""
    try:
        parsed = json.loads(raw_content)
    except json.JSONDecodeError as e:
        return None, f"Invalid JSON on attempt {attempt}: {e}"

    normalized = {
        k.lower(): v
        for k, v in parsed.items() if v is not None and str(v).strip()
    }
    if not normalized.get("caption"):
        return None, f"Missing required 'Caption' field on attempt {attempt}."
    return normalized, None


def _decode_claw_response_json(response: requests.Response) -> dict:
    """Decode CLAW response robustly, including mis-labeled gzip responses."""
    raw_bytes = response.raw.read(decode_content=False)

    candidates = []
    # 1) Treat as plain utf-8 text first (some responses are plain text but mislabeled)
    candidates.append(raw_bytes.decode("utf-8", errors="replace"))
    # 2) Try gzip decode as fallback when content-encoding is incorrect/mixed
    try:
        candidates.append(
            gzip.decompress(raw_bytes).decode("utf-8", errors="replace")
        )
    except Exception:
        pass

    last_err = None
    for text in candidates:
        try:
            return json.loads(text)
        except Exception as e:
            last_err = e
    raise ValueError(f"Unable to decode CLAW JSON response: {last_err}")


def _call_prompt_refiner_claw(user_input: str, max_retries: int) -> dict:
    """Call Prompt Refiner via CLAW endpoint (no auth required).
    Sends the full prompt template with user input substituted as plain text.
    """
    # Substitute user input into the prompt template
    full_prompt = _PROMPT_REFINER_SYSTEM.replace("{{user_input}}",
                                                 user_input).strip()

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.post(
                CLAW_API_URL,
                headers={"Content-Type": "text/plain"},
                data=full_prompt.encode("utf-8"),
                timeout=60,
                stream=True,
            )
            response.raise_for_status()

            # Response has same structure as OpenAI: choices[0].message.content
            resp_json = _decode_claw_response_json(response)
            raw_content = resp_json["choices"][0]["message"]["content"]

            result, err = _parse_and_validate(raw_content, attempt)
            if err:
                last_error = err
                continue
            return result

        except requests.exceptions.HTTPError as e:
            code = e.response.status_code
            raise RuntimeError(
                f"CLAW API HTTP error {code}: {e.response.reason}"
            ) from e
        except requests.exceptions.ConnectionError as e:
            raise RuntimeError(f"CLAW API connection error: {e}") from e
        except requests.exceptions.Timeout:
            last_error = f"CLAW API timed out on attempt {attempt}."
        except Exception as e:
            last_error = f"CLAW API error on attempt {attempt}: {e}"

    raise RuntimeError(
        f"Prompt Refiner (claw) failed after {max_retries} attempt(s). "
        f"Last error: {last_error}"
    )


def _call_prompt_refiner_openai(user_input: str, max_retries: int) -> dict:
    """Call Prompt Refiner via OpenAI-compatible chat completions endpoint."""
    api_key = os.environ.get("API_KEY")
    model_name = os.environ.get("MODEL_NAME")

    if not api_key:
        raise EnvironmentError(
            "API_KEY environment variable is not set. "
            "Please set it before using Auto Mode (openai mode)."
        )
    if not model_name:
        raise EnvironmentError(
            "MODEL_NAME environment variable is not set. "
            "Please set it before using Auto Mode (openai mode)."
        )
    if not LLM_BASE_URL:
        raise EnvironmentError(
            "LLM_BASE_URL environment variable is not set. "
            "Please set it before using Auto Mode (openai mode)."
        )

    client = OpenAI(api_key=api_key, base_url=LLM_BASE_URL)
    system_content = _PROMPT_REFINER_SYSTEM.replace("{{user_input}}",
                                                    "").strip()

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            completion = client.chat.completions.create(
                model=model_name,
                messages=[
                    {
                        "role": "system",
                        "content": system_content
                    },
                    {
                        "role": "user",
                        "content": user_input
                    },
                ],
                max_completion_tokens=1024,
                response_format={"type": "json_object"},
            )
            raw_content = completion.choices[0].message.content

            result, err = _parse_and_validate(raw_content, attempt)
            if err:
                last_error = err
                continue
            return result

        except EnvironmentError:
            raise
        except Exception as e:
            err_str = str(e).lower()
            if any(
                kw in err_str for kw in
                ("authentication", "api_key", "unauthorized", "403", "401")
            ):
                raise RuntimeError(f"Prompt Refiner auth error: {e}") from e
            last_error = f"API error on attempt {attempt}: {e}"

    raise RuntimeError(
        f"Prompt Refiner (openai) failed after {max_retries} attempt(s). "
        f"Last error: {last_error}"
    )


def _call_prompt_refiner_xi_api(user_input: str, max_retries: int) -> dict:
    """Call Prompt Refiner via XI API chat completions endpoint."""
    api_key = os.environ.get("XI_API_KEY")
    model_name = os.environ.get("XI_MODEL_NAME", "deepseek-v4-flash")

    if not api_key:
        raise EnvironmentError(
            "XI_API_KEY environment variable is not set. "
            "Please set it before using Auto Mode (xi_api mode)."
        )

    client = OpenAI(api_key=api_key, base_url=XI_API_BASE_URL)
    system_content = _PROMPT_REFINER_SYSTEM.replace("{{user_input}}",
                                                    "").strip()

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            completion = client.chat.completions.create(
                model=model_name,
                messages=[
                    {
                        "role": "system",
                        "content": system_content
                    },
                    {
                        "role": "user",
                        "content": user_input
                    },
                ],
            )
            raw_content = completion.choices[0].message.content

            result, err = _parse_and_validate(raw_content, attempt)
            if err:
                last_error = err
                continue
            return result

        except EnvironmentError:
            raise
        except Exception as e:
            err_str = str(e).lower()
            if any(
                kw in err_str for kw in
                ("authentication", "api_key", "unauthorized", "403", "401")
            ):
                raise RuntimeError(f"Prompt Refiner auth error: {e}") from e
            last_error = f"XI API error on attempt {attempt}: {e}"

    raise RuntimeError(
        f"Prompt Refiner (xi_api) failed after {max_retries} attempt(s). "
        f"Last error: {last_error}"
    )


def call_prompt_refiner(user_input, max_retries=PROMPT_REFINER_MAX_RETRIES):
    """Dispatch to the configured Prompt Refiner backend.

    Mode is controlled by the PROMPT_REFINER_MODE environment variable:
      'xi_api' — XI API chat completions endpoint (default)
      'claw'   — CLAW plain-text endpoint, no auth required
      'openai' — OpenAI-compatible chat completions endpoint
    """
    mode = PROMPT_REFINER_MODE.lower()
    if mode == "xi_api":
        return _call_prompt_refiner_xi_api(user_input, max_retries)
    elif mode == "openai":
        return _call_prompt_refiner_openai(user_input, max_retries)
    elif mode == "claw":
        return _call_prompt_refiner_claw(user_input, max_retries)
    else:
        raise ValueError(
            f"Unknown PROMPT_REFINER_MODE '{mode}'. "
            "Valid values: 'xi_api' (default), 'claw', 'openai'."
        )


def build_caption_from_refined(refined: dict) -> str:
    """Build the full structured prompt string from a refined dict.
    This is a convenience wrapper around build_structured_prompt."""
    return build_structured_prompt(
        caption=refined.get("caption", ""),
        speech=refined.get("speech", ""),
        sfx=refined.get("sfx", ""),
        music=refined.get("music", ""),
        env=refined.get("env", ""),
        asr=refined.get("asr", ""),
    )


def _build_auto_mode_llm_error(message: str) -> str:
    """Return a clear UI-facing error message for Prompt Refiner failures."""
    return (
        "Auto Mode is currently unavailable because the Prompt Refiner LLM API "
        f"failed: {message} "
        "You can still try Manual Mode, which may remain available because it "
        "does not depend on the Prompt Refiner. "
        "If the issue continues, contact jiahaomei@sjtu.edu.cn."
    )


def auto_generate(caption, progress=gr.Progress()):
    """Mode 1: Call Prompt Refiner -> build structured prompt -> call AudioGen."""
    if not (caption or "").strip():
        return None, "", "Error: Please enter a description."

    progress(0.1, desc="Calling Prompt Refiner...")
    try:
        refined = call_prompt_refiner(caption)
    except EnvironmentError as e:
        return None, "", _build_auto_mode_llm_error(f"Configuration error. {e}")
    except RuntimeError as e:
        return None, "", _build_auto_mode_llm_error(str(e))
    except Exception as e:
        return None, "", _build_auto_mode_llm_error(
            f"Unexpected error while calling Prompt Refiner. {e}"
        )

    progress(0.4, desc="Building structured prompt...")
    structured_prompt = build_caption_from_refined(refined)

    progress(0.6, desc="Generating audio...")
    audio_path, status = call_audiogen(structured_prompt)

    progress(1.0)
    return audio_path, structured_prompt, status


def manual_generate(
    caption, speech, sfx, music, env, asr, progress=gr.Progress()
):
    """Mode 2: Build structured prompt from individual fields -> call AudioGen."""
    if not (caption or "").strip():
        return None, "", "Error: Caption is required."

    progress(0.2, desc="Building structured prompt...")
    structured_prompt = build_structured_prompt(
        caption=caption,
        speech=speech,
        sfx=sfx,
        music=music,
        env=env,
        asr=asr,
    )

    progress(0.5, desc="Generating audio...")
    audio_path, status = call_audiogen(structured_prompt)

    progress(1.0)
    return audio_path, structured_prompt, status


# Custom CSS
custom_css = """
.prompt-preview textarea {
    font-family: monospace !important;
    font-size: 12px !important;
}
.mode-radio label {
    font-weight: 600 !important;
}
.banner-warning {
    padding: 12px 16px;
    background: rgba(255, 193, 7, 0.12);
    border: 2px solid #d4920a;
    border-radius: 6px;
    margin-bottom: 12px;
    font-size: 14px;
    line-height: 1.9;
}
.dark .banner-warning {
    background: rgba(255, 193, 7, 0.07) !important;
    border-color: #c8860a !important;
}
.banner-warning a {
    color: #1a73e8;
}
.dark .banner-warning a {
    color: #7aafff !important;
}
"""


# ── Mode switching helper ──────────────────────────────────────────────────────
def switch_mode(mode):
    is_auto = mode == "🤖 Auto Mode"
    return gr.update(visible=is_auto), gr.update(visible=not is_auto)


# ── Gradio UI ──────────────────────────────────────────────────────────────────
with gr.Blocks(
    title="Dasheng AudioGen Demo",
    theme=gr.themes.Soft(),
    css=custom_css,
) as demo:
    gr.Markdown("# 🔊 Dasheng AudioGen Demo")
    gr.Markdown("Developed by SJTU X-LANCE & Xiaomi LLM Plus")
    gr.HTML(
        '<div style="padding: 16px 20px; background: rgba(220, 38, 38, 0.10); '
        'border: 2px solid #dc2626; border-radius: 8px; margin: 12px 0 16px 0; '
        'font-size: 15px; line-height: 2;">'
        "🚧 <strong>由于接口变动，在线 Demo 服务暂时不可用，正在维护中。</strong><br>"
        "您可以自行部署模型权重使用："
        '<a href="https://huggingface.co/mispeech/Dasheng-AudioGen" target="_blank">'
        "mispeech/Dasheng-AudioGen</a><br><br>"
        "🚧 <strong>Due to API changes, the online Demo is temporarily unavailable and under maintenance.</strong><br>"
        "You can self-deploy using the model weights: "
        '<a href="https://huggingface.co/mispeech/Dasheng-AudioGen" target="_blank">'
        "mispeech/Dasheng-AudioGen</a>"
        "</div>"
    )
    gr.Markdown(
        "支持结构化 Prompt 的混合音频生成，可用自然语言描述场景(Auto mode)或逐轨道填写(Manual mode)，一次生成包含音乐、可理解人声和音效的完整音频。。  \n"
        "Structured-prompt mixed audio generation that lets you describe a scene in natural language (Auto mode) or specify tracks manually (Manual mode), producing a complete audio clip with music, intelligible speech, and sound effects in one pass."
    )

    # Mode selector
    mode_radio = gr.Radio(
        choices=["🤖 Auto Mode", "✏️ Manual Mode"],
        value="🤖 Auto Mode",
        label="Generation Mode",
        interactive=True,
        elem_classes=["mode-radio"],
    )

    # ── Auto Mode section ──────────────────────────────────────────────────────
    with gr.Column(visible=True) as auto_section:
        gr.Markdown(
            "## 🤖 Auto Mode  \n"
            "If Auto Mode is unavailable, contact jiahaomei@sjtu.edu.cn."
        )
        gr.HTML(
            '<div class="banner-warning">'
            "⚠️ <strong>一次生成包含音乐、可理解人声和音效的完整音频。若生成音频质量较差或 Speech 内容不完整，可多尝试几次。</strong><br>"
            "⚠️ <strong>Producing a complete audio clip with music, intelligible speech, and sound effects in one pass. If the generated audio quality is poor or speech content is incomplete, please try generating again.</strong>"
            "<br><br>"
            "💬 你可以输入任意语言的音频描述，Prompt Refiner 会进行自动转换。目前 Speech 合成只支持英文，多语言支持即将上线。<br>"
            "💬 You can enter audio descriptions in any language — the Prompt Refiner will automatically convert them. "
            "Currently, speech synthesis only supports English. Multi-language support coming soon."
            "<br><br>"
            "🌐 Web Demo: "
            '<a href="https://nieeim.github.io/Dasheng-AudioGen-Web/" target="_blank">DashengAudioGen Web Demo</a><br>'
            "📦 GitHub Repo: "
            '<a href="https://github.com/xiaomi-research/dasheng-audiogen" target="_blank">DashengAudioGen GitHub Repository</a>'
            "</div>"
        )
        gr.Markdown(
            "输入整体音频描述，系统将调用 **Prompt Refiner** 自动转换为结构化 Prompt，再通过 **DashengAudioGen** 生成音频。  \n"
            "Enter an overall audio description. The system will call the **Prompt Refiner** to convert it into a "
            "structured prompt, then generate audio via **DashengAudioGen**."
        )
        with gr.Row():
            with gr.Column():
                auto_caption = gr.Textbox(
                    label="整体音频描述 / Overall Audio Description",
                    placeholder=(
                        'e.g., A train station broadcast says, '
                        '"Train G128 is arriving on platform three, '
                        'please stand behind the yellow line." '
                        'with warning beeps, light orchestral bed, and station ambience.'
                    ),
                    lines=4,
                )
                auto_button = gr.Button("Generate", variant="primary")
            with gr.Column():
                auto_audio = gr.Audio(
                    label="生成音频 / Generated Audio", type="filepath"
                )
                auto_prompt_preview = gr.Textbox(
                    label="结构化 Prompt 预览 / Structured Prompt (Preview)",
                    lines=4,
                    interactive=False,
                    elem_classes=["prompt-preview"],
                )
                auto_status = gr.Textbox(label="状态 / Status")

        gr.Examples(
            examples=[
                [
                    'A game announcer shouts, "Final round begins now, give it everything you have!" with crowd cheers, drum hits, and stadium ambience.',
                ],
                [
                    'A café barista says, "Your caramel latte is ready at the counter." while soft jazz plays, cups clink, and indoor café ambience continues.',
                ],
                [
                    'A train station broadcast says, "Train G128 is arriving on platform three, please stand behind the yellow line." with warning beeps, light orchestral bed, and station ambience.',
                ],
                [
                    'A radio host announces traffic updates over upbeat pop music with city street ambience.'
                ],
                ['安静的午后咖啡馆,一男一女在讨论天气'],
                [
                    '列车员使用英文报站,说“Train G128 is arriving on platform three, please stand behind the yellow line.”'
                ],
            ],
            inputs=[auto_caption],
            label="Examples",
        )

        auto_button.click(
            fn=auto_generate,
            inputs=[auto_caption],
            outputs=[auto_audio, auto_prompt_preview, auto_status],
        )

    # ── Manual Mode section ────────────────────────────────────────────────────
    with gr.Column(visible=False) as manual_section:
        gr.Markdown(
            "## ✏️ Manual Mode  \n"
            "If Manual Mode is unavailable, contact jiahaomei@sjtu.edu.cn."
        )
        gr.HTML(
            '<div class="banner-warning">'
            "⚠️ <strong>一次生成包含音乐、可理解人声和音效的完整音频。若生成音频质量较差或 Speech 内容不完整，可多尝试几次。</strong><br>"
            "⚠️ <strong>Producing a complete audio clip with music, intelligible speech, and sound effects in one pass. If the generated audio quality is poor or speech content is incomplete, please try generating again.</strong>"
            "<br><br>"
            "🔤 目前 Speech 合成只支持英文，多语言支持即将上线。<br>"
            "🔤 Currently, speech synthesis only supports English. Multi-language support coming soon."
            "<br><br>"
            "🌐 Web Demo: "
            '<a href="https://nieeim.github.io/Dasheng-AudioGen-Web/" target="_blank">DashengAudioGen Web Demo</a><br>'
            "📦 GitHub Repo: "
            '<a href="https://github.com/xiaomi-research/dasheng-audiogen" target="_blank">DashengAudioGen GitHub Repository</a>'
            "</div>"
        )
        gr.Markdown(
            "逐轨道填写音频元素，仅 **Caption** 为必填项，其余字段均为可选。  \n"
            "Fill in each track individually. Only **Caption** is required; all other fields are optional."
        )
        with gr.Row():
            with gr.Column():
                man_caption = gr.Textbox(
                    label="Caption — 整体描述 / Overall Description *",
                    placeholder=(
                        'e.g., A train station broadcast says, '
                        '"Train G128 is arriving on platform three, '
                        'please stand behind the yellow line." '
                        'with warning beeps and station ambience.'
                    ),
                    lines=3,
                )
                man_speech = gr.Textbox(
                    label="Speech — 说话人身份与风格 / Speaker Identity & Style",
                    placeholder="e.g., female announcer, calm and clear tone",
                    lines=1,
                )
                man_asr = gr.Textbox(
                    label="ASR — 语音文字 / Speech Transcript",
                    placeholder=(
                        "e.g., Train G128 is arriving on platform three, "
                        "please stand behind the yellow line."
                    ),
                    lines=2,
                )
                man_sfx = gr.Textbox(
                    label="SFX — 音效 / Sound Effects",
                    placeholder="e.g., warning beeps",
                    lines=1,
                )
                man_music = gr.Textbox(
                    label="Music — 背景音乐 / Background Music",
                    placeholder="e.g., light orchestral underscore",
                    lines=1,
                )
                man_env = gr.Textbox(
                    label="ENV — 环境音 / Environmental & Ambient Sound",
                    placeholder="e.g., train station ambience",
                    lines=1,
                )
                man_button = gr.Button("Generate Audio", variant="primary")
            with gr.Column():
                man_audio = gr.Audio(
                    label="生成音频 / Generated Audio", type="filepath"
                )
                man_prompt_preview = gr.Textbox(
                    label="结构化 Prompt 预览 / Structured Prompt (Preview)",
                    lines=5,
                    interactive=False,
                    elem_classes=["prompt-preview"],
                )
                man_status = gr.Textbox(label="状态 / Status")

        man_button.click(
            fn=manual_generate,
            inputs=[
                man_caption, man_speech, man_sfx, man_music, man_env, man_asr
            ],
            outputs=[man_audio, man_prompt_preview, man_status],
        )

        gr.Examples(
            examples=[
                [
                    "A game announcer shouts with crowd cheers, drum hits, and indoor stadium ambience.",
                    "excited male game announcer",
                    "crowd cheers and impacts",
                    "energetic drum rhythm",
                    "indoor stadium ambience",
                    "Final round begins now, give it everything you have!",
                ],
                [
                    "A café barista makes an announcement while soft jazz plays, cups clink, and indoor café ambience continues.",
                    "barista announcement",
                    "cup and spoon clinks",
                    "soft jazz trio",
                    "indoor cafe ambience",
                    "Your caramel latte is ready at the counter.",
                ],
                [
                    "A train station broadcast makes an announcement with warning beeps, light orchestral bed, and station ambience.",
                    "station public announcement",
                    "warning beeps",
                    "light orchestral underscore",
                    "train station ambience",
                    "Train G128 is arriving on platform three, please stand behind the yellow line.",
                ],
            ],
            inputs=[
                man_caption, man_speech, man_sfx, man_music, man_env, man_asr
            ],
            label="Examples",
        )

        gr.Markdown(
            r"""
---
### Special Token 说明 / Special Token Reference
| Token | 字段 / Field | 说明 / Description |
|-------|------------|-------------------|
| `<\|caption\|>` | Caption | 整体音频场景描述（必填）/ Overall audio scene description (required) |
| `<\|speech\|>` | Speech | 说话人身份与风格 / Speaker identity & speaking style |
| `<\|asr\|>` | ASR | 语音文字内容 / Actual transcript of speech content |
| `<\|sfx\|>` | SFX | 音效描述 / Sound effects present in the audio |
| `<\|music\|>` | Music | 背景音乐描述 / Background music description |
| `<\|env\|>` | ENV | 环境音 / Environmental & ambient sound |
"""
        )

    # Mode switching event
    mode_radio.change(
        fn=switch_mode,
        inputs=[mode_radio],
        outputs=[auto_section, manual_section],
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)