File size: 28,181 Bytes
d46b5d9
 
 
 
d8d5fca
d46b5d9
 
f1efebb
 
 
f112acd
f1efebb
 
 
 
 
 
 
 
 
d46b5d9
f112acd
 
 
 
 
f1efebb
d46b5d9
f112acd
d8d5fca
d46b5d9
 
 
 
f112acd
 
 
 
 
d46b5d9
 
 
f112acd
 
 
d46b5d9
 
 
 
f112acd
 
 
 
 
d46b5d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8d5fca
 
 
 
 
 
 
 
 
f112acd
d8d5fca
 
 
 
f1efebb
d8d5fca
 
 
 
 
 
 
 
 
 
f112acd
 
 
d8d5fca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d46b5d9
d8d5fca
f112acd
 
d8d5fca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f112acd
 
 
d8d5fca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f112acd
f1efebb
 
 
 
 
d8d5fca
f1efebb
 
 
 
d8d5fca
 
 
 
 
 
f1efebb
 
 
f112acd
 
f1efebb
 
 
 
 
 
 
f112acd
 
 
 
 
 
 
 
f1efebb
 
 
 
 
 
d8d5fca
 
 
 
 
f1efebb
 
d8d5fca
f1efebb
 
f112acd
 
 
 
f1efebb
 
 
 
d8d5fca
f1efebb
 
 
 
f112acd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8d5fca
 
 
 
f112acd
 
d8d5fca
 
 
f112acd
 
 
d8d5fca
 
 
 
 
 
f112acd
d8d5fca
 
 
f1efebb
 
 
 
 
f112acd
 
 
 
 
f1efebb
d46b5d9
 
3b2e714
 
 
 
 
 
 
 
 
 
 
d46b5d9
 
 
 
 
 
 
 
f1efebb
3b2e714
f1efebb
3b2e714
d46b5d9
3b2e714
 
 
d46b5d9
 
f1efebb
d46b5d9
 
 
 
 
 
 
 
f112acd
 
 
d46b5d9
 
 
 
 
 
f112acd
 
 
 
 
 
d46b5d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a4fc2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d46b5d9
 
f112acd
d46b5d9
 
 
 
 
 
 
 
2a4fc2a
d46b5d9
 
 
2a4fc2a
f112acd
9df66ca
 
 
 
 
 
 
 
 
 
 
 
 
 
d46b5d9
b9beba5
 
d46b5d9
 
 
 
 
 
 
 
 
 
 
 
 
3b2e714
 
 
 
f1efebb
2a4fc2a
e2b2b1c
 
0ffd937
2a4fc2a
0ffd937
2a4fc2a
 
0b875b1
 
 
33a63c2
f1efebb
 
5943dd3
2a4fc2a
 
 
d46b5d9
 
 
 
2a4fc2a
d46b5d9
f236a22
 
 
 
d46b5d9
 
 
2a4fc2a
d46b5d9
f112acd
 
 
d46b5d9
2a4fc2a
d46b5d9
 
 
 
2a4fc2a
d46b5d9
f236a22
 
 
 
 
 
 
 
 
 
 
f112acd
 
 
 
 
 
 
f236a22
 
 
 
 
d46b5d9
 
 
 
 
 
 
 
3b2e714
 
 
 
f1efebb
2a4fc2a
e2b2b1c
 
2a4fc2a
 
 
0ffd937
0b875b1
 
 
33a63c2
f1efebb
 
d46b5d9
2a4fc2a
 
d46b5d9
 
 
 
2a4fc2a
d46b5d9
 
 
 
 
 
 
 
 
2a4fc2a
d46b5d9
 
 
 
2a4fc2a
d46b5d9
 
 
 
 
 
 
2a4fc2a
d46b5d9
 
 
 
2a4fc2a
d46b5d9
 
 
 
2a4fc2a
d46b5d9
 
 
 
 
f112acd
 
 
d46b5d9
2a4fc2a
d46b5d9
 
 
 
2a4fc2a
d46b5d9
 
 
f112acd
 
 
d46b5d9
 
 
 
 
 
f1efebb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d46b5d9
 
 
 
 
 
 
f112acd
 
 
d46b5d9
 
 
f112acd
 
d46b5d9
2a4fc2a
 
 
 
 
 
 
 
 
f112acd
 
d46b5d9
f236a22
 
 
 
 
 
 
d46b5d9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
#!/usr/bin/env python3

import os
import json
import gzip
import requests
import gradio as gr
from openai import OpenAI
from prompt_refiner_prompt import prompt_refiner_prompt as _PROMPT_REFINER_SYSTEM


# API Endpoints — required environment variables, no defaults
def _require_env(name: str) -> str:
    value = os.environ.get(name)
    if not value:
        raise EnvironmentError(
            f"Required environment variable '{name}' is not set. "
            "Please set it before launching the app."
        )
    return value


# AudioGen endpoint: resolved at import time, so a missing or empty value
# raises EnvironmentError before the UI is built.
AUDIOGEN_API_URL = _require_env("AUDIOGEN_API_URL")
# Optional endpoints: fall back to "" when the variable is unset.
LLM_BASE_URL = os.environ.get("LLM_BASE_URL", "")
CLAW_API_URL = os.environ.get("CLAW_API_URL", "")
# Fixed base URL handed to the OpenAI client by the xi_api backend.
XI_API_BASE_URL = "https://api.xi-ai.cn/v1"
# Default number of attempts for call_prompt_refiner().
PROMPT_REFINER_MAX_RETRIES = 3

# Backend selector read by call_prompt_refiner(): "xi_api" (default),
# "claw" or "openai".
PROMPT_REFINER_MODE = os.environ.get("PROMPT_REFINER_MODE", "xi_api")

# Special tokens: SPECIAL_TOKEN_ORDER fixes the order in which fields are
# emitted; SPECIAL_TOKEN_MAP gives the literal marker written before each one.
SPECIAL_TOKEN_ORDER = ["caption", "speech", "sfx", "music", "env", "asr"]
SPECIAL_TOKEN_MAP = {
    "caption": "<|caption|>",
    "speech": "<|speech|>",
    "sfx": "<|sfx|>",
    "music": "<|music|>",
    "env": "<|env|>",
    "asr": "<|asr|>",
}


def _field_to_text(value) -> str:
    """Coerce one prompt field to stripped text.

    Falsy values become "". Lists/tuples are flattened into a
    comma-separated string (empty items dropped); any other non-string
    value is rendered with ``str()``.
    """
    if not value:
        return ""
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, (list, tuple)):
        items = (_field_to_text(item) for item in value)
        return ", ".join(item for item in items if item)
    return str(value).strip()


def build_structured_prompt(
    caption="", speech="", sfx="", music="", env="", asr=""
):
    """Assemble a structured prompt string from individual token fields.

    Fields are emitted in SPECIAL_TOKEN_ORDER, each as
    ``"<|token|> value"``, joined by single spaces. Fields that are empty
    (or whitespace only) are skipped.

    Field values are normally strings, but values that originate from
    parsed JSON may be lists or numbers; those are converted to text
    instead of failing on ``.strip()``.

    Returns:
        The assembled prompt, or "" when every field is empty.
    """
    fields = {
        "caption": caption,
        "speech": speech,
        "sfx": sfx,
        "music": music,
        "env": env,
        "asr": asr,
    }
    parts = []
    for token in SPECIAL_TOKEN_ORDER:
        value = _field_to_text(fields[token])
        if value:
            parts.append(f"{SPECIAL_TOKEN_MAP[token]} {value}")
    return " ".join(parts)


def call_audiogen(structured_prompt):
    """POST the structured prompt to the AudioGen API and save the audio.

    The response body is written to a uniquely named WAV file under
    ``./outputs`` so that concurrent requests do not overwrite each
    other's result.

    Args:
        structured_prompt: Prompt text; ``None``, empty or whitespace-only
            input is rejected without calling the API.

    Returns:
        ``(path, status)``: the saved file path and a success message, or
        ``(None, "Error: ...")`` when the prompt is empty or the request
        fails. Errors are reported through the status string rather than
        raised.
    """
    import uuid  # local import: only this function needs it

    if not (structured_prompt or "").strip():
        return None, "Error: Prompt is empty"

    try:
        response = requests.post(
            AUDIOGEN_API_URL,
            headers={"Content-Type": "application/json"},
            json={"text": structured_prompt},
            timeout=120,
        )
        response.raise_for_status()

        output_dir = "./outputs"
        os.makedirs(output_dir, exist_ok=True)
        # One file per request: a shared fixed filename would let two
        # simultaneous generations clobber each other's audio.
        output_path = os.path.join(
            output_dir, f"audiogen_{uuid.uuid4().hex}.wav"
        )
        with open(output_path, "wb") as f:
            f.write(response.content)

        return output_path, "Generation successful!"
    except requests.exceptions.ConnectionError:
        return None, "Error: Cannot connect to AudioGen API. Please check the service."
    except requests.exceptions.HTTPError as e:
        return None, f"Error: HTTP {e.response.status_code} - {e.response.reason}"
    except requests.exceptions.Timeout:
        return None, "Error: Request timed out."
    except Exception as e:
        return None, f"Error: {str(e)}"


def _parse_and_validate(raw_content: str, attempt: int):
    """Parse JSON and validate required 'caption' field. Returns (dict|None, error_str)."""
    try:
        parsed = json.loads(raw_content)
    except json.JSONDecodeError as e:
        return None, f"Invalid JSON on attempt {attempt}: {e}"

    normalized = {
        k.lower(): v
        for k, v in parsed.items() if v is not None and str(v).strip()
    }
    if not normalized.get("caption"):
        return None, f"Missing required 'Caption' field on attempt {attempt}."
    return normalized, None


def _decode_claw_response_json(response: requests.Response) -> dict:
    """Decode CLAW response robustly, including mis-labeled gzip responses."""
    raw_bytes = response.raw.read(decode_content=False)

    candidates = []
    # 1) Treat as plain utf-8 text first (some responses are plain text but mislabeled)
    candidates.append(raw_bytes.decode("utf-8", errors="replace"))
    # 2) Try gzip decode as fallback when content-encoding is incorrect/mixed
    try:
        candidates.append(
            gzip.decompress(raw_bytes).decode("utf-8", errors="replace")
        )
    except Exception:
        pass

    last_err = None
    for text in candidates:
        try:
            return json.loads(text)
        except Exception as e:
            last_err = e
    raise ValueError(f"Unable to decode CLAW JSON response: {last_err}")


def _call_prompt_refiner_claw(user_input: str, max_retries: int) -> dict:
    """Call Prompt Refiner via the CLAW endpoint (no auth required).

    The full prompt template, with the user input substituted in, is sent
    as a plain-text POST body. The reply is expected to have the OpenAI
    shape ``choices[0].message.content`` containing the refined JSON.

    Args:
        user_input: Free-form audio description from the user.
        max_retries: Maximum number of attempts.

    Returns:
        The normalized refined dict produced by ``_parse_and_validate``.

    Raises:
        EnvironmentError: if CLAW_API_URL is not configured.
        RuntimeError: immediately on an HTTP error status or a connection
            error, or once all attempts are exhausted (timeouts, malformed
            replies and validation failures are retried).
    """
    # Fail fast with a configuration error, matching the other backends,
    # instead of retrying an invalid empty URL.
    if not CLAW_API_URL:
        raise EnvironmentError(
            "CLAW_API_URL environment variable is not set. "
            "Please set it before using Auto Mode (claw mode)."
        )

    # Substitute user input into the prompt template
    full_prompt = _PROMPT_REFINER_SYSTEM.replace("{{user_input}}",
                                                 user_input).strip()
    payload = full_prompt.encode("utf-8")

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            # stream=True keeps the body undecoded for the gzip-tolerant
            # reader; the context manager releases the connection on
            # every path, including errors and retries.
            with requests.post(
                CLAW_API_URL,
                headers={"Content-Type": "text/plain"},
                data=payload,
                timeout=60,
                stream=True,
            ) as response:
                response.raise_for_status()
                resp_json = _decode_claw_response_json(response)

            # Response has same structure as OpenAI: choices[0].message.content
            raw_content = resp_json["choices"][0]["message"]["content"]

            result, err = _parse_and_validate(raw_content, attempt)
            if err:
                last_error = err
                continue
            return result

        except requests.exceptions.HTTPError as e:
            code = e.response.status_code
            raise RuntimeError(
                f"CLAW API HTTP error {code}: {e.response.reason}"
            ) from e
        except requests.exceptions.ConnectionError as e:
            raise RuntimeError(f"CLAW API connection error: {e}") from e
        except requests.exceptions.Timeout:
            last_error = f"CLAW API timed out on attempt {attempt}."
        except Exception as e:
            last_error = f"CLAW API error on attempt {attempt}: {e}"

    raise RuntimeError(
        f"Prompt Refiner (claw) failed after {max_retries} attempt(s). "
        f"Last error: {last_error}"
    )


def _call_prompt_refiner_openai(user_input: str, max_retries: int) -> dict:
    """Call Prompt Refiner via OpenAI-compatible chat completions endpoint."""
    api_key = os.environ.get("API_KEY")
    model_name = os.environ.get("MODEL_NAME")

    if not api_key:
        raise EnvironmentError(
            "API_KEY environment variable is not set. "
            "Please set it before using Auto Mode (openai mode)."
        )
    if not model_name:
        raise EnvironmentError(
            "MODEL_NAME environment variable is not set. "
            "Please set it before using Auto Mode (openai mode)."
        )
    if not LLM_BASE_URL:
        raise EnvironmentError(
            "LLM_BASE_URL environment variable is not set. "
            "Please set it before using Auto Mode (openai mode)."
        )

    client = OpenAI(api_key=api_key, base_url=LLM_BASE_URL)
    system_content = _PROMPT_REFINER_SYSTEM.replace("{{user_input}}",
                                                    "").strip()

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            completion = client.chat.completions.create(
                model=model_name,
                messages=[
                    {
                        "role": "system",
                        "content": system_content
                    },
                    {
                        "role": "user",
                        "content": user_input
                    },
                ],
                max_completion_tokens=1024,
                response_format={"type": "json_object"},
            )
            raw_content = completion.choices[0].message.content

            result, err = _parse_and_validate(raw_content, attempt)
            if err:
                last_error = err
                continue
            return result

        except EnvironmentError:
            raise
        except Exception as e:
            err_str = str(e).lower()
            if any(
                kw in err_str for kw in
                ("authentication", "api_key", "unauthorized", "403", "401")
            ):
                raise RuntimeError(f"Prompt Refiner auth error: {e}") from e
            last_error = f"API error on attempt {attempt}: {e}"

    raise RuntimeError(
        f"Prompt Refiner (openai) failed after {max_retries} attempt(s). "
        f"Last error: {last_error}"
    )


def _call_prompt_refiner_xi_api(user_input: str, max_retries: int) -> dict:
    """Call Prompt Refiner via XI API chat completions endpoint."""
    api_key = os.environ.get("XI_API_KEY")
    model_name = os.environ.get("XI_MODEL_NAME", "deepseek-v4-flash")

    if not api_key:
        raise EnvironmentError(
            "XI_API_KEY environment variable is not set. "
            "Please set it before using Auto Mode (xi_api mode)."
        )

    client = OpenAI(api_key=api_key, base_url=XI_API_BASE_URL)
    system_content = _PROMPT_REFINER_SYSTEM.replace("{{user_input}}",
                                                    "").strip()

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            completion = client.chat.completions.create(
                model=model_name,
                messages=[
                    {
                        "role": "system",
                        "content": system_content
                    },
                    {
                        "role": "user",
                        "content": user_input
                    },
                ],
            )
            raw_content = completion.choices[0].message.content

            result, err = _parse_and_validate(raw_content, attempt)
            if err:
                last_error = err
                continue
            return result

        except EnvironmentError:
            raise
        except Exception as e:
            err_str = str(e).lower()
            if any(
                kw in err_str for kw in
                ("authentication", "api_key", "unauthorized", "403", "401")
            ):
                raise RuntimeError(f"Prompt Refiner auth error: {e}") from e
            last_error = f"XI API error on attempt {attempt}: {e}"

    raise RuntimeError(
        f"Prompt Refiner (xi_api) failed after {max_retries} attempt(s). "
        f"Last error: {last_error}"
    )


def call_prompt_refiner(user_input, max_retries=PROMPT_REFINER_MAX_RETRIES):
    """Dispatch to the configured Prompt Refiner backend.

    The backend is chosen by PROMPT_REFINER_MODE (compared
    case-insensitively):
      'xi_api' - XI API chat completions endpoint (default)
      'claw'   - CLAW plain-text endpoint, no auth required
      'openai' - OpenAI-compatible chat completions endpoint

    Raises:
        ValueError: if the mode is none of the above.
    """
    mode = PROMPT_REFINER_MODE.lower()
    backends = {
        "xi_api": _call_prompt_refiner_xi_api,
        "openai": _call_prompt_refiner_openai,
        "claw": _call_prompt_refiner_claw,
    }
    backend = backends.get(mode)
    if backend is None:
        raise ValueError(
            f"Unknown PROMPT_REFINER_MODE '{mode}'. "
            "Valid values: 'xi_api' (default), 'claw', 'openai'."
        )
    return backend(user_input, max_retries)


def build_caption_from_refined(refined: dict) -> str:
    """Turn a refined dict into the full structured prompt string.

    Looks up the six track keys in *refined* (missing keys default to "")
    and forwards them to build_structured_prompt.
    """
    track_names = ("caption", "speech", "sfx", "music", "env", "asr")
    tracks = {name: refined.get(name, "") for name in track_names}
    return build_structured_prompt(**tracks)


def _build_auto_mode_llm_error(message: str) -> str:
    """Return a clear UI-facing error message for Prompt Refiner failures."""
    return (
        "Auto Mode is currently unavailable because the Prompt Refiner LLM API "
        f"failed: {message} "
        "You can still try Manual Mode, which may remain available because it "
        "does not depend on the Prompt Refiner. "
        "If the issue continues, contact jiahaomei@sjtu.edu.cn."
    )


def auto_generate(caption, progress=gr.Progress()):
    """Mode 1: refine the description, build the prompt, generate audio.

    Returns:
        ``(audio_path, structured_prompt, status)``. On empty input or a
        Prompt Refiner failure the first two items are ``None`` and ""
        and the status carries the error text.
    """
    if not (caption or "").strip():
        return None, "", "Error: Please enter a description."

    progress(0.1, desc="Calling Prompt Refiner...")
    try:
        refined = call_prompt_refiner(caption)
    except Exception as exc:
        # Pick the explanation by failure category, most specific first.
        if isinstance(exc, EnvironmentError):
            detail = f"Configuration error. {exc}"
        elif isinstance(exc, RuntimeError):
            detail = str(exc)
        else:
            detail = f"Unexpected error while calling Prompt Refiner. {exc}"
        return None, "", _build_auto_mode_llm_error(detail)

    progress(0.4, desc="Building structured prompt...")
    prompt = build_caption_from_refined(refined)

    progress(0.6, desc="Generating audio...")
    wav_path, outcome = call_audiogen(prompt)

    progress(1.0)
    return wav_path, prompt, outcome


def manual_generate(
    caption, speech, sfx, music, env, asr, progress=gr.Progress()
):
    """Mode 2: build the prompt from the individual fields, generate audio.

    Only *caption* is mandatory; the other tracks may be empty.

    Returns:
        ``(audio_path, structured_prompt, status)``; ``(None, "", error)``
        when the caption is missing.
    """
    has_caption = bool((caption or "").strip())
    if not has_caption:
        return None, "", "Error: Caption is required."

    progress(0.2, desc="Building structured prompt...")
    tracks = {
        "caption": caption,
        "speech": speech,
        "sfx": sfx,
        "music": music,
        "env": env,
        "asr": asr,
    }
    prompt = build_structured_prompt(**tracks)

    progress(0.5, desc="Generating audio...")
    wav_path, outcome = call_audiogen(prompt)

    progress(1.0)
    return wav_path, prompt, outcome


# Custom CSS passed to gr.Blocks(css=...). It styles the elements tagged
# with elem_classes "prompt-preview" and "mode-radio", and the
# "banner-warning" divs emitted via gr.HTML, including ".dark" variants.
custom_css = """
.prompt-preview textarea {
    font-family: monospace !important;
    font-size: 12px !important;
}
.mode-radio label {
    font-weight: 600 !important;
}
.banner-warning {
    padding: 12px 16px;
    background: rgba(255, 193, 7, 0.12);
    border: 2px solid #d4920a;
    border-radius: 6px;
    margin-bottom: 12px;
    font-size: 14px;
    line-height: 1.9;
}
.dark .banner-warning {
    background: rgba(255, 193, 7, 0.07) !important;
    border-color: #c8860a !important;
}
.banner-warning a {
    color: #1a73e8;
}
.dark .banner-warning a {
    color: #7aafff !important;
}
"""


# ── Mode switching helper ─────────────────────────────────────────────────────
def switch_mode(mode):
    """Return visibility updates for the (auto, manual) sections.

    The auto section is shown when *mode* equals the Auto Mode radio
    label; otherwise the manual section is shown.
    """
    show_auto = mode == "๐Ÿค– Auto Mode"
    auto_update = gr.update(visible=show_auto)
    manual_update = gr.update(visible=not show_auto)
    return auto_update, manual_update


# ── Gradio UI ─────────────────────────────────────────────────────────────────
with gr.Blocks(
    title="Dasheng AudioGen Demo",
    theme=gr.themes.Soft(),
    css=custom_css,
) as demo:
    gr.Markdown("# ๐Ÿ”Š Dasheng AudioGen Demo")
    gr.Markdown("Developed by SJTU X-LANCE & Xiaomi LLM Plus")
    gr.HTML(
        '<div style="padding: 16px 20px; background: rgba(220, 38, 38, 0.10); '
        'border: 2px solid #dc2626; border-radius: 8px; margin: 12px 0 16px 0; '
        'font-size: 15px; line-height: 2;">'
        "๐Ÿšง <strong>็”ฑไบŽๆŽฅๅฃๅ˜ๅŠจ๏ผŒๅœจ็บฟ Demo ๆœๅŠกๆš‚ๆ—ถไธๅฏ็”จ๏ผŒๆญฃๅœจ็ปดๆŠคไธญใ€‚</strong><br>"
        "ๆ‚จๅฏไปฅ่‡ช่กŒ้ƒจ็ฝฒๆจกๅž‹ๆƒ้‡ไฝฟ็”จ๏ผš"
        '<a href="https://huggingface.co/mispeech/Dasheng-AudioGen" target="_blank">'
        "mispeech/Dasheng-AudioGen</a><br><br>"
        "๐Ÿšง <strong>Due to API changes, the online Demo is temporarily unavailable and under maintenance.</strong><br>"
        "You can self-deploy using the model weights: "
        '<a href="https://huggingface.co/mispeech/Dasheng-AudioGen" target="_blank">'
        "mispeech/Dasheng-AudioGen</a>"
        "</div>"
    )
    gr.Markdown(
        "ๆ”ฏๆŒ็ป“ๆž„ๅŒ– Prompt ็š„ๆททๅˆ้Ÿณ้ข‘็”Ÿๆˆ๏ผŒๅฏ็”จ่‡ช็„ถ่ฏญ่จ€ๆ่ฟฐๅœบๆ™ฏ(Auto mode)ๆˆ–้€่ฝจ้“ๅกซๅ†™(Manual mode)๏ผŒไธ€ๆฌก็”ŸๆˆๅŒ…ๅซ้Ÿณไนใ€ๅฏ็†่งฃไบบๅฃฐๅ’Œ้Ÿณๆ•ˆ็š„ๅฎŒๆ•ด้Ÿณ้ข‘ใ€‚ใ€‚  \n"
        "Structured-prompt mixed audio generation that lets you describe a scene in natural language (Auto mode) or specify tracks manually (Manual mode), producing a complete audio clip with music, intelligible speech, and sound effects in one pass."
    )

    # Mode selector
    mode_radio = gr.Radio(
        choices=["๐Ÿค– Auto Mode", "โœ๏ธ Manual Mode"],
        value="๐Ÿค– Auto Mode",
        label="Generation Mode",
        interactive=True,
        elem_classes=["mode-radio"],
    )

    # ── Auto Mode section ─────────────────────────────────────────────────────
    with gr.Column(visible=True) as auto_section:
        gr.Markdown(
            "## ๐Ÿค– Auto Mode  \n"
            "If Auto Mode is unavailable, contact jiahaomei@sjtu.edu.cn."
        )
        gr.HTML(
            '<div class="banner-warning">'
            "โš ๏ธ <strong>ไธ€ๆฌก็”ŸๆˆๅŒ…ๅซ้Ÿณไนใ€ๅฏ็†่งฃไบบๅฃฐๅ’Œ้Ÿณๆ•ˆ็š„ๅฎŒๆ•ด้Ÿณ้ข‘ใ€‚่‹ฅ็”Ÿๆˆ้Ÿณ้ข‘่ดจ้‡่พƒๅทฎๆˆ– Speech ๅ†…ๅฎนไธๅฎŒๆ•ด๏ผŒๅฏๅคšๅฐ่ฏ•ๅ‡ ๆฌกใ€‚</strong><br>"
            "โš ๏ธ <strong>Producing a complete audio clip with music, intelligible speech, and sound effects in one pass. If the generated audio quality is poor or speech content is incomplete, please try generating again.</strong>"
            "<br><br>"
            "๐Ÿ’ฌ ไฝ ๅฏไปฅ่พ“ๅ…ฅไปปๆ„่ฏญ่จ€็š„้Ÿณ้ข‘ๆ่ฟฐ๏ผŒPrompt Refiner ไผš่ฟ›่กŒ่‡ชๅŠจ่ฝฌๆขใ€‚็›ฎๅ‰ Speech ๅˆๆˆๅชๆ”ฏๆŒ่‹ฑๆ–‡๏ผŒๅคš่ฏญ่จ€ๆ”ฏๆŒๅณๅฐ†ไธŠ็บฟใ€‚<br>"
            "๐Ÿ’ฌ You can enter audio descriptions in any language โ€” the Prompt Refiner will automatically convert them. "
            "Currently, speech synthesis only supports English. Multi-language support coming soon."
            "<br><br>"
            "๐ŸŒ Web Demo: "
            '<a href="https://nieeim.github.io/Dasheng-AudioGen-Web/" target="_blank">DashengAudioGen Web Demo</a><br>'
            "๐Ÿ“ฆ GitHub Repo: "
            '<a href="https://github.com/xiaomi-research/dasheng-audiogen" target="_blank">DashengAudioGen GitHub Repository</a>'
            "</div>"
        )
        gr.Markdown(
            "่พ“ๅ…ฅๆ•ดไฝ“้Ÿณ้ข‘ๆ่ฟฐ๏ผŒ็ณป็ปŸๅฐ†่ฐƒ็”จ **Prompt Refiner** ่‡ชๅŠจ่ฝฌๆขไธบ็ป“ๆž„ๅŒ– Prompt๏ผŒๅ†้€š่ฟ‡ **DashengAudioGen** ็”Ÿๆˆ้Ÿณ้ข‘ใ€‚  \n"
            "Enter an overall audio description. The system will call the **Prompt Refiner** to convert it into a "
            "structured prompt, then generate audio via **DashengAudioGen**."
        )
        with gr.Row():
            with gr.Column():
                auto_caption = gr.Textbox(
                    label="ๆ•ดไฝ“้Ÿณ้ข‘ๆ่ฟฐ / Overall Audio Description",
                    placeholder=(
                        'e.g., A train station broadcast says, '
                        '"Train G128 is arriving on platform three, '
                        'please stand behind the yellow line." '
                        'with warning beeps, light orchestral bed, and station ambience.'
                    ),
                    lines=4,
                )
                auto_button = gr.Button("Generate", variant="primary")
            with gr.Column():
                auto_audio = gr.Audio(
                    label="็”Ÿๆˆ้Ÿณ้ข‘ / Generated Audio", type="filepath"
                )
                auto_prompt_preview = gr.Textbox(
                    label="็ป“ๆž„ๅŒ– Prompt ้ข„่งˆ / Structured Prompt (Preview)",
                    lines=4,
                    interactive=False,
                    elem_classes=["prompt-preview"],
                )
                auto_status = gr.Textbox(label="็Šถๆ€ / Status")

        gr.Examples(
            examples=[
                [
                    'A game announcer shouts, "Final round begins now, give it everything you have!" with crowd cheers, drum hits, and stadium ambience.',
                ],
                [
                    'A cafรฉ barista says, "Your caramel latte is ready at the counter." while soft jazz plays, cups clink, and indoor cafรฉ ambience continues.',
                ],
                [
                    'A train station broadcast says, "Train G128 is arriving on platform three, please stand behind the yellow line." with warning beeps, light orchestral bed, and station ambience.',
                ],
                [
                    'A radio host announces traffic updates over upbeat pop music with city street ambience.'
                ],
                ['ๅฎ‰้™็š„ๅˆๅŽๅ’–ๅ•ก้ฆ†,ไธ€็”ทไธ€ๅฅณๅœจ่ฎจ่ฎบๅคฉๆฐ”'],
                [
                    'ๅˆ—่ฝฆๅ‘˜ไฝฟ็”จ่‹ฑๆ–‡ๆŠฅ็ซ™,่ฏดโ€œTrain G128 is arriving on platform three, please stand behind the yellow line.โ€'
                ],
            ],
            inputs=[auto_caption],
            label="Examples",
        )

        # Run auto_generate on click; its (audio_path, structured_prompt,
        # status) tuple fills the three output components in order.
        auto_button.click(
            fn=auto_generate,
            inputs=[auto_caption],
            outputs=[auto_audio, auto_prompt_preview, auto_status],
        )

    # ── Manual Mode section ───────────────────────────────────────────────────
    with gr.Column(visible=False) as manual_section:
        gr.Markdown(
            "## โœ๏ธ Manual Mode  \n"
            "If Manual Mode is unavailable, contact jiahaomei@sjtu.edu.cn."
        )
        gr.HTML(
            '<div class="banner-warning">'
            "โš ๏ธ <strong>ไธ€ๆฌก็”ŸๆˆๅŒ…ๅซ้Ÿณไนใ€ๅฏ็†่งฃไบบๅฃฐๅ’Œ้Ÿณๆ•ˆ็š„ๅฎŒๆ•ด้Ÿณ้ข‘ใ€‚่‹ฅ็”Ÿๆˆ้Ÿณ้ข‘่ดจ้‡่พƒๅทฎๆˆ– Speech ๅ†…ๅฎนไธๅฎŒๆ•ด๏ผŒๅฏๅคšๅฐ่ฏ•ๅ‡ ๆฌกใ€‚</strong><br>"
            "โš ๏ธ <strong>Producing a complete audio clip with music, intelligible speech, and sound effects in one pass. If the generated audio quality is poor or speech content is incomplete, please try generating again.</strong>"
            "<br><br>"
            "๐Ÿ”ค ็›ฎๅ‰ Speech ๅˆๆˆๅชๆ”ฏๆŒ่‹ฑๆ–‡๏ผŒๅคš่ฏญ่จ€ๆ”ฏๆŒๅณๅฐ†ไธŠ็บฟใ€‚<br>"
            "๐Ÿ”ค Currently, speech synthesis only supports English. Multi-language support coming soon."
            "<br><br>"
            "๐ŸŒ Web Demo: "
            '<a href="https://nieeim.github.io/Dasheng-AudioGen-Web/" target="_blank">DashengAudioGen Web Demo</a><br>'
            "๐Ÿ“ฆ GitHub Repo: "
            '<a href="https://github.com/xiaomi-research/dasheng-audiogen" target="_blank">DashengAudioGen GitHub Repository</a>'
            "</div>"
        )
        gr.Markdown(
            "้€่ฝจ้“ๅกซๅ†™้Ÿณ้ข‘ๅ…ƒ็ด ๏ผŒไป… **Caption** ไธบๅฟ…ๅกซ้กน๏ผŒๅ…ถไฝ™ๅญ—ๆฎตๅ‡ไธบๅฏ้€‰ใ€‚  \n"
            "Fill in each track individually. Only **Caption** is required; all other fields are optional."
        )
        with gr.Row():
            with gr.Column():
                man_caption = gr.Textbox(
                    label="Caption โ€” ๆ•ดไฝ“ๆ่ฟฐ / Overall Description *",
                    placeholder=(
                        'e.g., A train station broadcast says, '
                        '"Train G128 is arriving on platform three, '
                        'please stand behind the yellow line." '
                        'with warning beeps and station ambience.'
                    ),
                    lines=3,
                )
                man_speech = gr.Textbox(
                    label="Speech โ€” ่ฏด่ฏไบบ่บซไปฝไธŽ้ฃŽๆ ผ / Speaker Identity & Style",
                    placeholder="e.g., female announcer, calm and clear tone",
                    lines=1,
                )
                man_asr = gr.Textbox(
                    label="ASR โ€” ่ฏญ้Ÿณๆ–‡ๅญ— / Speech Transcript",
                    placeholder=(
                        "e.g., Train G128 is arriving on platform three, "
                        "please stand behind the yellow line."
                    ),
                    lines=2,
                )
                man_sfx = gr.Textbox(
                    label="SFX โ€” ้Ÿณๆ•ˆ / Sound Effects",
                    placeholder="e.g., warning beeps",
                    lines=1,
                )
                man_music = gr.Textbox(
                    label="Music โ€” ่ƒŒๆ™ฏ้Ÿณไน / Background Music",
                    placeholder="e.g., light orchestral underscore",
                    lines=1,
                )
                man_env = gr.Textbox(
                    label="ENV โ€” ็Žฏๅขƒ้Ÿณ / Environmental & Ambient Sound",
                    placeholder="e.g., train station ambience",
                    lines=1,
                )
                man_button = gr.Button("Generate Audio", variant="primary")
            with gr.Column():
                man_audio = gr.Audio(
                    label="็”Ÿๆˆ้Ÿณ้ข‘ / Generated Audio", type="filepath"
                )
                man_prompt_preview = gr.Textbox(
                    label="็ป“ๆž„ๅŒ– Prompt ้ข„่งˆ / Structured Prompt (Preview)",
                    lines=5,
                    interactive=False,
                    elem_classes=["prompt-preview"],
                )
                man_status = gr.Textbox(label="็Šถๆ€ / Status")

        # Run manual_generate on click. The inputs are listed in the order
        # of its parameters (caption, speech, sfx, music, env, asr), which
        # differs from the on-screen order of the text boxes.
        man_button.click(
            fn=manual_generate,
            inputs=[
                man_caption, man_speech, man_sfx, man_music, man_env, man_asr
            ],
            outputs=[man_audio, man_prompt_preview, man_status],
        )

        gr.Examples(
            examples=[
                [
                    "A game announcer shouts with crowd cheers, drum hits, and indoor stadium ambience.",
                    "excited male game announcer",
                    "crowd cheers and impacts",
                    "energetic drum rhythm",
                    "indoor stadium ambience",
                    "Final round begins now, give it everything you have!",
                ],
                [
                    "A cafรฉ barista makes an announcement while soft jazz plays, cups clink, and indoor cafรฉ ambience continues.",
                    "barista announcement",
                    "cup and spoon clinks",
                    "soft jazz trio",
                    "indoor cafe ambience",
                    "Your caramel latte is ready at the counter.",
                ],
                [
                    "A train station broadcast makes an announcement with warning beeps, light orchestral bed, and station ambience.",
                    "station public announcement",
                    "warning beeps",
                    "light orchestral underscore",
                    "train station ambience",
                    "Train G128 is arriving on platform three, please stand behind the yellow line.",
                ],
            ],
            inputs=[
                man_caption, man_speech, man_sfx, man_music, man_env, man_asr
            ],
            label="Examples",
        )

        gr.Markdown(
            r"""
---
### Special Token ่ฏดๆ˜Ž / Special Token Reference
| Token | ๅญ—ๆฎต / Field | ่ฏดๆ˜Ž / Description |
|-------|------------|-------------------|
| `<\|caption\|>` | Caption | ๆ•ดไฝ“้Ÿณ้ข‘ๅœบๆ™ฏๆ่ฟฐ๏ผˆๅฟ…ๅกซ๏ผ‰/ Overall audio scene description (required) |
| `<\|speech\|>` | Speech | ่ฏด่ฏไบบ่บซไปฝไธŽ้ฃŽๆ ผ / Speaker identity & speaking style |
| `<\|asr\|>` | ASR | ่ฏญ้Ÿณๆ–‡ๅญ—ๅ†…ๅฎน / Actual transcript of speech content |
| `<\|sfx\|>` | SFX | ้Ÿณๆ•ˆๆ่ฟฐ / Sound effects present in the audio |
| `<\|music\|>` | Music | ่ƒŒๆ™ฏ้Ÿณไนๆ่ฟฐ / Background music description |
| `<\|env\|>` | ENV | ็Žฏๅขƒ้Ÿณ / Environmental & ambient sound |
"""
        )

    # Mode switching event: when the radio selection changes, switch_mode
    # returns one visibility update each for the auto and manual sections.
    mode_radio.change(
        fn=switch_mode,
        inputs=[mode_radio],
        outputs=[auto_section, manual_section],
    )

# Script entry point: serve the UI on all network interfaces, port 7860,
# without requesting a public share link.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)