"""Gradio smoke demo: chat with a Modal-hosted persona LLM and per-sentence TTS."""

import re
import tempfile
import time
from pathlib import Path

import gradio as gr

from modal_apps.modal_llm import PersonaLLM
from modal_apps.modal_tts import CharacterTTS
from src.character_registry import CHARACTER_PACKAGES, get_character
from src.stage_driver import render_character_stage

APP_CSS = """ #modal-demo-stage iframe, #modal-demo-stage { min-height: 460px; } """

# Split point: right after a run of sentence terminators (full-width or ASCII),
# swallowing any whitespace that follows. The negative lookahead keeps runs such
# as "?!" together so a punctuation-only chunk is never produced.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[。！？!?；;])(?![。！？!?；;])\s*")


def _default_stage() -> dict:
    """Return a fresh copy of the idle stage state used before any reply."""
    return {"expression": "idle", "motion": "breathe", "intensity": 0.35}


def _character_choices() -> list[tuple[str, str]]:
    """Return ``(display_name, character_id)`` pairs for the character picker."""
    return [
        (character["display_name"], character_id)
        for character_id, character in CHARACTER_PACKAGES.items()
    ]


def _split_sentences(text: str) -> list[str]:
    """Split *text* into sentences, keeping each terminator with its sentence.

    Whitespace around each piece is stripped and empty pieces are dropped.
    When nothing survives (e.g. blank input) the result is a single-element
    list holding the stripped input, so callers always get at least one item.
    """
    pieces = [part.strip() for part in _SENTENCE_BOUNDARY.split(text)]
    return [piece for piece in pieces if piece] or [text.strip()]


def _write_wav(audio: bytes, prefix: str = "vc_tts_") -> str:
    """Write *audio* to a new ``.wav`` temp file and return its path.

    The file is created with ``delete=False`` so it outlives this call; nothing
    in this module removes it afterwards.
    """
    # The context manager closes the handle even if the write raises.
    with tempfile.NamedTemporaryFile(prefix=prefix, suffix=".wav", delete=False) as handle:
        handle.write(audio)
        return handle.name


def chat_once(message: str, history: list[dict], character_id: str, tts_enabled: bool):
    """Run one chat turn, yielding UI updates as each stage completes.

    This is a generator. Every yield is a ``(history, audio_path, debug)``
    tuple: the chat messages, the path of the latest synthesized clip (or
    ``None``), and a status/timing dict.

    Stages:
      1. Blank input yields ``{"status": "empty"}`` and stops.
      2. A placeholder assistant message is shown while the LLM runs.
      3. The placeholder is replaced by the LLM reply.
      4. If *tts_enabled*, the reply is synthesized sentence by sentence and
         each finished clip is yielded as soon as it is written.
    """
    if not message.strip():
        yield history, None, {"status": "empty"}
        return

    character = get_character(character_id)
    # Build a new list rather than mutating the caller's; tolerate a None history.
    history = [
        *(history or []),
        {"role": "user", "content": message},
        {"role": "assistant", "content": "Modal LLM 正在生成..."},
    ]
    yield history, None, {"status": "llm_generating"}

    started = time.perf_counter()
    llm_result = PersonaLLM().generate_text.remote(
        user_text=message,
        character=character,
        max_new_tokens=120,
    )
    reply = llm_result["text"]
    history[-1]["content"] = reply
    debug = {
        "status": "llm_done",
        "llm_remote_s": llm_result.get("remote_s"),
        "llm_output_tokens": llm_result.get("output_tokens"),
        "client_elapsed_s": round(time.perf_counter() - started, 3),
    }
    yield history, None, debug

    if not tts_enabled:
        return

    # One handle serves every sentence; it does not change between iterations.
    tts = CharacterTTS()
    for index, sentence in enumerate(_split_sentences(reply), start=1):
        if not sentence:
            # _split_sentences returns [""] for a blank reply.
            continue
        debug = {
            **debug,
            "status": "tts_generating",
            "tts_sentence_index": index,
            "tts_sentence": sentence,
        }
        yield history, None, debug

        tts_started = time.perf_counter()
        audio = tts.synthesize.remote(text=sentence, emotion="neutral")
        audio_path = _write_wav(audio)
        debug = {
            **debug,
            "status": "tts_chunk_done",
            "tts_sentence_index": index,
            "tts_remote_client_s": round(time.perf_counter() - tts_started, 3),
            "audio_path": audio_path,
        }
        yield history, audio_path, debug


def switch_character(character_id: str):
    """Return the summary text and idle stage HTML for the selected character."""
    character = get_character(character_id)
    return character["summary"], render_character_stage(character, _default_stage())


def build_demo() -> gr.Blocks:
    """Build the demo UI: character picker, stage, chat, audio and debug panels."""
    default_id = "memory_girl"
    default_character = get_character(default_id)

    # NOTE(review): the column nesting below is reconstructed from a flattened
    # source; confirm which column the audio/debug widgets belong to.
    with gr.Blocks(title="Modal Virtual Character Smoke Demo") as demo:
        with gr.Row():
            with gr.Column(scale=1, min_width=260):
                character_select = gr.Radio(_character_choices(), value=default_id, label="角色")
                character_summary = gr.Markdown(default_character["summary"])
                tts_enabled = gr.Checkbox(value=True, label="启用 Chatterbox TTS")
            with gr.Column(scale=2, min_width=360):
                stage = gr.HTML(
                    render_character_stage(default_character, _default_stage()),
                    elem_id="modal-demo-stage",
                    min_height=460,
                )
            with gr.Column(scale=2, min_width=360):
                chatbot = gr.Chatbot(label="Modal 对话", height=380)
                message = gr.Textbox(label="输入", lines=2, submit_btn=True)
                audio = gr.Audio(label="分句语音", autoplay=True)
                debug = gr.JSON(label="调试")

        character_select.change(
            switch_character,
            inputs=[character_select],
            outputs=[character_summary, stage],
        )
        # Clear the input box once the whole turn has finished streaming.
        message.submit(
            chat_once,
            inputs=[message, chatbot, character_select, tts_enabled],
            outputs=[chatbot, audio, debug],
        ).then(lambda: "", outputs=[message])
    return demo


if __name__ == "__main__":
    build_demo().queue().launch(css=APP_CSS, server_name="127.0.0.1", server_port=7862)