| import json |
| import os |
| import re |
| from pathlib import Path |
| from typing import Optional |
|
|
| import torch |
|
|
| from shared.mps import mps_device_or |
| from shared.utils import files_locator as fl |
|
|
| from .prompt_enhancers import TTS_MONOLOGUE_PROMPT, TTS_QWEN3_DIALOGUE_PROMPT |
|
|
|
|
# Supported Qwen3 TTS base model types. Each entry carries a repository id and
# the name of the JSON config file looked up under ``_QWEN3_CONFIG_DIR``.
QWEN3_TTS_VARIANTS = {
    "qwen3_tts_customvoice": dict(
        repo="Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
        config_file="qwen3_tts_customvoice.json",
    ),
    "qwen3_tts_voicedesign": dict(
        repo="Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
        config_file="qwen3_tts_voicedesign.json",
    ),
    "qwen3_tts_base": dict(
        repo="Qwen/Qwen3-TTS-12Hz-1.7B-Base",
        config_file="qwen3_tts_base.json",
    ),
}

# Generation config file name and the directory (next to this module) that
# holds the bundled config files.
QWEN3_TTS_GENERATION_CONFIG = "qwen3_tts_generation_config.json"
_QWEN3_CONFIG_DIR = Path(__file__).resolve().parent.joinpath("qwen3", "configs")

# Tokenizer folders, weight file name and download repository id.
QWEN3_TTS_TEXT_TOKENIZER_DIR = "qwen3_tts_text_tokenizer"
QWEN3_TTS_SPEECH_TOKENIZER_DIR = "qwen3_tts_tokenizer_12hz"
QWEN3_TTS_SPEECH_TOKENIZER_WEIGHTS = "qwen3_tts_tokenizer_12hz.safetensors"
QWEN3_TTS_REPO = "DeepBeepMeep/TTS"

# Files expected inside each tokenizer folder.
QWEN3_TTS_TEXT_TOKENIZER_FILES = [
    "merges.txt", "vocab.json", "tokenizer_config.json", "preprocessor_config.json",
]
QWEN3_TTS_SPEECH_TOKENIZER_FILES = [
    "config.json", "configuration.json", "preprocessor_config.json",
] + [QWEN3_TTS_SPEECH_TOKENIZER_WEIGHTS]
|
|
# Language names returned when no bundled model config can be loaded.
QWEN3_TTS_LANG_FALLBACK = [
    "auto", "chinese", "english", "japanese", "korean", "german",
    "french", "russian", "portuguese", "spanish", "italian",
]

# Speaker names returned when the config is missing or lists no speakers.
QWEN3_TTS_SPEAKER_FALLBACK = [
    "serena", "vivian", "uncle_fu", "ryan", "aiden",
    "ono_anna", "sohee", "eric", "dylan",
]

# Per-speaker display metadata, expanded from (speaker, style, language) rows
# into ``{"style": ..., "language": ...}`` entries keyed by lowercase speaker id.
QWEN3_TTS_SPEAKER_META = {
    speaker: {"style": style, "language": language}
    for speaker, style, language in (
        ("vivian", "Bright, slightly edgy young female voice", "Chinese"),
        ("serena", "Warm, gentle young female voice", "Chinese"),
        ("uncle_fu", "Seasoned male voice with a low, mellow timbre", "Chinese"),
        ("dylan", "Youthful Beijing male voice with a clear, natural timbre", "Chinese (Beijing Dialect)"),
        ("eric", "Lively Chengdu male voice with a slightly husky brightness", "Chinese (Sichuan Dialect)"),
        ("ryan", "Dynamic male voice with strong rhythmic drive", "English"),
        ("aiden", "Sunny American male voice with a clear midrange", "English"),
        ("ono_anna", "Playful Japanese female voice with a light, nimble timbre", "Japanese"),
        ("sohee", "Warm Korean female voice with rich emotion", "Korean"),
    )
}

# Settings of the "max duration" slider; ``default`` is also read by
# ``get_qwen3_duration_default``.
QWEN3_TTS_DURATION_SLIDER = dict(
    label="Max duration (seconds)",
    min=1,
    max=600,
    increment=1,
    default=20,
)

# Audio prompt modes: "A" clones one speaker, "AB" clones two.
QWEN3_TTS_AUDIO_PROMPT_TYPE_SOURCES = {
    "selection": ["A", "AB"],
    "labels": {
        "A": "Voice cloning of 1 speaker",
        "AB": "Voice cloning of 2 speakers (Speaker 1 and Speaker 2)",
    },
    "letters_filter": "AB",
    "default": "A",
}
# Identifier and accepted range (in seconds) of the optional auto-split
# custom setting.
QWEN3_TTS_AUTO_SPLIT_SETTING_ID = "auto_split_every_s"
QWEN3_TTS_AUTO_SPLIT_MIN_SECONDS = 5.0
QWEN3_TTS_AUTO_SPLIT_MAX_SECONDS = 90.0

# Custom setting definitions; callers copy each entry before use.
QWEN3_TTS_CUSTOM_SETTINGS = [
    {
        "id": QWEN3_TTS_AUTO_SPLIT_SETTING_ID,
        # The advertised range is derived from the bounds above so the label
        # cannot drift from the values used for validation.
        "label": (
            f"Auto Split Every s ({int(QWEN3_TTS_AUTO_SPLIT_MIN_SECONDS)}-"
            f"{int(QWEN3_TTS_AUTO_SPLIT_MAX_SECONDS)}, optional), "
            "may reduce VRAM requirements for very long speeches."
        ),
        "name": "Auto Split Every s",
        "type": "float",
    },
]
|
|
|
|
def _format_qwen3_label(value: str) -> str:
    """Return ``value`` with underscores turned into spaces and title-cased."""
    words = value.split("_")
    return " ".join(words).title()
|
|
|
|
def _format_qwen3_speaker_label(name: str) -> str:
    """Return the display label for a speaker.

    The base label is the title-cased name. When ``QWEN3_TTS_SPEAKER_META``
    has a non-empty style and/or language for the lowercase name, they are
    appended in parentheses, separated by ``"; "``.
    """
    base_label = _format_qwen3_label(name)
    info = QWEN3_TTS_SPEAKER_META.get(name.lower())
    if not info:
        return base_label
    candidates = (info.get("style", ""), info.get("language", ""))
    details = [text for text in candidates if text]
    if not details:
        return base_label
    return f"{base_label} ({'; '.join(details)})"
|
|
|
|
def get_qwen3_config_path(base_model_type: str) -> Optional[str]:
    """Return the bundled config path for ``base_model_type``.

    Returns ``None`` when the type is not in ``QWEN3_TTS_VARIANTS`` or when the
    expected file does not exist under ``_QWEN3_CONFIG_DIR``.
    """
    entry = QWEN3_TTS_VARIANTS.get(base_model_type)
    if entry is None:
        return None
    candidate = _QWEN3_CONFIG_DIR / entry["config_file"]
    if candidate.is_file():
        return str(candidate)
    return None
|
|
|
|
def get_qwen3_generation_config_path() -> Optional[str]:
    """Return the bundled generation config path, or ``None`` if the file is absent."""
    candidate = _QWEN3_CONFIG_DIR / QWEN3_TTS_GENERATION_CONFIG
    if candidate.is_file():
        return str(candidate)
    return None
|
|
|
|
def load_qwen3_config(base_model_type: str) -> Optional[dict]:
    """Load and return the parsed JSON config for ``base_model_type``.

    Returns ``None`` when no config path can be resolved. The file is read on
    every call; JSON decoding errors propagate to the caller.
    """
    path = get_qwen3_config_path(base_model_type)
    if path:
        with open(path, "r", encoding="utf-8") as stream:
            return json.load(stream)
    return None
|
|
|
|
def get_qwen3_languages(base_model_type: str) -> list[str]:
    """Return the selectable language names for ``base_model_type``.

    Without a loadable config a copy of ``QWEN3_TTS_LANG_FALLBACK`` is
    returned. Otherwise the result is ``"auto"`` followed by the sorted,
    lowercased, de-duplicated keys of ``talker_config.codec_language_id``,
    skipping any key that contains "dialect".
    """
    config = load_qwen3_config(base_model_type)
    if config is None:
        return list(QWEN3_TTS_LANG_FALLBACK)
    talker_config = config.get("talker_config", {})
    unique_names = set()
    for key in talker_config.get("codec_language_id", {}).keys():
        lowered = key.lower()
        if "dialect" not in lowered:
            unique_names.add(lowered)
    return ["auto", *sorted(unique_names)]
|
|
|
|
def get_qwen3_speakers(base_model_type: str) -> list[str]:
    """Return the sorted, lowercased speaker names for ``base_model_type``.

    Names come from the keys of ``talker_config.spk_id`` in the bundled config.
    A copy of ``QWEN3_TTS_SPEAKER_FALLBACK`` is returned when the config cannot
    be loaded or lists no speakers.
    """
    config = load_qwen3_config(base_model_type)
    if config is not None:
        speaker_ids = config.get("talker_config", {}).get("spk_id", {})
        names = sorted({key.lower() for key in speaker_ids.keys()})
        if names:
            return names
    return list(QWEN3_TTS_SPEAKER_FALLBACK)
|
|
|
|
def get_qwen3_language_choices(base_model_type: str) -> list[tuple[str, str]]:
    """Return ``(display label, language)`` pairs for every available language."""
    choices = []
    for language in get_qwen3_languages(base_model_type):
        choices.append((_format_qwen3_label(language), language))
    return choices
|
|
|
|
def get_qwen3_speaker_choices(base_model_type: str) -> list[tuple[str, str]]:
    """Return ``(display label, speaker)`` pairs for every available speaker."""
    choices = []
    for speaker in get_qwen3_speakers(base_model_type):
        choices.append((_format_qwen3_speaker_label(speaker), speaker))
    return choices
|
|
|
|
def get_qwen3_model_def(base_model_type: str) -> dict:
    """Build the model definition dictionary for a Qwen3 TTS variant.

    All variants share a common set of entries. The three known variants add a
    ``model_modes`` selector (speaker or language) and an ``alt_prompt`` field.
    ``qwen3_tts_base`` also adds the voice-cloning and dialogue options.
    Unknown types receive only the common entries.
    """
    enhancer_modes = ["T", "T1"] if base_model_type == "qwen3_tts_base" else ["T"]
    model_def = {
        "audio_only": True,
        "image_outputs": False,
        "sliding_window": False,
        "guidance_max_phases": 0,
        "no_negative_prompt": True,
        "inference_steps": False,
        "temperature": True,
        "image_prompt_types_allowed": "",
        "supports_early_stop": True,
        "profiles_dir": [base_model_type],
        "duration_slider": dict(QWEN3_TTS_DURATION_SLIDER),
        "top_k_slider": True,
        "text_prompt_enhancer_instructions": TTS_MONOLOGUE_PROMPT,
        "text_prompt_enhancer_max_tokens": 512,
        "prompt_enhancer_button_label": "Write",
        "compile": False,
        "parent_model_type": "qwen3_tts_base",
        "lm_engines": ["cg"],
        "prompt_enhancer_def": {
            "selection": enhancer_modes,
            "labels": {
                "T": "A Speech based on current Prompt",
                "T1": "A Dialogue between two People based on current Prompt",
            },
            "default": "T",
        },
    }

    if base_model_type == "qwen3_tts_customvoice":
        # CustomVoice picks among predefined speakers; the first one is the default.
        speakers = get_qwen3_speakers(base_model_type)
        model_def["model_modes"] = {
            "choices": get_qwen3_speaker_choices(base_model_type),
            "default": speakers[0] if speakers else "",
            "label": "Speaker",
        }
        model_def["alt_prompt"] = {
            "label": "Instruction (optional)",
            "placeholder": "calm, friendly, slightly husky",
            "lines": 2,
        }
        return model_def

    if base_model_type not in ("qwen3_tts_voicedesign", "qwen3_tts_base"):
        return model_def

    # VoiceDesign and Base both select a language rather than a speaker.
    model_def["model_modes"] = {
        "choices": get_qwen3_language_choices(base_model_type),
        "default": "auto",
        "label": "Language",
    }

    if base_model_type == "qwen3_tts_voicedesign":
        model_def["alt_prompt"] = {
            "label": "Voice instruction",
            "placeholder": "young female, warm tone, clear articulation",
            "lines": 2,
        }
        return model_def

    # Base variant: reference audio for one or two speakers plus dialogue support.
    model_def.update(
        {
            "alt_prompt": {
                "label": "Reference transcript(s) (optional, two-speaker: one per line)",
                "placeholder": "Speaker 1 reference transcript\nSpeaker 2 reference transcript",
                "lines": 3,
            },
            "pause_between_sentences": True,
            "preserve_empty_prompt_lines": True,
            "any_audio_prompt": True,
            "audio_prompt_choices": True,
            "audio_prompt_type_sources": dict(QWEN3_TTS_AUDIO_PROMPT_TYPE_SOURCES),
            "custom_settings": [setting.copy() for setting in QWEN3_TTS_CUSTOM_SETTINGS],
            "text_prompt_enhancer_instructions1": TTS_QWEN3_DIALOGUE_PROMPT,
            "text_prompt_enhancer_max_tokens1": 512,
            "audio_guide_label": "Speaker 1 reference voice",
            "audio_guide2_label": "Speaker 2 reference voice",
        }
    )
    return model_def
|
|
|
|
def get_qwen3_duration_default() -> int:
    """Return the slider's default duration as an ``int`` (20 if the key is absent)."""
    default_seconds = QWEN3_TTS_DURATION_SLIDER.get("default", 20)
    return int(default_seconds)
|
|
|
|
def get_qwen3_download_def(base_model_type: str) -> list[dict]:
    """Return the download descriptors for the text and speech tokenizers.

    The result does not depend on ``base_model_type``. Each descriptor pairs
    one source folder with its own file list, all taken from ``QWEN3_TTS_REPO``.
    """
    tokenizer_assets = (
        (QWEN3_TTS_TEXT_TOKENIZER_DIR, QWEN3_TTS_TEXT_TOKENIZER_FILES),
        (QWEN3_TTS_SPEECH_TOKENIZER_DIR, QWEN3_TTS_SPEECH_TOKENIZER_FILES),
    )
    return [
        {
            "repoId": QWEN3_TTS_REPO,
            "sourceFolderList": [folder],
            "fileList": [files],
        }
        for folder, files in tokenizer_assets
    ]
|
|
|
|
class family_handler:
    """Static hooks that describe and load the Qwen3 TTS model family.

    Every method is a ``@staticmethod``; the class is used as a namespace and
    is never instantiated by the code in this file.
    """

    @staticmethod
    def query_supported_types():
        """Return the supported base model types (keys of ``QWEN3_TTS_VARIANTS``)."""
        return list(QWEN3_TTS_VARIANTS)

    @staticmethod
    def query_family_maps():
        """Return two empty mappings; this family defines no family maps."""
        return {}, {}

    @staticmethod
    def query_model_family():
        """Return the family identifier, ``"tts"``."""
        return "tts"

    @staticmethod
    def query_family_infos():
        """Return the family info mapping: ``{"tts": (200, "TTS")}``."""
        return {"tts": (200, "TTS")}

    @staticmethod
    def register_lora_cli_args(parser, lora_root):
        """Register the ``--lora-dir-qwen3-tts`` command-line option on ``parser``.

        argparse stores the parsed value as ``lora_dir_qwen3_tts``.
        """
        parser.add_argument(
            "--lora-dir-qwen3-tts",
            type=str,
            default=None,
            help=f"Path to a directory that contains Qwen3 TTS settings (default: {os.path.join(lora_root, 'qwen3_tts')})",
        )

    @staticmethod
    def get_lora_dir(base_model_type, args, lora_root):
        """Return the Qwen3 TTS settings directory.

        The value of ``--lora-dir-qwen3-tts`` wins when it is set; otherwise
        ``<lora_root>/qwen3_tts`` is returned.
        """
        # "lora_dir_qwen3_tts" is the attribute argparse derives from the
        # option registered above. "lora_qwen3_tts" is the name this method
        # used to read and is kept as a fallback for callers that set it.
        for attribute in ("lora_dir_qwen3_tts", "lora_qwen3_tts"):
            configured = getattr(args, attribute, None)
            if configured:
                return configured
        return os.path.join(lora_root, "qwen3_tts")

    @staticmethod
    def query_model_def(base_model_type, model_def):
        """Return the model definition for ``base_model_type``; ``model_def`` is ignored."""
        return get_qwen3_model_def(base_model_type)

    @staticmethod
    def query_model_files(computeList, base_model_type, model_def=None):
        """Return the tokenizer download descriptors for this family."""
        return get_qwen3_download_def(base_model_type)

    @staticmethod
    def load_model(
        model_filename,
        model_type,
        base_model_type,
        model_def,
        quantizeTransformer=False,
        text_encoder_quantization=None,
        dtype=None,
        VAE_dtype=None,
        mixed_precision_transformer=False,
        save_quantized=False,
        submodel_no_list=None,
        text_encoder_filename=None,
        profile=0,
        lm_decoder_engine="legacy",
        **kwargs,
    ):
        """Create the Qwen3 TTS pipeline and return ``(pipeline, pipe)``.

        ``model_filename`` may be a single path or a list/tuple, of which only
        the first entry is used. ``pipe`` maps ``"transformer"`` to the
        pipeline's model and, when a speech tokenizer is present,
        ``"speech_tokenizer"`` to its model. When ``save_quantized`` is set and
        a weights path is known, the model is also passed to
        ``wgp.save_quantized_model``.

        Parameters not mentioned above are accepted but not read by this method.
        """
        from .qwen3.pipeline import Qwen3TTSPipeline

        ckpt_root = fl.get_download_location()

        if isinstance(model_filename, (list, tuple)):
            weights_candidate = model_filename[0] if len(model_filename) > 0 else None
        else:
            weights_candidate = model_filename

        weights_path = None
        if weights_candidate:
            weights_path = fl.locate_file(weights_candidate, error_if_none=False)
            if weights_path is None:
                # Not found by the locator: pass the name through unchanged.
                weights_path = weights_candidate

        pipeline = Qwen3TTSPipeline(
            model_weights_path=weights_path,
            base_model_type=base_model_type,
            ckpt_root=ckpt_root,
            device=mps_device_or(torch.device("cpu")),
            lm_decoder_engine=lm_decoder_engine,
        )

        if str(lm_decoder_engine).strip().lower() in ("cg", "cudagraph"):
            # NOTE(review): zeroing `_budget` on the model and its sub-modules
            # presumably opts them out of memory budgeting when the CUDA-graph
            # engine is used - confirm against the offloading code.
            pipeline.model._budget = 0
            talker = getattr(pipeline.model, "talker", None)
            if talker is not None:
                talker._budget = 0
                code_predictor = getattr(talker, "code_predictor", None)
                if code_predictor is not None:
                    code_predictor._budget = 0

        pipe = {"transformer": pipeline.model}
        if getattr(pipeline, "speech_tokenizer", None) is not None:
            pipe["speech_tokenizer"] = pipeline.speech_tokenizer.model

        if save_quantized and weights_path:
            from wgp import save_quantized_model

            config_path = get_qwen3_config_path(base_model_type)
            if config_path is None:
                config_candidate = os.path.join("qwen3", "configs", f"{base_model_type}.json")
                config_path = fl.locate_file(config_candidate, error_if_none=False) or config_candidate
            save_quantized_model(pipeline.model, model_type, weights_path, dtype or torch.bfloat16, config_path)

        return pipeline, pipe

    @staticmethod
    def fix_settings(base_model_type, settings_version, model_def, ui_defaults):
        """Fill in missing settings in ``ui_defaults`` (modified in place).

        Existing values are kept, with two exceptions: for ``qwen3_tts_base``
        the stored ``audio_prompt_type`` is normalised to ``"A"`` or ``"AB"``,
        and for settings older than version 2.44 ``top_k`` is reset to 50 when
        the model definition enables the top-k slider.
        """
        ui_defaults.setdefault("alt_prompt", "")

        if base_model_type == "qwen3_tts_customvoice":
            speakers = get_qwen3_speakers(base_model_type)
            defaults = {
                "audio_prompt_type": "",
                "model_mode": speakers[0] if speakers else "",
            }
        elif base_model_type == "qwen3_tts_base":
            defaults = {
                "audio_prompt_type": "A",
                "model_mode": "auto",
                "pause_seconds": 0.5,
            }
        else:
            # VoiceDesign and unknown types share the same defaults.
            defaults = {
                "audio_prompt_type": "",
                "model_mode": "auto",
            }
        for key, value in defaults.items():
            ui_defaults.setdefault(key, value)

        if base_model_type == "qwen3_tts_base":
            # Store the normalised value. Previously it was computed only for
            # the membership test, so "ab", "" or None stayed in the settings.
            audio_prompt_type = str(ui_defaults.get("audio_prompt_type", "A") or "A").upper()
            ui_defaults["audio_prompt_type"] = audio_prompt_type if audio_prompt_type in ("A", "AB") else "A"

        if settings_version < 2.44 and model_def.get("top_k_slider", False):
            ui_defaults["top_k"] = 50

    @staticmethod
    def update_default_settings(base_model_type, model_def, ui_defaults):
        """Overwrite ``ui_defaults`` with the factory defaults of a known variant.

        Unknown base model types leave ``ui_defaults`` untouched.
        """
        if base_model_type == "qwen3_tts_customvoice":
            speakers = get_qwen3_speakers(base_model_type)
            overrides = {
                "audio_prompt_type": "",
                "model_mode": speakers[0] if speakers else "",
                "alt_prompt": "",
            }
        elif base_model_type == "qwen3_tts_voicedesign":
            overrides = {
                "audio_prompt_type": "",
                "model_mode": "auto",
                "alt_prompt": "young female, warm tone, clear articulation",
            }
        elif base_model_type == "qwen3_tts_base":
            overrides = {
                "audio_prompt_type": "A",
                "model_mode": "auto",
                "alt_prompt": "",
            }
        else:
            return

        # Keys are added in the same order as before so that newly inserted
        # entries keep their position in ``ui_defaults``.
        overrides["duration_seconds"] = get_qwen3_duration_default()
        if base_model_type == "qwen3_tts_base":
            overrides["pause_seconds"] = 0.5
        overrides.update(
            {
                "repeat_generation": 1,
                "video_length": 0,
                "num_inference_steps": 0,
                "negative_prompt": "",
                "temperature": 0.9,
                "top_k": 50,
                "multi_prompts_gen_type": "FG",
            }
        )
        ui_defaults.update(overrides)

    @staticmethod
    def validate_generative_prompt(base_model_type, model_def, inputs, one_prompt):
        """Validate one prompt; return an error message or ``None`` when valid.

        All known variants require non-blank prompt text. CustomVoice also
        requires a supported speaker in ``inputs["model_mode"]``. Base requires
        ``inputs["audio_guide"]``. In two-speaker mode ("B" in the audio prompt
        type) Base also requires ``inputs["audio_guide2"]`` and ``Speaker N:``
        tags naming exactly two distinct speaker numbers.
        Unknown base model types are not validated.
        """
        display_names = {
            "qwen3_tts_customvoice": "Qwen3 CustomVoice",
            "qwen3_tts_voicedesign": "Qwen3 VoiceDesign",
            "qwen3_tts_base": "Qwen3 Base voice clone",
        }
        display_name = display_names.get(base_model_type)
        if display_name is None:
            return None

        if one_prompt is None or not str(one_prompt).strip():
            return f"Prompt text cannot be empty for {display_name}."

        if base_model_type == "qwen3_tts_customvoice":
            speaker = inputs.get("model_mode", "")
            if not speaker:
                return "Please select a speaker for Qwen3 CustomVoice."
            # str() keeps a non-string value from crashing on .lower().
            if str(speaker).lower() not in get_qwen3_speakers(base_model_type):
                return f"Unsupported speaker '{speaker}'."
            return None

        if base_model_type == "qwen3_tts_voicedesign":
            return None

        # qwen3_tts_base
        if inputs.get("audio_guide") is None:
            return "Qwen3 Base requires Speaker 1 reference audio."

        audio_prompt_type = str(inputs.get("audio_prompt_type", "A") or "A").upper()
        # A single scan yields the speaker number of every "Speaker N:" tag.
        speaker_numbers = re.findall(r"Speaker\s*(\d+)\s*:", str(one_prompt), flags=re.IGNORECASE)

        if "B" not in audio_prompt_type:
            if speaker_numbers:
                return "Speaker-tag dialogue requires two-speaker mode (set audio prompt mode to Dialogue)."
            return None

        if inputs.get("audio_guide2") is None:
            return "Two-speaker mode requires Speaker 2 reference audio."
        if not speaker_numbers:
            return (
                "Two-speaker mode requires prompt lines using Speaker 1: and Speaker 2: "
            )
        if len({int(number) for number in speaker_numbers}) != 2:
            return (
                "Two-speaker mode requires exactly two speaker IDs. Use Speaker 1: and Speaker 2:. "
                "For headless settings, keep 'multi_prompts_gen_type' = 'FG'."
            )
        return None

    @staticmethod
    def validate_generative_settings(base_model_type, model_def, inputs):
        """Validate and normalise the auto-split custom setting for ``qwen3_tts_base``.

        Returns an error message, or ``None`` when the settings are acceptable.
        On success the value is stored back as a ``float``. A blank string
        removes the setting, and ``inputs["custom_settings"]`` becomes ``None``
        if nothing else remains. Other base model types are not validated.
        """
        if base_model_type != "qwen3_tts_base":
            return None
        custom_settings = inputs.get("custom_settings", None)
        if custom_settings is None:
            return None
        if not isinstance(custom_settings, dict):
            return "Custom settings must be a dictionary."

        raw_value = custom_settings.get(QWEN3_TTS_AUTO_SPLIT_SETTING_ID, None)
        if raw_value is None:
            return None
        if isinstance(raw_value, str):
            raw_value = raw_value.strip()
            if not raw_value:
                custom_settings.pop(QWEN3_TTS_AUTO_SPLIT_SETTING_ID, None)
                inputs["custom_settings"] = custom_settings if len(custom_settings) > 0 else None
                return None

        lower_bound = QWEN3_TTS_AUTO_SPLIT_MIN_SECONDS
        upper_bound = QWEN3_TTS_AUTO_SPLIT_MAX_SECONDS

        auto_split_seconds = None
        # bool is a subclass of int, so float(True) would succeed; reject it.
        if not isinstance(raw_value, bool):
            try:
                auto_split_seconds = float(raw_value)
            except (TypeError, ValueError, OverflowError):
                auto_split_seconds = None
        if auto_split_seconds is None:
            return (
                f"Auto Split Every s must be a number between "
                f"{int(lower_bound)} and {int(upper_bound)} seconds."
            )

        # The chained comparison is False for NaN, so NaN is rejected here.
        # Separate "<" and ">" tests would both be False and let it through.
        if not (lower_bound <= auto_split_seconds <= upper_bound):
            return (
                f"Auto Split Every s must be between "
                f"{int(lower_bound)} and {int(upper_bound)} seconds."
            )

        custom_settings[QWEN3_TTS_AUTO_SPLIT_SETTING_ID] = auto_split_seconds
        inputs["custom_settings"] = custom_settings
        return None
|
|