import gradio as gr import os import torch import commons import utils from models import SynthesizerTrn import numpy as np import json import shutil import logging import random import re from huggingface_hub import snapshot_download # --- 1. SETUP LOGGING --- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) # --- 2. ROMAJI CONVERTER --- try: import pykakasi kks = pykakasi.kakasi() def to_romaji(text): if not text or text == "None": return "" try: result = kks.convert(str(text)) return "".join([item['hepburn'].capitalize() for item in result]) except: return str(text) except: def to_romaji(text): return str(text) # --- 3. CLEAN HARDCODED INFO --- CLEAN_INFO = { "0": {"title": "Sanoba Witch & Senren Banka", "example": "こんにちは。", "type": "vits"}, "1": {"title": "Hamidashi Creative", "example": "こんにちは。", "type": "vits"}, "2": {"title": "Cafe Stella & Shinigami no Chou", "example": "こんにちは。", "type": "vits"}, "3": {"title": "Yosuga no Sora", "example": "こんにちは。", "type": "vits"}, "4": {"title": "Bishoujo Mangekyou", "example": "こんにちは。", "type": "vits"}, "5": {"title": "Nene & Nanami Pack (Multi)", "example": "[JA]こんにちは。[JA]", "type": "vits"}, "6": {"title": "The Fox Waiting for You", "example": "안녕하세요.", "type": "vits"}, "7": {"title": "Galgame Characters Pack (13)", "example": "こんにちは。", "type": "vits"}, "8": {"title": "Zero no Tsukaima", "example": "こんにちは。", "type": "vits"}, "9": {"title": "Zero no Tsukaima (VC Mode)", "example": "", "type": "soft-vits-vc"}, "10": {"title": "Toaru Majutsu no Index (VC)", "example": "", "type": "soft-vits-vc"}, "11": {"title": "Shiki Natsume (VC Mode)", "example": "", "type": "soft-vits-vc"}, "12": {"title": "DRACU-RIOT!", "example": "こんにちは。", "type": "vits"}, "13": {"title": "To LOVE-Ru Series", "example": "こんにちは。", "type": "vits"}, "14": {"title": "CJKS Multi-Language", "example": "[JA]こんにちは。[JA]", "type": "vits"}, "15": {"title": "Voistock Mega Pack (2891 Chars)", "example": "[JA]こんにちは。[JA]", "type": "vits"}, "16": {"title": "Shanghainese Dialect", "example": "侬好!", "type": "vits"}, "17": {"title": "Chinese Dialects Pack", "example": "[SH]侬好![SH]", "type": "vits"}, "18": {"title": "Umamusume: Pretty Derby", "example": "こんにちは。", "type": "vits"}, "19": {"title": "Princess Connect! Re:Dive", "example": "[JA]こんにちは。[JA]", "type": "vits"}, "20": {"title": "Magia Record (Madoka Magica)", "example": "こんにちは。", "type": "vits"} } # --- 4. DOWNLOAD ASSETS --- REPO_ID = "Plana-Archive/Plana-TTS" LOCAL_ROOT = "saved_model" def download_assets(): os.makedirs(LOCAL_ROOT, exist_ok=True) if not os.path.exists(os.path.join(LOCAL_ROOT, "0")): try: logger.info("Downloading Assets...") snapshot_download(repo_id=REPO_ID, local_dir="temp_dir", allow_patterns=["MOE-TTS/saved_model/*"]) src_path = os.path.join("temp_dir", "MOE-TTS", "saved_model") if os.path.exists(src_path): shutil.copytree(src_path, LOCAL_ROOT, dirs_exist_ok=True) shutil.rmtree("temp_dir") except Exception as e: logger.error(f"Download error: {e}") download_assets() # --- 5. ENGINE LOAD MODEL --- loaded_models = {} def clean_config(conf): if isinstance(conf, dict): return {str(k): clean_config(v) for k, v in conf.items()} elif isinstance(conf, list): return [clean_config(i) for i in conf] return conf def get_vits_model(m_id): mid = str(m_id) if mid in loaded_models: return loaded_models[mid] try: p = os.path.join(LOCAL_ROOT, mid) cfg_p = os.path.join(p, "config.json") if not os.path.exists(cfg_p): return None hps = utils.get_hparams_from_file(cfg_p) m_params = clean_config(hps.model.__dict__ if hasattr(hps.model, '__dict__') else dict(hps.model)) net = SynthesizerTrn(len(hps.symbols), hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, n_speakers=hps.data.n_speakers, **m_params) utils.load_checkpoint(os.path.join(p, "model.pth"), net, None) net.eval() raw_spks = hps.speakers if hasattr(hps, 'speakers') else [f"Character {i}" for i in range(hps.data.n_speakers)] display_spks = [] original_spks = [] for s in raw_spks: romaji_name = to_romaji(s) if romaji_name and romaji_name.lower() != "none": display_spks.append(romaji_name) original_spks.append(s) loaded_models[mid] = (hps, net, display_spks, original_spks) return loaded_models[mid] except Exception as e: logger.error(f"Load Error {mid}: {e}") return None def tts_execute(m_id, text, speaker_romaji, speed): data = get_vits_model(m_id) if not data: return "❌ Model Loading...", None hps, net, display_spks, _ = data if not speaker_romaji: if display_spks: speaker_romaji = display_spks[0] else: return "❌ No Speaker Selected", None try: sid = display_spks.index(speaker_romaji) from text import text_to_sequence clean_text = re.sub(r'\[[A-Z]{2}\]', '', text) cleaners = hps.data.text_cleaners if hasattr(hps.data, 'text_cleaners') else ['japanese_cleaners'] seq = text_to_sequence(clean_text, hps.symbols, cleaners) if hps.data.add_blank: seq = commons.intersperse(seq, 0) with torch.no_grad(): audio = net.infer(torch.LongTensor(seq).unsqueeze(0), torch.LongTensor([len(seq)]), sid=torch.LongTensor([sid]), noise_scale=0.667, noise_scale_w=0.8, length_scale=1.0/speed)[0][0,0].data.cpu().float().numpy() return f"✅ Done!", (hps.data.sampling_rate, (audio * 32767).astype(np.int16)) except Exception as e: return f"Error: {e}", None def get_random_jp(): return random.choice(["こんにちは!", "お元気ですか?", "先生、お疲れ様です!", "大好きだよ!", "また明日ね。"]) def get_char_info_html(m_id): data = get_vits_model(m_id) if not data: return "" _, _, _, original_names = data html = f"""
🍂 Style-Bert-VITS2 🍂
CHARACTER LIST (ROMAJI)
") with gr.Column(elem_classes="scroll-box"): if chars: for name in chars: btn = gr.Button(f"👤 {name}", elem_classes="char-btn") btn.click(fn=lambda n=name: (n, f"📍 Selected: **{n}**"), outputs=[sel_name, char_display]) gr.HTML("""