import json
import logging
import os
import random
import re
import shutil

import gradio as gr
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from torch import LongTensor, no_grad

import commons
import utils
from models import SynthesizerTrn
from text import _clean_text, text_to_sequence

# --- 1. SETUP LOGGING ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 2. CONFIGURATION & ASSETS ---
# Token is optional: os.getenv returns None when HF_TOKEN is not set.
HF_TOKEN = os.getenv("HF_TOKEN")
REPO_ID = "Plana-Archive/Plana-TTS"
SUBFOLDER = "Prosekai-TTS/saved_model"
device = torch.device("cpu")

# --- 3. ROMAJI CONVERTER ---
# pykakasi is treated as optional. Exactly one of the two to_romaji
# definitions below ends up bound, depending on whether the converter
# could be imported and initialised.
try:
    import pykakasi

    kks = pykakasi.kakasi()

    def to_romaji(text):
        """Return a romanized display name for *text*.

        Each segment returned by ``kks.convert`` contributes its capitalized
        ``'hepburn'`` field; the segments are concatenated without a
        separator. Falsy input and the literal string ``"None"`` yield ``""``.
        If conversion fails, the input is returned unchanged (as ``str``).
        """
        if not text or text == "None":
            return ""
        try:
            result = kks.convert(str(text))
            return "".join(item['hepburn'].capitalize() for item in result)
        except Exception:
            # Best effort: a name that cannot be converted is shown as-is.
            # ``Exception`` (not a bare except) so Ctrl-C / SystemExit propagate.
            return str(text)
except Exception as exc:
    # Import or initialisation failed; keep the app usable with raw names,
    # but say why instead of silently degrading.
    logger.warning("pykakasi unavailable (%s); speaker names will not be romanized", exc)

    def to_romaji(text):
        """Fallback used when pykakasi is unavailable: return *text* as ``str``."""
        return str(text)

# --- 4. LOADING MODEL ---
logger.info("[*] Downloading Project Sekai model assets...")
config_path = hf_hub_download(repo_id=REPO_ID, filename="config.json", subfolder=SUBFOLDER, token=HF_TOKEN)
model_path = hf_hub_download(repo_id=REPO_ID, filename="model.pth", subfolder=SUBFOLDER, token=HF_TOKEN)
cover_path = hf_hub_download(repo_id=REPO_ID, filename="cover.png", subfolder=SUBFOLDER, token=HF_TOKEN)

hps = utils.get_hparams_from_file(config_path)
# Positional arguments are derived from the config: symbol count,
# filter_length // 2 + 1, and segment_size // hop_length.
# NOTE(review): presumably vocabulary size, spectrogram bins and segment
# length in frames -- confirm against SynthesizerTrn's signature.
model = SynthesizerTrn(
    len(hps.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model,
).to(device)
# Third argument is passed as None (presumably the optimizer slot -- confirm).
utils.load_checkpoint(model_path, model, None)
model.eval()

# Speaker entries literally named "None" are hidden from the UI.
speaker_names = [name for name in hps.speakers if name != "None"]
display_names = [to_romaji(name) for name in speaker_names]
# Display name -> original speaker name. If two speakers romanize to the same
# string, the later one overwrites the earlier one in this mapping.
speaker_map = dict(zip(display_names, speaker_names))
LOGIC FUNCTIONS --- def get_text(text, hps, is_phoneme): text_norm = text_to_sequence(text, hps.symbols, [] if is_phoneme else hps.data.text_cleaners) if hps.data.add_blank: text_norm = commons.intersperse(text_norm, 0) return LongTensor(text_norm) def tts_execute(text, speaker_romaji, speed, is_phoneme): if not speaker_romaji: return "❌ Pilih Karakter dulu!", None try: original_name = speaker_map[speaker_romaji] speaker_id = hps.speakers.index(original_name) stn_tst = get_text(text, hps, is_phoneme) with no_grad(): x_tst = stn_tst.unsqueeze(0).to(device) x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device) sid = LongTensor([speaker_id]).to(device) audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy() return "✅ Success!", (hps.data.sampling_rate, (audio * 32767).astype(np.int16)) except Exception as e: return f"Error: {e}", None def get_random_jp(): return random.choice(["こんにちは!", "お元気ですか?", "ワンダショ、最高!", "練習、頑張ろうね。", "また明日も会えるかな?"]) def to_phoneme_fn(text): return _clean_text(text, hps.data.text_cleaners) if text != "" else "" # --- 6. 
# --- 6. UI STYLE ---
# Stylesheet handed to gr.Blocks(css=...). The literal is kept exactly as
# found (single-spaced, with one embedded line break) because it is runtime
# text, not formatting.
css = """ :root { --primary-600: #1299ff !important; --accent-600: #1299ff !important; --loader-color: #add8e6 !important; /* Warna Biru Muda untuk Loading */ } /* Modifikasi Loading Screen Gradio */ .gradio-container .load-overlay { background: rgba(255, 255, 255, 0.8) !important; } .gradio-container .loader { border-top-color: #add8e6 !important; } .ba-header-container { border: 1.5px solid #e1e8f0; border-radius: 12px; padding: 20px 10px; margin-bottom: 12px; background: white; text-align: center; } .ba-header-container h1 { color: #1299ff !important; font-weight: 700 !important; font-size: 36px !important; margin: 0; } .status-container { border: 1.5px solid #e1e8f0; border-radius: 12px; padding: 15px 22px; margin-bottom: 20px; background: white; } .status-title { color: #1299ff !important; font-weight: 800; font-size: 16px; margin-bottom: 8px; } .text-green-bold { color: #28a745 !important; font-weight: 900 !important; } .text-blue-status { color: #1299ff !important; } .slim-card { max-width: 500px; margin: 0 auto; background: transparent; padding: 10px; } .scroll-box { height: 220px; overflow-y: auto; border: 1px solid #f0f4f8; border-radius: 12px; padding: 10px; background: #fafbfc; margin-bottom: 10px; } .char-btn { background: white !important; border: 1px solid #e2e8f0 !important; border-left: 5px solid #1299ff !important; text-align: left !important; padding: 8px !important; font-size: 12px !important; margin-bottom: 4px !important; width: 100%; color: #4a5568 !important; } .warning-card { background: #fff9f0; border: 2px dashed #f5a623; border-radius: 10px; padding: 12px; margin-bottom: 15px; text-align: center; } .jp-btn { background: #f8fafc !important; border: 1px solid #cbd5e1 !important; color: #475569 !important; font-weight: 700 !important; border-radius: 10px !important; margin-bottom: 10px; font-size: 12px !important; width: 100%; } .gen-btn { background: #1299ff !important; color: white !important; font-weight: 700 !important; 
border-radius: 12px !important; height: 45px !important; width: 100%; border: none !important; cursor: pointer; } .credit-footer { margin-top: 25px; padding: 15px; background: white; border-radius: 12px; text-align: center; border-bottom: 4px solid #1299ff; color: #94a3b8; font-weight: 700; font-size: 12px; letter-spacing: 2px; } """

# --- 7. GRADIO INTERFACE ---
# NOTE(review): the file's whitespace was collapsed, so the nesting of the
# `with` blocks below is reconstructed from statement order -- confirm the
# intended layout.
# NOTE(review): the stylesheet defines classes such as .ba-header-container,
# .status-container, .warning-card and .credit-footer, but the gr.HTML strings
# below contain no markup that references them (the last one is empty). The
# HTML tags look like they were stripped at some point -- confirm against the
# original file.
with gr.Blocks(title="Project Sekai TTS", css=css) as app:
    with gr.Column(elem_classes="slim-card"):
        # Header / status panel.
        gr.HTML(f"""

Project Sekai

💫 VITS Emotional TTS 💫

System Status
Model :  ProSekai Pack ✅
Device :  CPU Mode
""")
        # The cover image is shown only if the downloaded file exists on disk.
        if os.path.exists(cover_path):
            gr.Image(cover_path, show_label=False, interactive=False, height=160)
        # Holds the display name of the currently selected character;
        # starts empty, which tts_execute reports as "no character selected".
        sel_name = gr.State("")
        char_display = gr.Markdown("📍 *Silakan pilih karakter...*")
        with gr.Column(elem_classes="scroll-box"):
            # One button per speaker display name.
            for name in display_names:
                btn = gr.Button(f"👤 {name}", elem_classes="char-btn")
                # `n=name` binds the current name as a default argument, so each
                # button returns its own name rather than the loop's last value.
                btn.click(fn=lambda n=name: (n, f"📍 Selected: **{n}**"), outputs=[sel_name, char_display])
        with gr.Column():
            # Usage instructions.
            gr.HTML("""
🔖 INSTRUKSI 🔖
Pilih member Proseka, tulis teks Jepang, lalu klik Generate!
""")
            txt_in = gr.TextArea(label="Input Text", value="こんにちは。", lines=3)
            # Replaces the input text with a random sample sentence.
            gr.Button("🎲 RANDOM JAPANESE TEXT 🎲", elem_classes="jp-btn").click(get_random_jp, outputs=[txt_in])
            speed = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Speed Audio")
            with gr.Accordion(label="Advanced Options", open=False):
                phoneme_input = gr.Checkbox(value=False, label="Phoneme input")
                to_phoneme_btn = gr.Button("Convert text to phoneme")
                # One sample row per model symbol. No event handler is attached
                # to this Dataset anywhere in this block.
                phoneme_list = gr.Dataset(
                    label="Phoneme list",
                    components=[txt_in],
                    samples=[[x] for x in hps.symbols]
                )
            # Rewrites the input box in place with its cleaned form.
            to_phoneme_btn.click(to_phoneme_fn, [txt_in], [txt_in])
            btn_gen = gr.Button("🎐 GENERATE VOICE 🎐", elem_classes="gen-btn")
            status_out = gr.Textbox(label="Status Message", interactive=False)
            aud_out = gr.Audio(label="Voice Output")
            # tts_execute returns (status message, audio), matching the two outputs.
            btn_gen.click(
                fn=tts_execute,
                inputs=[txt_in, sel_name, speed, phoneme_input],
                outputs=[status_out, aud_out]
            )
            gr.HTML("""""")

if __name__ == "__main__":
    # Listen on all interfaces at port 7860, with ssr_mode turned off.
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False
    )