# Project-SEKAI / app.py
# Plana-Archive's picture
# Update app.py
# 8e28b4a verified
import json
import os
import re
import logging
import random
import shutil
import numpy as np
import torch
from torch import no_grad, LongTensor
import commons
import utils
import gradio as gr
from models import SynthesizerTrn
from text import text_to_sequence, _clean_text
from huggingface_hub import hf_hub_download
# --- 1. SETUP LOGGING ---
# Root logger: INFO level, "<time> - <level> - <message>" lines.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- 2. CONFIGURATION & ASSETS ---
# Hub access token taken from the environment; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Hub repository and sub-folder that hold config.json / model.pth / cover.png.
REPO_ID = "Plana-Archive/Plana-TTS"
SUBFOLDER = "Prosekai-TTS/saved_model"
# Inference is pinned to the CPU.
device = torch.device("cpu")
# --- 3. ROMAJI CONVERTER ---
# pykakasi is optional: when it cannot be imported or its converter cannot be
# built, `kks` stays None and names are passed through untransliterated.
try:
    import pykakasi
    kks = pykakasi.kakasi()
except Exception:  # ImportError, or any failure while building the converter
    kks = None
    logging.getLogger(__name__).warning(
        "pykakasi is unavailable; names will not be romanized."
    )


def to_romaji(text):
    """Return a romanized display form of ``text``.

    Args:
        text: Value to convert; it is passed through ``str()`` first.

    Returns:
        ``""`` when ``text`` is falsy or the literal string ``"None"``.
        Otherwise the capitalized ``'hepburn'`` field of every item returned
        by ``kks.convert`` joined together, or ``str(text)`` when the
        converter is unavailable or the conversion raises.
    """
    # Same empty-input rule whether or not pykakasi is installed (the old
    # fallback definition returned "None" for these inputs instead of "").
    if not text or text == "None":
        return ""
    if kks is None:
        return str(text)
    try:
        result = kks.convert(str(text))
        return "".join(item['hepburn'].capitalize() for item in result)
    except Exception:
        # Best effort: an unconvertible name is shown as-is rather than failing.
        return str(text)
# --- 4. LOADING MODEL ---
def _fetch_asset(filename):
    """Fetch ``filename`` from REPO_ID/SUBFOLDER and return what hf_hub_download returns."""
    return hf_hub_download(repo_id=REPO_ID, filename=filename, subfolder=SUBFOLDER, token=HF_TOKEN)


logger.info("[*] Downloading Project Sekai model assets...")
config_path = _fetch_asset("config.json")
model_path = _fetch_asset("model.pth")
cover_path = _fetch_asset("cover.png")

# Hyper-parameters come from the downloaded config file.
hps = utils.get_hparams_from_file(config_path)

# Positional arguments of the synthesizer, derived from the config values.
_n_symbols = len(hps.symbols)
_spec_channels = hps.data.filter_length // 2 + 1
_segment_frames = hps.train.segment_size // hps.data.hop_length

model = SynthesizerTrn(
    _n_symbols,
    _spec_channels,
    _segment_frames,
    n_speakers=hps.data.n_speakers,
    **hps.model,
).to(device)
# Third argument (None) is passed exactly as before; weights are loaded into `model`.
utils.load_checkpoint(model_path, model, None)
model.eval()

# Speaker lists: config names (minus the "None" placeholder entries), their
# display forms, and a lookup from display form back to the config name.
speaker_names = [speaker for speaker in hps.speakers if speaker != "None"]
display_names = list(map(to_romaji, speaker_names))
speaker_map = dict(zip(display_names, speaker_names))
# --- 5. LOGIC FUNCTIONS ---
def get_text(text, hps, is_phoneme):
    """Turn ``text`` into a LongTensor of symbol ids.

    Args:
        text: Input string handed to ``text_to_sequence``.
        hps: Hyper-parameter object providing ``symbols``,
            ``data.text_cleaners`` and ``data.add_blank``.
        is_phoneme: When truthy, an empty cleaner list is used instead of
            ``hps.data.text_cleaners``.

    Returns:
        ``LongTensor`` built from the id sequence; when ``hps.data.add_blank``
        is truthy the sequence is first passed through
        ``commons.intersperse(..., 0)``.
    """
    if is_phoneme:
        cleaners = []
    else:
        cleaners = hps.data.text_cleaners
    token_ids = text_to_sequence(text, hps.symbols, cleaners)
    if not hps.data.add_blank:
        return LongTensor(token_ids)
    return LongTensor(commons.intersperse(token_ids, 0))
def tts_execute(text, speaker_romaji, speed, is_phoneme):
    """Synthesize ``text`` with the selected speaker.

    Args:
        text: Text (or phoneme string) to speak.
        speaker_romaji: Display name of the speaker; must be a key of
            ``speaker_map``.
        speed: Speed factor; the model receives ``length_scale = 1.0 / speed``.
        is_phoneme: Forwarded to ``get_text``.

    Returns:
        ``(status_message, audio)``. On success ``audio`` is
        ``(hps.data.sampling_rate, int16 ndarray)``; on any failure it is
        ``None`` and ``status_message`` describes the problem.
    """
    if not speaker_romaji:
        return "❌ Pilih Karakter dulu!", None
    # Reject blank input up front instead of letting the model raise on it.
    if not text or not str(text).strip():
        return "❌ Input text is empty!", None
    try:
        original_name = speaker_map[speaker_romaji]
        speaker_id = hps.speakers.index(original_name)
        stn_tst = get_text(text, hps, is_phoneme)
        with no_grad():
            x_tst = stn_tst.unsqueeze(0).to(device)
            x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
            sid = LongTensor([speaker_id]).to(device)
            audio = model.infer(
                x_tst,
                x_tst_lengths,
                sid=sid,
                noise_scale=.667,
                noise_scale_w=0.8,
                length_scale=1.0 / speed,
            )[0][0, 0].data.cpu().float().numpy()
        # Clip before scaling: samples outside [-1, 1] would otherwise wrap
        # around when cast to int16.
        pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
        return "✅ Success!", (hps.data.sampling_rate, pcm)
    except Exception as e:
        # UI boundary: report the failure to the user, keep the traceback in the log.
        logger.exception("TTS generation failed")
        return f"Error: {e}", None
# Sample sentences for the "random text" button. The literals were restored
# from mis-decoded UTF-8 (mojibake) to the Japanese text they encode.
_RANDOM_JP_SAMPLES = (
    "こんにちは!",
    "お元気ですか?",
    "ワンダショ、最高!",
    "練習、頑張ろうね。",
    "また明日も会えるかな?",
)


def get_random_jp():
    """Return one of the built-in Japanese sample sentences, chosen at random."""
    return random.choice(_RANDOM_JP_SAMPLES)
def to_phoneme_fn(text):
    """Run ``text`` through ``_clean_text`` with the configured cleaners.

    Args:
        text: Raw input text; may be empty or ``None``.

    Returns:
        ``""`` for empty or ``None`` input, otherwise the result of
        ``_clean_text(text, hps.data.text_cleaners)``.
    """
    # Treat None like an empty string so the cleaner is never called without text.
    if not text:
        return ""
    return _clean_text(text, hps.data.text_cleaners)
# --- 6. UI STYLE ---
# Stylesheet passed to gr.Blocks(css=...). The class names defined here
# (slim-card, scroll-box, char-btn, warning-card, jp-btn, gen-btn,
# credit-footer, ba-header-container, status-*) are referenced by the
# elem_classes arguments and inline HTML of the interface below.
# The two comments inside the string are Indonesian: "light blue colour for
# loading" and "modify the Gradio loading screen".
css = """
:root {
--primary-600: #1299ff !important;
--accent-600: #1299ff !important;
--loader-color: #add8e6 !important; /* Warna Biru Muda untuk Loading */
}
/* Modifikasi Loading Screen Gradio */
.gradio-container .load-overlay {
background: rgba(255, 255, 255, 0.8) !important;
}
.gradio-container .loader {
border-top-color: #add8e6 !important;
}
.ba-header-container { border: 1.5px solid #e1e8f0; border-radius: 12px; padding: 20px 10px; margin-bottom: 12px; background: white; text-align: center; }
.ba-header-container h1 { color: #1299ff !important; font-weight: 700 !important; font-size: 36px !important; margin: 0; }
.status-container { border: 1.5px solid #e1e8f0; border-radius: 12px; padding: 15px 22px; margin-bottom: 20px; background: white; }
.status-title { color: #1299ff !important; font-weight: 800; font-size: 16px; margin-bottom: 8px; }
.text-green-bold { color: #28a745 !important; font-weight: 900 !important; }
.text-blue-status { color: #1299ff !important; }
.slim-card { max-width: 500px; margin: 0 auto; background: transparent; padding: 10px; }
.scroll-box { height: 220px; overflow-y: auto; border: 1px solid #f0f4f8; border-radius: 12px; padding: 10px; background: #fafbfc; margin-bottom: 10px; }
.char-btn { background: white !important; border: 1px solid #e2e8f0 !important; border-left: 5px solid #1299ff !important; text-align: left !important; padding: 8px !important; font-size: 12px !important; margin-bottom: 4px !important; width: 100%; color: #4a5568 !important; }
.warning-card { background: #fff9f0; border: 2px dashed #f5a623; border-radius: 10px; padding: 12px; margin-bottom: 15px; text-align: center; }
.jp-btn { background: #f8fafc !important; border: 1px solid #cbd5e1 !important; color: #475569 !important; font-weight: 700 !important; border-radius: 10px !important; margin-bottom: 10px; font-size: 12px !important; width: 100%; }
.gen-btn { background: #1299ff !important; color: white !important; font-weight: 700 !important; border-radius: 12px !important; height: 45px !important; width: 100%; border: none !important; cursor: pointer; }
.credit-footer { margin-top: 25px; padding: 15px; background: white; border-radius: 12px; text-align: center; border-bottom: 4px solid #1299ff; color: #94a3b8; font-weight: 700; font-size: 12px; letter-spacing: 2px; }
"""
# --- 7. GRADIO INTERFACE ---
# Builds the whole UI: header/status card, cover image, one button per
# speaker, text input with helpers, and the generate button wired to
# tts_execute. Emoji and Japanese literals were restored from mis-decoded
# UTF-8 (mojibake).
# NOTE(review): the container nesting below was reconstructed from the order
# of the components; confirm it matches the intended layout. The character
# button icon was ambiguous after decoding (👤 or 👀); 👤 is assumed.
with gr.Blocks(title="Project Sekai TTS", css=css) as app:
    with gr.Column(elem_classes="slim-card"):
        # Static header and status card (no placeholders, so a plain string).
        gr.HTML("""
<div class="ba-header-container">
<h1>Project Sekai</h1>
<p>💫 VITS Emotional TTS 💫</p>
</div>
<div class="status-container">
<div class="status-title">System Status</div>
<div class="status-item"><span style="color:#4a5568">Model :</span> <span class="text-green-bold">&nbsp;ProSekai Pack ✅</span></div>
<div class="status-item"><span style="color:#4a5568">Device :</span> <span class="text-blue-status">&nbsp;CPU Mode</span></div>
</div>
""")
        # Cover image is shown only when the downloaded file exists.
        if os.path.exists(cover_path):
            gr.Image(cover_path, show_label=False, interactive=False, height=160)

        # Currently selected speaker (display name) and the label that echoes it.
        sel_name = gr.State("")
        char_display = gr.Markdown("📍 *Silakan pilih karakter...*")

        # One button per speaker inside the scrollable box.
        with gr.Column(elem_classes="scroll-box"):
            for name in display_names:
                btn = gr.Button(f"👤 {name}", elem_classes="char-btn")
                # `n=name` binds the current name at definition time; a plain
                # closure would make every button select the last speaker.
                btn.click(
                    fn=lambda n=name: (n, f"📍 Selected: **{n}**"),
                    outputs=[sel_name, char_display],
                )

        with gr.Column():
            gr.HTML("""
<div class="warning-card">
<div style="color:#f5a623;font-weight:800;font-size:13px;">🔖 INSTRUKSI 🔖</div>
<div style="color:#855d1a;font-size:11px;font-weight:600;">
Pilih member Proseka, tulis teks Jepang, lalu klik Generate!
</div>
</div>
""")
            txt_in = gr.TextArea(label="Input Text", value="こんにちは。", lines=3)
            # Replaces the input text with a random sample sentence.
            gr.Button("🎲 RANDOM JAPANESE TEXT 🎲", elem_classes="jp-btn").click(
                get_random_jp, outputs=[txt_in]
            )
            speed = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Speed Audio")

            with gr.Accordion(label="Advanced Options", open=False):
                phoneme_input = gr.Checkbox(value=False, label="Phoneme input")
                to_phoneme_btn = gr.Button("Convert text to phoneme")
                # Lists every model symbol as a sample; no event handler is
                # attached to this component here.
                phoneme_list = gr.Dataset(
                    label="Phoneme list",
                    components=[txt_in],
                    samples=[[x] for x in hps.symbols],
                )
                # Converts the input text in place.
                to_phoneme_btn.click(to_phoneme_fn, [txt_in], [txt_in])

            btn_gen = gr.Button("🎐 GENERATE VOICE 🎐", elem_classes="gen-btn")
            status_out = gr.Textbox(label="Status Message", interactive=False)
            aud_out = gr.Audio(label="Voice Output")
            btn_gen.click(
                fn=tts_execute,
                inputs=[txt_in, sel_name, speed, phoneme_input],
                outputs=[status_out, aud_out],
            )

        gr.HTML("""<div class="credit-footer">🥒 CREATED BY MUTSUMI 🥒</div>""")
# Script entry point: start the Gradio server with fixed host/port settings.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",  # listen on all interfaces
        "server_port": 7860,
        "ssr_mode": False,
    }
    app.launch(**launch_options)