File size: 8,880 Bytes
957c16c
 
98c252c
574ec19
 
 
957c16c
 
 
 
 
 
 
 
c999898
957c16c
574ec19
 
 
 
 
98c252c
c999898
 
 
957c16c
574ec19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
957c16c
 
 
 
574ec19
 
 
 
 
 
 
 
98c252c
 
 
 
 
 
 
574ec19
 
 
 
 
 
 
 
 
 
 
 
8e28b4a
 
 
 
 
 
 
 
 
 
 
 
 
 
574ec19
 
 
 
 
 
 
 
 
 
 
 
 
957c16c
5ad521e
574ec19
c4fce1b
574ec19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f2f222
574ec19
 
f596609
5623961
c4fce1b
f596609
5623961
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import json
import os
import re
import logging
import random
import shutil
import numpy as np
import torch
from torch import no_grad, LongTensor
import commons
import utils
import gradio as gr
from models import SynthesizerTrn
from text import text_to_sequence, _clean_text
from huggingface_hub import hf_hub_download

# --- 1. SETUP LOGGING ---
# Configure the root logger once at import time: INFO and above, each record
# prefixed with its timestamp and level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- 2. CONFIGURATION & ASSETS ---
# Access token read from the environment; None when HF_TOKEN is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Hub repository, and the folder inside it, that the model assets are read from.
REPO_ID = "Plana-Archive/Plana-TTS"
SUBFOLDER = "Prosekai-TTS/saved_model"
# The model and all inference tensors are placed on the CPU.
device = torch.device("cpu")

# --- 3. ROMAJI CONVERTER ---
try:
    import pykakasi
    kks = pykakasi.kakasi()
except Exception as exc:
    # pykakasi is optional. Whether the package is missing or its converter
    # fails to initialise, keep the app running with unconverted names, but
    # record why instead of failing silently.
    logging.getLogger(__name__).warning(
        "pykakasi unavailable (%s); names will be shown unconverted", exc
    )
    kks = None


def to_romaji(text):
    """Return *text* converted to capitalized Hepburn romaji.

    Args:
        text: Name to convert. Falsy values and the literal string "None"
            are treated as "no name".

    Returns:
        "" for a missing name; otherwise the concatenation of the capitalized
        ``hepburn`` field of every item produced by the pykakasi converter.
        When pykakasi is not available, or the conversion fails, ``str(text)``
        is returned unchanged.
    """
    # Same "no name" rule whether or not the converter is available.
    if not text or text == "None":
        return ""
    if kks is None:
        return str(text)
    try:
        return "".join(item['hepburn'].capitalize() for item in kks.convert(str(text)))
    except Exception:
        # Best effort: a display label is not worth crashing for.
        logging.getLogger(__name__).warning(
            "Romaji conversion failed for %r", text, exc_info=True
        )
        return str(text)

# --- 4. LOADING MODEL ---
logger.info("[*] Downloading Project Sekai model assets...")
# Resolve the three asset files in a fixed order so the unpacking below lines
# up with the file names.
config_path, model_path, cover_path = (
    hf_hub_download(repo_id=REPO_ID, filename=asset_name, subfolder=SUBFOLDER, token=HF_TOKEN)
    for asset_name in ("config.json", "model.pth", "cover.png")
)

# The hyper-parameters read from the config file determine how the
# synthesizer is built.
hps = utils.get_hparams_from_file(config_path)
model = SynthesizerTrn(
    len(hps.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model,
)
model = model.to(device)

# Load the checkpoint into the model (third argument left as None), then
# switch the module to evaluation mode.
utils.load_checkpoint(model_path, model, None)
model.eval()

# Speaker tables: skip the "None" entries, build the romanized display names,
# and keep a display-name -> original-name lookup for the UI callbacks.
speaker_names = list(filter(lambda name: name != "None", hps.speakers))
display_names = list(map(to_romaji, speaker_names))
speaker_map = dict(zip(display_names, speaker_names))

# --- 5. LOGIC FUNCTIONS ---
def get_text(text, hps, is_phoneme):
    """Convert *text* into a LongTensor of symbol ids.

    Args:
        text: Input string handed to ``text_to_sequence``.
        hps: Hyper-parameter object; ``symbols``, ``data.text_cleaners`` and
            ``data.add_blank`` are read from it.
        is_phoneme: When true, no text cleaners are applied.

    Returns:
        A ``LongTensor`` holding the id sequence, with 0 interspersed through
        ``commons.intersperse`` when ``hps.data.add_blank`` is set.
    """
    # Phoneme input bypasses the configured cleaners.
    cleaners = hps.data.text_cleaners if not is_phoneme else []
    sequence = text_to_sequence(text, hps.symbols, cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return LongTensor(sequence)

def tts_execute(text, speaker_romaji, speed, is_phoneme):
    """Synthesize *text* with the selected speaker and return it for the UI.

    Args:
        text: Text to speak (a phoneme string when *is_phoneme* is true).
        speaker_romaji: Display name of the speaker; a key of ``speaker_map``.
        speed: Speed factor; the model's ``length_scale`` is ``1.0 / speed``.
        is_phoneme: Forwarded to ``get_text``; true skips the text cleaners.

    Returns:
        A ``(status_message, audio)`` pair. On success ``audio`` is
        ``(sampling_rate, int16 sample array)``; otherwise it is ``None`` and
        the message explains why. Errors are reported through the message
        rather than raised.
    """
    if not speaker_romaji:
        return "❌ Pilih Karakter dulu!", None
    # Nothing to synthesize: answer directly instead of sending an empty
    # sequence to the model.
    if not text or not str(text).strip():
        return "❌ Teks tidak boleh kosong!", None
    try:
        original_name = speaker_map[speaker_romaji]
        speaker_id = hps.speakers.index(original_name)
        stn_tst = get_text(text, hps, is_phoneme)
        with no_grad():
            x_tst = stn_tst.unsqueeze(0).to(device)
            x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
            sid = LongTensor([speaker_id]).to(device)
            audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667,
                                noise_scale_w=0.8, length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
        # Clamp to [-1, 1] before scaling: samples outside that range would
        # otherwise wrap around in the int16 conversion.
        pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
        return "✅ Success!", (hps.data.sampling_rate, pcm)
    except Exception as e:
        # UI boundary: keep the app alive, but leave the traceback in the log.
        logging.getLogger(__name__).exception("TTS generation failed")
        return f"Error: {e}", None

# Sample phrases offered by the "random text" button. They are stored as real
# Japanese text; the previous literals were UTF-8 bytes decoded with the wrong
# codepage.
_RANDOM_JP_PHRASES = (
    "こんにちは!",
    "お元気ですか?",
    "ワンダショ、最高!",
    "練習、頑張ろうね。",
    "また明日も会えるかな?",
)

def get_random_jp():
    """Return one of the built-in Japanese sample phrases, chosen at random."""
    return random.choice(_RANDOM_JP_PHRASES)

def to_phoneme_fn(text):
    """Run *text* through the configured text cleaners.

    An empty string is returned as-is without invoking the cleaners.
    """
    if text == "":
        return ""
    return _clean_text(text, hps.data.text_cleaners)

# --- 6. UI STYLE ---
# Stylesheet passed to gr.Blocks(css=...). It overrides the primary/accent
# theme colours and the loading overlay/spinner colours, then defines the
# classes referenced through elem_classes and in the inline HTML of the
# interface: header, status panel, slim card, scrollable character list,
# character buttons, instruction card, action buttons and footer.
# The two CSS comments inside the string are Indonesian: "light blue colour
# for loading" and "Gradio loading screen modification".
css = """
:root { 
    --primary-600: #1299ff !important; 
    --accent-600: #1299ff !important; 
    --loader-color: #add8e6 !important; /* Warna Biru Muda untuk Loading */
}

/* Modifikasi Loading Screen Gradio */
.gradio-container .load-overlay {
    background: rgba(255, 255, 255, 0.8) !important;
}
.gradio-container .loader {
    border-top-color: #add8e6 !important;
}

.ba-header-container { border: 1.5px solid #e1e8f0; border-radius: 12px; padding: 20px 10px; margin-bottom: 12px; background: white; text-align: center; }
.ba-header-container h1 { color: #1299ff !important; font-weight: 700 !important; font-size: 36px !important; margin: 0; }
.status-container { border: 1.5px solid #e1e8f0; border-radius: 12px; padding: 15px 22px; margin-bottom: 20px; background: white; }
.status-title { color: #1299ff !important; font-weight: 800; font-size: 16px; margin-bottom: 8px; }
.text-green-bold { color: #28a745 !important; font-weight: 900 !important; }
.text-blue-status { color: #1299ff !important; }
.slim-card { max-width: 500px; margin: 0 auto; background: transparent; padding: 10px; }
.scroll-box { height: 220px; overflow-y: auto; border: 1px solid #f0f4f8; border-radius: 12px; padding: 10px; background: #fafbfc; margin-bottom: 10px; }
.char-btn { background: white !important; border: 1px solid #e2e8f0 !important; border-left: 5px solid #1299ff !important; text-align: left !important; padding: 8px !important; font-size: 12px !important; margin-bottom: 4px !important; width: 100%; color: #4a5568 !important; }
.warning-card { background: #fff9f0; border: 2px dashed #f5a623; border-radius: 10px; padding: 12px; margin-bottom: 15px; text-align: center; }
.jp-btn { background: #f8fafc !important; border: 1px solid #cbd5e1 !important; color: #475569 !important; font-weight: 700 !important; border-radius: 10px !important; margin-bottom: 10px; font-size: 12px !important; width: 100%; }
.gen-btn { background: #1299ff !important; color: white !important; font-weight: 700 !important; border-radius: 12px !important; height: 45px !important; width: 100%; border: none !important; cursor: pointer; }
.credit-footer { margin-top: 25px; padding: 15px; background: white; border-radius: 12px; text-align: center; border-bottom: 4px solid #1299ff; color: #94a3b8; font-weight: 700; font-size: 12px; letter-spacing: 2px; }
"""

# --- 7. GRADIO INTERFACE ---
# Build the single-page UI. All user-visible emoji and Japanese text below are
# written as real Unicode characters; the previous literals were UTF-8 bytes
# decoded with the wrong codepage and rendered as gibberish.
with gr.Blocks(title="Project Sekai TTS", css=css) as app:
    with gr.Column(elem_classes="slim-card"):
        # Static banner and status panel (nothing is interpolated, so this is
        # a plain string rather than an f-string).
        gr.HTML("""
            <div class="ba-header-container">
                <h1>Project Sekai</h1>
                <p>💫 VITS Emotional TTS 💫</p>
            </div>
            <div class="status-container">
                <div class="status-title">System Status</div>
                <div class="status-item"><span style="color:#4a5568">Model :</span> <span class="text-green-bold">&nbsp;ProSekai Pack ✅</span></div>
                <div class="status-item"><span style="color:#4a5568">Device :</span> <span class="text-blue-status">&nbsp;CPU Mode</span></div>
            </div>
        """)

        # Show the cover image only when the file is actually present.
        if os.path.exists(cover_path):
            gr.Image(cover_path, show_label=False, interactive=False, height=160)

        # sel_name holds the chosen speaker's display name; char_display
        # echoes the current choice to the user.
        sel_name = gr.State("")
        char_display = gr.Markdown("📍 *Silakan pilih karakter...*")

        with gr.Column(elem_classes="scroll-box"):
            for name in display_names:
                btn = gr.Button(f"👤 {name}", elem_classes="char-btn")
                # n=name binds the current name at definition time; without it
                # every button would report the last name of the loop.
                btn.click(fn=lambda n=name: (n, f"📍 Selected: **{n}**"), outputs=[sel_name, char_display])

        with gr.Column():
            gr.HTML("""
                <div class="warning-card">
                    <div style="color:#f5a623;font-weight:800;font-size:13px;">🔖 INSTRUKSI 🔖</div>
                    <div style="color:#855d1a;font-size:11px;font-weight:600;">
                        Pilih member Proseka, tulis teks Jepang, lalu klik Generate!
                    </div>
                </div>
            """)

            txt_in = gr.TextArea(label="Input Text", value="こんにちは。", lines=3)
            # Replaces the input text with a random built-in sample phrase.
            gr.Button("🎲 RANDOM JAPANESE TEXT 🎲", elem_classes="jp-btn").click(get_random_jp, outputs=[txt_in])
            speed = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Speed Audio")

            with gr.Accordion(label="Advanced Options", open=False):
                phoneme_input = gr.Checkbox(value=False, label="Phoneme input")
                to_phoneme_btn = gr.Button("Convert text to phoneme")

                # One sample row per model symbol. No event handler is
                # attached to this dataset in this block.
                phoneme_list = gr.Dataset(
                    label="Phoneme list",
                    components=[txt_in],
                    samples=[[x] for x in hps.symbols]
                )
                # Rewrites the input box in place with its cleaned form.
                to_phoneme_btn.click(to_phoneme_fn, [txt_in], [txt_in])

            btn_gen = gr.Button("🎐 GENERATE VOICE 🎐", elem_classes="gen-btn")
            status_out = gr.Textbox(label="Status Message", interactive=False)
            aud_out = gr.Audio(label="Voice Output")

            # tts_execute returns (status message, audio), matching the two
            # output components in order.
            btn_gen.click(
                fn=tts_execute,
                inputs=[txt_in, sel_name, speed, phoneme_input],
                outputs=[status_out, aud_out]
            )

        gr.HTML("""<div class="credit-footer">🥒 CREATED BY MUTSUMI 🥒</div>""")

if __name__ == "__main__":
    # Start the app only when run as a script: bind to every interface on
    # port 7860, with ssr_mode switched off.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "ssr_mode": False,
    }
    app.launch(**launch_options)