| """ |
| BgTTS-38M Web Server — Gradio Interface |
| ======================================== |
| Voice cloning TTS with Bulgarian + English support. |
| """ |
|
|
| import sys |
| import os |
| import torch |
| import numpy as np |
| import tempfile |
| import time |
| import soundfile as sf |
|
|
| |
| sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) |
|
|
| from config import ( |
| AUDIO_OFFSET, NUM_AUDIO_TOKENS, END_OF_SPEECH_TOKEN_ID, |
| START_OF_SPEECH_TOKEN_ID, CODEC_SAMPLE_RATE, CODEC_FRAME_RATE, |
| ) |
| from tokenizer import TTSTokenizer |
| from codec import CodecV6 |
| from model import load_for_inference |
| from inference import generate, _split_text |
|
|
| |
| MODEL = None |
| TOKENIZER = None |
| CODEC = None |
| DEVICE = "cuda" if torch.cuda.is_available() else "cpu" |
| CHECKPOINT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "checkpoint_inference.pt") |
|
|
|
|
| def load_model(): |
| """Load model, tokenizer, codec once at startup.""" |
| global MODEL, TOKENIZER, CODEC |
| print(f"Loading model from {CHECKPOINT_PATH} on {DEVICE}...") |
| MODEL = load_for_inference(CHECKPOINT_PATH, device=DEVICE) |
| TOKENIZER = TTSTokenizer() |
| CODEC = CodecV6(device=DEVICE) |
| print("Model loaded!") |
|
|
|
|
| def synthesize_speech(text, ref_audio, temperature, top_k, top_p, rep_penalty): |
| """ |
| Generate speech from text using reference audio for voice cloning. |
| |
| Returns: (sample_rate, audio_array) tuple for Gradio |
| """ |
| if not text or not text.strip(): |
| return None |
| |
| if ref_audio is None: |
| return None |
| |
| |
| sr_ref, audio_ref = ref_audio |
| audio_ref = audio_ref.astype(np.float32) |
| if audio_ref.max() > 1.0 or audio_ref.min() < -1.0: |
| audio_ref = audio_ref / max(abs(audio_ref.max()), abs(audio_ref.min())) |
| |
| waveform = torch.from_numpy(audio_ref) |
| if waveform.dim() == 2: |
| waveform = waveform.mean(1) |
| |
| result = CODEC.encode_waveform(waveform, sr_ref) |
| speaker_emb = result['global_embedding'].to(DEVICE) |
| |
| |
| chunks = _split_text(text, TOKENIZER, max_len=250) |
| |
| t0 = time.time() |
| all_codes = [] |
| for chunk in chunks: |
| codes = generate( |
| MODEL, TOKENIZER, chunk, speaker_emb, |
| max_new_tokens=512, |
| temperature=temperature, |
| top_k=int(top_k), |
| top_p=top_p, |
| rep_penalty=rep_penalty, |
| device=DEVICE |
| ) |
| if codes is not None and len(codes) > 0: |
| all_codes.append(codes) |
| |
| gen_time = time.time() - t0 |
| |
| if not all_codes: |
| return None |
| |
| codes = torch.cat(all_codes) |
| audio_dur = len(codes) / CODEC_FRAME_RATE |
| rtf = gen_time / audio_dur if audio_dur > 0 else float('inf') |
| |
| |
| wav = CODEC.decode(codes, speaker_emb) |
| wav_np = wav.numpy() |
| |
| info = f"✅ {len(codes)} tokens | {audio_dur:.1f}s audio | {gen_time:.1f}s gen | RTF: {rtf:.3f}" |
| |
| return (CODEC_SAMPLE_RATE, wav_np), info |
|
|
|
|
| def build_ui(): |
| """Build Gradio interface.""" |
| import gradio as gr |
| |
| with gr.Blocks( |
| title="BgTTS-38M — Bulgarian Text-to-Speech", |
| theme=gr.themes.Soft( |
| primary_hue="blue", |
| secondary_hue="slate", |
| ), |
| css=""" |
| .main-title { text-align: center; margin-bottom: 0.5em; } |
| .subtitle { text-align: center; color: #666; margin-bottom: 1.5em; } |
| """ |
| ) as app: |
| gr.HTML('<h1 class="main-title">🎙️ BgTTS-38M</h1>') |
| gr.HTML('<p class="subtitle">Bulgarian + English Text-to-Speech with Voice Cloning | 38M params | 153MB</p>') |
| |
| with gr.Row(): |
| with gr.Column(scale=2): |
| text_input = gr.Textbox( |
| label="Текст / Text", |
| placeholder="Въведете текст на български или английски...\nEnter text in Bulgarian or English...", |
| lines=5, |
| max_lines=15, |
| ) |
| |
| ref_audio = gr.Audio( |
| label="🎤 Reference Voice (за клониране на глас)", |
| type="numpy", |
| sources=["upload", "microphone"], |
| ) |
| |
| with gr.Row(): |
| generate_btn = gr.Button("🔊 Генерирай / Generate", variant="primary", size="lg") |
| clear_btn = gr.Button("🗑️ Изчисти", size="lg") |
| |
| with gr.Column(scale=1): |
| with gr.Accordion("⚙️ Настройки / Settings", open=False): |
| temperature = gr.Slider( |
| minimum=0.05, maximum=1.5, value=0.3, step=0.05, |
| label="Temperature", |
| info="По-ниска = по-чисто, по-висока = по-разнообразно" |
| ) |
| top_k = gr.Slider( |
| minimum=1, maximum=500, value=250, step=10, |
| label="Top-K" |
| ) |
| top_p = gr.Slider( |
| minimum=0.1, maximum=1.0, value=0.95, step=0.05, |
| label="Top-P (Nucleus)" |
| ) |
| rep_penalty = gr.Slider( |
| minimum=1.0, maximum=2.0, value=1.1, step=0.05, |
| label="Repetition Penalty" |
| ) |
| |
| output_audio = gr.Audio( |
| label="🔊 Резултат / Output", |
| type="numpy", |
| interactive=False, |
| ) |
| |
| info_text = gr.Textbox( |
| label="ℹ️ Информация", |
| interactive=False, |
| lines=2, |
| ) |
| |
| |
| gr.Examples( |
| examples=[ |
| ["Българският език е изключително богат и мелодичен."], |
| ["Artificial intelligence has reached a fascinating stage."], |
| ["Когато говорим за истински multitasking, способността ми да превключвам плавно между български и English е от огромно значение."], |
| ["Здравейте! Казвам се Ани и мога да говоря на български и английски."], |
| ["The quick brown fox jumps over the lazy dog."], |
| ], |
| inputs=[text_input], |
| label="📝 Примери / Examples", |
| ) |
| |
| |
| generate_btn.click( |
| fn=synthesize_speech, |
| inputs=[text_input, ref_audio, temperature, top_k, top_p, rep_penalty], |
| outputs=[output_audio, info_text], |
| ) |
| |
| clear_btn.click( |
| fn=lambda: (None, None, ""), |
| outputs=[text_input, output_audio, info_text], |
| ) |
| |
| return app |
|
|
|
|
| if __name__ == "__main__": |
| import argparse |
| p = argparse.ArgumentParser() |
| p.add_argument("--checkpoint", default=CHECKPOINT_PATH) |
| p.add_argument("--host", default="0.0.0.0") |
| p.add_argument("--port", type=int, default=7860) |
| p.add_argument("--share", action="store_true") |
| p.add_argument("--device", default=DEVICE) |
| args = p.parse_args() |
| |
| CHECKPOINT_PATH = args.checkpoint |
| DEVICE = args.device |
| |
| load_model() |
| app = build_ui() |
| app.launch( |
| server_name=args.host, |
| server_port=args.port, |
| share=args.share, |
| ) |
|
|