"""Gradio web UI for the Sonya TTS model.

Importing this module loads the synthesizer (from ``checkpoints/`` when both
files are present, otherwise from the Hugging Face Hub) and builds a
two-tab interface: a single-utterance tab and a sentence-by-sentence
long-text tab.  Run the file directly to launch the app.
"""
import os
import re

import gradio as gr
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from phonemizer.backend.espeak.wrapper import EspeakWrapper
from safetensors.torch import load_file
from scipy.io.wavfile import write

from text import text_to_sequence
from text.symbols import symbols
from tts import commons
from tts import utils
from tts.models import SynthesizerTrn

# Windows install location of the eSpeak NG shared library.  When the file is
# absent nothing is configured here.
_ESPEAK_LIBRARY = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
if os.path.exists(_ESPEAK_LIBRARY):
    EspeakWrapper.set_library(_ESPEAK_LIBRARY)
    print(f"✅ Found eSpeak-ng: {_ESPEAK_LIBRARY}")

REPO_ID = "PatnaikAshish/Sonya-TTS"
MODEL_FILENAME = "checkpoints/sonya-tts.safetensors"
CONFIG_FILENAME = "checkpoints/config.json"
LOCAL_MODEL_PATH = "checkpoints/sonya-tts.safetensors"
LOCAL_CONFIG_PATH = "checkpoints/config.json"

device = "cuda" if torch.cuda.is_available() else "cpu"

# Typographic punctuation -> ASCII.  Written as escapes so the mapping cannot
# be silently flattened into no-ops by an editor or encoding normaliser.
_PUNCT_TO_ASCII = str.maketrans({
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark / apostrophe
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
    "\u2013": "-",   # en dash
    "\u2014": "-",   # em dash
})
# Everything outside this allow-list is dropped (brackets and double quotes
# included, since they are not listed).
_DISALLOWED_CHARS = re.compile(r"[^a-zA-Z0-9\s.,!?'\-]")
_WHITESPACE_RUN = re.compile(r"\s+")
# Split after sentence-final punctuation, keeping the punctuation attached.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")

# Audiobook-mode settings.
_LONG_NOISE_SCALE_W = 0.6          # noise_scale_w used for every sentence
_BASE_PAUSE_SECONDS = 0.3          # silence appended after each sentence
_EXTRA_PAUSE_SECONDS = {"?": 0.2, "!": 0.1}  # added on top of the base pause
_MIN_SENTENCE_CHARS = 2            # shorter fragments are skipped


def clean_text_for_vits(text):
    """Normalise *text* to the ASCII subset kept for synthesis.

    Curly quotes and en/em dashes are mapped to ASCII first, so that
    apostrophes and hyphens survive; then every character that is not a
    letter, digit, whitespace or one of ``. , ! ? ' -`` is removed, runs of
    whitespace are collapsed to a single space, and the ends are stripped.

    Returns the cleaned string, which may be empty.
    """
    text = text.strip().translate(_PUNCT_TO_ASCII)
    text = _DISALLOWED_CHARS.sub("", text)
    # Strip again: deleting characters can expose leading/trailing spaces.
    return _WHITESPACE_RUN.sub(" ", text).strip()


def get_text(text, hps):
    """Convert *text* to a ``torch.LongTensor`` of symbol ids.

    The text is cleaned with :func:`clean_text_for_vits`, passed through
    ``text_to_sequence`` with ``hps.data.text_cleaners``, and, when
    ``hps.data.add_blank`` is truthy, interspersed with ``0`` via
    ``commons.intersperse``.
    """
    text = clean_text_for_vits(text)
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    return torch.LongTensor(text_norm)


def split_sentences(text):
    """Clean *text* and split it after each ``.``, ``!`` or ``?``.

    Returns a list of sentence strings, or ``[]`` when nothing is left after
    cleaning.
    """
    text = clean_text_for_vits(text)
    if not text:
        return []
    return _SENTENCE_BOUNDARY.split(text)


print("🔄 Loading Sonya TTS Model...")

# Prefer local checkpoints; fall back to the Hub only when either is missing.
if os.path.exists(LOCAL_MODEL_PATH) and os.path.exists(LOCAL_CONFIG_PATH):
    print("✅ Loading Sonya TTS from local checkpoints...")
    model_path = LOCAL_MODEL_PATH
    config_path = LOCAL_CONFIG_PATH
else:
    print("🌍 Downloading Sonya TTS from Hugging Face...")
    model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME)
    config_path = hf_hub_download(repo_id=REPO_ID, filename=CONFIG_FILENAME)

hps = utils.get_hparams_from_file(config_path)

net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model
).to(device)
net_g.eval()

state_dict = load_file(model_path)
net_g.load_state_dict(state_dict)

print("🎉 Sonya TTS loaded successfully!")


def _synthesize(text, noise_scale, noise_scale_w, length_scale):
    """Run ``net_g.infer`` on *text* and return the waveform as a NumPy array."""
    stn_tst = get_text(text, hps)
    with torch.no_grad():
        x_tst = stn_tst.to(device).unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        output = net_g.infer(
            x_tst,
            x_tst_lengths,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )
    return output[0][0, 0].data.cpu().float().numpy()


def infer_short(text, noise_scale, noise_scale_w, length_scale):
    """Synthesise *text* as a single utterance.

    Returns ``(sampling_rate, audio)`` for the Gradio audio component, or
    ``None`` when *text* is empty or contains nothing that survives
    cleaning, so the model is never called on an empty sequence.
    """
    if not text or not clean_text_for_vits(text):
        return None

    audio = _synthesize(text, noise_scale, noise_scale_w, length_scale)
    return (hps.data.sampling_rate, audio)


def infer_long(text, length_scale, noise_scale):
    """Synthesise *text* sentence by sentence with a pause after each one.

    Each sentence is followed by ``_BASE_PAUSE_SECONDS`` of silence, extended
    for sentences ending in ``?`` or ``!``.  Fragments shorter than
    ``_MIN_SENTENCE_CHARS`` characters are skipped.

    Returns ``(sampling_rate, audio)``, or ``None`` when there is nothing to
    read (blank input, or every sentence was skipped).
    """
    if not text or not text.strip():
        return None

    sample_rate = hps.data.sampling_rate
    audio_chunks = []

    for sent in split_sentences(text):
        sent = sent.strip()
        if len(sent) < _MIN_SENTENCE_CHARS:
            continue

        audio = _synthesize(sent, noise_scale, _LONG_NOISE_SCALE_W, length_scale)

        pause_dur = _BASE_PAUSE_SECONDS + _EXTRA_PAUSE_SECONDS.get(sent[-1], 0.0)
        # Match the audio dtype so concatenation does not upcast the whole
        # narration to float64.
        silence = np.zeros(int(sample_rate * pause_dur), dtype=audio.dtype)

        audio_chunks.append(audio)
        audio_chunks.append(silence)

    # np.concatenate raises ValueError on an empty list.
    if not audio_chunks:
        return None

    final_audio = np.concatenate(audio_chunks)
    return (sample_rate, final_audio)


theme = gr.themes.Soft(
    primary_hue="pink",
    secondary_hue="rose",
    neutral_hue="slate"
).set(
    button_primary_background_fill="linear-gradient(90deg, #ff69b4, #ff1493)",
    button_primary_background_fill_hover="linear-gradient(90deg, #ff1493, #c71585)",
    button_primary_text_color="white",
)

# Adjacent literals keep the stylesheet text unchanged while staying readable.
custom_css = (
    " .banner-container { width: 100%; max-width: 100%; \n"
    "margin: 0 auto 20px auto; display: flex; justify-content: center; "
    "align-items: center; } "
    ".banner-container img { width: 100%; max-width: 1800px; "
    "max-height: 120px; height: auto; object-fit: scale-down; "
    "object-position: center; border-radius: 8px; } "
    ".main-title { text-align: center; color: #ff1493; font-size: 2em; "
    "font-weight: 700; margin: 15px 0 8px 0; } "
    ".subtitle { text-align: center; color: white; font-size: 1.1em; "
    "margin-bottom: 25px; font-weight: 400; } "
    "footer { display: none !important; } "
)

with gr.Blocks(theme=theme, css=custom_css, title="Sonya TTS") as app:

    with gr.Row(elem_classes="banner-container"):
        # The banner is optional; the row stays empty without the image file.
        if os.path.exists("logo.png"):
            gr.Image("logo.png", show_label=False, container=False, elem_classes="banner-img")

    # NOTE(review): the stylesheet defines .main-title and .subtitle, but this
    # markup carries no elements using them - confirm the intended HTML.
    gr.HTML("""
High-fidelity AI speech with emotion, rhythm, and audiobook mode
""")

    with gr.Tabs():

        with gr.TabItem("🎛️ Studio Mode"):
            with gr.Row():
                with gr.Column(scale=2):
                    inp_short = gr.Textbox(
                        label="💬 Input Text",
                        placeholder="Type something for Sonya to say...",
                        lines=4,
                        value="Hello! I am Sonya, your AI voice."
                    )

                    with gr.Accordion("⚙️ Voice Controls", open=True):
                        # Passed to infer_short as noise_scale, noise_scale_w
                        # and length_scale respectively.
                        slider_ns = gr.Slider(0.1, 1.0, value=0.4, label="🎭 Emotion", info="Higher = more expressive")
                        slider_nsw = gr.Slider(0.1, 1.0, value=0.5, label="🎵 Rhythm", info="Higher = looser timing")
                        slider_ls = gr.Slider(0.5, 1.5, value=0.97, label="⏱ Speed", info="Lower = faster, Higher = slower")

                    btn_short = gr.Button("✨ Generate Voice", variant="primary", size="lg")

                with gr.Column(scale=1):
                    out_short = gr.Audio(label="🔊 Sonya's Voice", type="numpy")

            btn_short.click(
                infer_short,
                inputs=[inp_short, slider_ns, slider_nsw, slider_ls],
                outputs=[out_short]
            )

        with gr.TabItem("📖 Audiobook Mode"):
            gr.Markdown(
                """Paste long text. Sonya will read it beautifully with natural pauses.
""",
                elem_classes="audiobook-description"
            )

            with gr.Row():
                with gr.Column(scale=2):
                    inp_long = gr.Textbox(
                        label="📜 Long Text Input",
                        placeholder="Paste your story or article here...",
                        lines=10
                    )

                    with gr.Accordion("⚙️ Narration Settings", open=False):
                        # Passed to infer_long as length_scale and noise_scale.
                        long_ls = gr.Slider(0.5, 1.5, value=1.0, label="⏱ Reading Speed")
                        long_ns = gr.Slider(0.1, 1.0, value=0.5, label="🎭 Tone Variation")

                    btn_long = gr.Button("🎧 Read Aloud", variant="primary", size="lg")

                with gr.Column(scale=1):
                    out_long = gr.Audio(label="📢 Full Narration", type="numpy")

            btn_long.click(
                infer_long,
                inputs=[inp_long, long_ls, long_ns],
                outputs=[out_long]
            )


if __name__ == "__main__":
    app.launch()