| import gradio as gr | |
| import os | |
| import re | |
| import torch | |
| import numpy as np | |
| from scipy.io.wavfile import write | |
| from phonemizer.backend.espeak.wrapper import EspeakWrapper | |
| from safetensors.torch import load_file | |
| from huggingface_hub import hf_hub_download | |
| from tts import commons | |
| from tts import utils | |
| from tts.models import SynthesizerTrn | |
| from text.symbols import symbols | |
| from text import text_to_sequence | |
| _ESPEAK_LIBRARY = r"C:\Program Files\eSpeak NG\libespeak-ng.dll" | |
| if os.path.exists(_ESPEAK_LIBRARY): | |
| EspeakWrapper.set_library(_ESPEAK_LIBRARY) | |
| print(f"β Found eSpeak-ng: {_ESPEAK_LIBRARY}") | |
| REPO_ID = "PatnaikAshish/Sonya-TTS" | |
| MODEL_FILENAME = "checkpoints/sonya-tts.safetensors" | |
| CONFIG_FILENAME = "checkpoints/config.json" | |
| LOCAL_MODEL_PATH = "checkpoints/sonya-tts.safetensors" | |
| LOCAL_CONFIG_PATH = "checkpoints/config.json" | |
| device = "cuda" if torch.cuda.is_available() else "cpu" | |
| def clean_text_for_vits(text): | |
| text = text.strip() | |
| text = text.replace("'", "'") | |
| text = text.replace(""", '"').replace(""", '"') | |
| text = text.replace("β", "-").replace("β", "-") | |
| text = re.sub(r"[()\[\]{}<>]", "", text) | |
| text = re.sub(r"[^a-zA-Z0-9\s.,!?'\-]", "", text) | |
| text = re.sub(r"\s+", " ", text) | |
| return text | |
| def get_text(text, hps): | |
| text = clean_text_for_vits(text) | |
| text_norm = text_to_sequence(text, hps.data.text_cleaners) | |
| if hps.data.add_blank: | |
| text_norm = commons.intersperse(text_norm, 0) | |
| return torch.LongTensor(text_norm) | |
| def split_sentences(text): | |
| text = clean_text_for_vits(text) | |
| if not text: | |
| return [] | |
| return re.split(r'(?<=[.!?])\s+', text) | |
| print("π Loading Sonya TTS Model...") | |
| if os.path.exists(LOCAL_MODEL_PATH) and os.path.exists(LOCAL_CONFIG_PATH): | |
| print("β Loading Sonya TTS from local checkpoints...") | |
| model_path = LOCAL_MODEL_PATH | |
| config_path = LOCAL_CONFIG_PATH | |
| else: | |
| print("π Downloading Sonya TTS from Hugging Face...") | |
| model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME) | |
| config_path = hf_hub_download(repo_id=REPO_ID, filename=CONFIG_FILENAME) | |
| hps = utils.get_hparams_from_file(config_path) | |
| net_g = SynthesizerTrn( | |
| len(symbols), | |
| hps.data.filter_length // 2 + 1, | |
| hps.train.segment_size // hps.data.hop_length, | |
| **hps.model | |
| ).to(device) | |
| net_g.eval() | |
| state_dict = load_file(model_path) | |
| net_g.load_state_dict(state_dict) | |
| print("π Sonya TTS loaded successfully!") | |
| def infer_short(text, noise_scale, noise_scale_w, length_scale): | |
| if not text.strip(): | |
| return None | |
| stn_tst = get_text(text, hps) | |
| with torch.no_grad(): | |
| x_tst = stn_tst.to(device).unsqueeze(0) | |
| x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device) | |
| audio = net_g.infer( | |
| x_tst, | |
| x_tst_lengths, | |
| noise_scale=noise_scale, | |
| noise_scale_w=noise_scale_w, | |
| length_scale=length_scale | |
| )[0][0,0].data.cpu().float().numpy() | |
| return (hps.data.sampling_rate, audio) | |
| def infer_long(text, length_scale, noise_scale): | |
| if not text.strip(): | |
| return None | |
| sentences = split_sentences(text) | |
| audio_chunks = [] | |
| fixed_noise_w = 0.6 | |
| base_pause = 0.3 | |
| for sent in sentences: | |
| if len(sent.strip()) < 2: | |
| continue | |
| stn_tst = get_text(sent, hps) | |
| with torch.no_grad(): | |
| x_tst = stn_tst.to(device).unsqueeze(0) | |
| x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device) | |
| audio = net_g.infer( | |
| x_tst, | |
| x_tst_lengths, | |
| noise_scale=noise_scale, | |
| noise_scale_w=fixed_noise_w, | |
| length_scale=length_scale | |
| )[0][0,0].data.cpu().float().numpy() | |
| if sent.endswith("?"): | |
| pause_dur = base_pause + 0.2 | |
| elif sent.endswith("!"): | |
| pause_dur = base_pause + 0.1 | |
| else: | |
| pause_dur = base_pause | |
| silence = np.zeros(int(hps.data.sampling_rate * pause_dur)) | |
| audio_chunks.append(audio) | |
| audio_chunks.append(silence) | |
| final_audio = np.concatenate(audio_chunks) | |
| return (hps.data.sampling_rate, final_audio) | |
| theme = gr.themes.Soft( | |
| primary_hue="pink", | |
| secondary_hue="rose", | |
| neutral_hue="slate" | |
| ).set( | |
| button_primary_background_fill="linear-gradient(90deg, #ff69b4, #ff1493)", | |
| button_primary_background_fill_hover="linear-gradient(90deg, #ff1493, #c71585)", | |
| button_primary_text_color="white", | |
| ) | |
| custom_css = """ | |
| .banner-container { | |
| width: 100%; | |
| max-width: 100%; | |
| margin: 0 auto 20px auto; | |
| display: flex; | |
| justify-content: center; | |
| align-items: center; | |
| } | |
| .banner-container img { | |
| width: 100%; | |
| max-width: 1800px; | |
| max-height: 120px; | |
| height: auto; | |
| object-fit: scale-down; | |
| object-position: center; | |
| border-radius: 8px; | |
| } | |
| .main-title { | |
| text-align: center; | |
| color: #ff1493; | |
| font-size: 2em; | |
| font-weight: 700; | |
| margin: 15px 0 8px 0; | |
| } | |
| .subtitle { | |
| text-align: center; | |
| color: white; | |
| font-size: 1.1em; | |
| margin-bottom: 25px; | |
| font-weight: 400; | |
| } | |
| footer { | |
| display: none !important; | |
| } | |
| """ | |
| with gr.Blocks(theme=theme, css=custom_css, title="Sonya TTS") as app: | |
| with gr.Row(elem_classes="banner-container"): | |
| if os.path.exists("logo.png"): | |
| gr.Image("logo.png", show_label=False, container=False, elem_classes="banner-img") | |
| gr.HTML(""" | |
| <h1 class="main-title">β¨ Sonya TTS β A Beautiful, Expressive Neural Voice Engine</h1> | |
| <p class="subtitle">High-fidelity AI speech with emotion, rhythm, and audiobook mode</p> | |
| """) | |
| with gr.Tabs(): | |
| with gr.TabItem("ποΈ Studio Mode"): | |
| with gr.Row(): | |
| with gr.Column(scale=2): | |
| inp_short = gr.Textbox( | |
| label="π¬ Input Text", | |
| placeholder="Type something for Sonya to say...", | |
| lines=4, | |
| value="Hello! I am Sonya, your AI voice." | |
| ) | |
| with gr.Accordion("βοΈ Voice Controls", open=True): | |
| slider_ns = gr.Slider(0.1, 1.0, value=0.4, label="π Emotion", info="Higher = more expressive") | |
| slider_nsw = gr.Slider(0.1, 1.0, value=0.5, label="π΅ Rhythm", info="Higher = looser timing") | |
| slider_ls = gr.Slider(0.5, 1.5, value=0.97, label="β± Speed", info="Lower = faster, Higher = slower") | |
| btn_short = gr.Button("β¨ Generate Voice", variant="primary", size="lg") | |
| with gr.Column(scale=1): | |
| out_short = gr.Audio(label="π Sonya's Voice", type="numpy") | |
| btn_short.click( | |
| infer_short, | |
| inputs=[inp_short, slider_ns, slider_nsw, slider_ls], | |
| outputs=[out_short] | |
| ) | |
| with gr.TabItem("π Audiobook Mode"): | |
| gr.Markdown( | |
| """<p style='text-align: center; color: #666; font-size: 1.05em;'> | |
| Paste long text. Sonya will read it beautifully with natural pauses. | |
| </p>""", | |
| elem_classes="audiobook-description" | |
| ) | |
| with gr.Row(): | |
| with gr.Column(scale=2): | |
| inp_long = gr.Textbox( | |
| label="π Long Text Input", | |
| placeholder="Paste your story or article here...", | |
| lines=10 | |
| ) | |
| with gr.Accordion("βοΈ Narration Settings", open=False): | |
| long_ls = gr.Slider(0.5, 1.5, value=1.0, label="β± Reading Speed") | |
| long_ns = gr.Slider(0.1, 1.0, value=0.5, label="π Tone Variation") | |
| btn_long = gr.Button("π§ Read Aloud", variant="primary", size="lg") | |
| with gr.Column(scale=1): | |
| out_long = gr.Audio(label="π’ Full Narration", type="numpy") | |
| btn_long.click( | |
| infer_long, | |
| inputs=[inp_long, long_ls, long_ns], | |
| outputs=[out_long] | |
| ) | |
| if __name__ == "__main__": | |
| app.launch() |