|
|
import contextlib
import os
import tempfile

import gradio as gr
from TTS.api import TTS
|
|
|
|
|
|
|
|
# Identifier of the Coqui TTS model loaded at import time.
model_name = "tts_models/en/vctk/vits"

# Single shared synthesizer instance, reused by every request handler below.
tts = TTS(model_name=model_name)
|
|
|
|
|
|
|
|
# Maps a model speaker id to the human-readable description shown in the UI.
# NOTE(review): the descriptions are hand-written; confirm they match the
# voices the model actually produces for these ids.
speaker_labels = dict(
    p225="Male, Young Adult",
    p226="Female, Middle-Aged",
    p227="Male, Mature Storyteller",
    p228="Female, Young Adult",
    p229="Male, Elderly Narrator",
    p230="Female, Warm Storyteller",
    p231="Male, Deep Voice",
    p232="Female, Clear Articulation",
    p233="Male, Authoritative",
    p234="Female, Gentle Storyteller",
)
|
|
|
|
|
|
|
|
# Speakers offered in the UI: those the loaded model reports AND that have a
# label in ``speaker_labels``. ``tts.speakers`` may be None when the loaded
# model is not multi-speaker, so fall back to an empty list instead of
# raising TypeError at import time.
available_speakers = [
    spk for spk in (tts.speakers or []) if spk in speaker_labels
]
|
|
|
|
|
def text_to_speech(text, speaker_name, speed, pitch):
    """Synthesize ``text`` to a temporary WAV file and return its path.

    Args:
        text: Text to narrate. ``None``, empty, or whitespace-only input is
            rejected.
        speaker_name: Speaker id forwarded to ``tts.tts_to_file`` as
            ``speaker``.
        speed: Speaking-rate value forwarded to ``tts.tts_to_file`` as
            ``speed``.
        pitch: Pitch shift forwarded to ``sox.Transformer.pitch``; a falsy
            value (``0``/``None``) skips the pitch step. The UI presents this
            value as semitones.

    Returns:
        Path of the generated ``.wav`` file. The file is not deleted here;
        the caller (Gradio) serves it.

    Raises:
        gr.Error: On invalid input or any synthesis / pitch-shift failure.
            The original exception is attached as ``__cause__``.
    """
    output_path = None
    try:
        if not text or not text.strip():
            raise ValueError("Please enter some text")

        # Only reserve a unique path here. The handle is closed before the
        # TTS engine writes to it, because writing by name to a file that is
        # still open is not portable (it fails on Windows).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            output_path = f.name

        tts.tts_to_file(
            text=text,
            speaker=speaker_name,
            file_path=output_path,
            speed=speed,
        )

        if pitch:
            try:
                # Optional dependency: pitch shifting is best-effort.
                import sox
            except ImportError:
                print("Sox not installed; skipping pitch adjustment.")
            else:
                adjusted_file = output_path + "_adjusted.wav"
                try:
                    tfm = sox.Transformer()
                    tfm.pitch(pitch)
                    tfm.build_file(output_path, adjusted_file)
                    os.replace(adjusted_file, output_path)
                finally:
                    # After a successful os.replace the intermediate file no
                    # longer exists; after a failure it may be left behind.
                    with contextlib.suppress(OSError):
                        os.remove(adjusted_file)

        return output_path

    except Exception as e:
        # Do not leak the temporary WAV when generation fails.
        if output_path is not None:
            with contextlib.suppress(OSError):
                os.remove(output_path)
        raise gr.Error(f"Error generating speech: {str(e)}") from e
|
|
|
|
|
def create_download_link(audio_file):
    """Return a Gradio update that shows or hides the download button.

    Args:
        audio_file: Path of the generated audio file, or a falsy value when
            nothing was generated.

    Returns:
        ``gr.update(visible=True, value=audio_file)`` when ``audio_file`` is
        truthy and exists on disk, otherwise ``gr.update(visible=False)``.
    """
    file_is_ready = bool(audio_file) and os.path.exists(audio_file)
    if not file_is_ready:
        return gr.update(visible=False)
    return gr.update(visible=True, value=audio_file)
|
|
|
|
|
# Build the Gradio UI: text + voice controls on the left, generated audio and
# download button on the right, followed by event wiring and examples.
with gr.Blocks(title="Storytelling TTS App") as app:
    gr.Markdown("# 🎙️ Professional Storytelling Text-to-Speech")
    gr.Markdown("Convert your text into narrated audio using expressive voices. Ideal for audiobooks, storytelling, and podcast narration.")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter your story text",
                lines=8,
                placeholder="Once upon a time...",
            )

            # gr.Dropdown has no ``format_func`` argument; display labels are
            # supplied as (label, value) tuples so the handler still receives
            # the raw speaker id.
            _speaker_choices = [
                (speaker_labels[spk], spk) for spk in available_speakers
            ]
            # Fall back to the first available speaker when the preferred
            # default is not offered by the loaded model.
            if "p227" in available_speakers:
                _default_speaker = "p227"
            elif available_speakers:
                _default_speaker = available_speakers[0]
            else:
                _default_speaker = None

            speaker = gr.Dropdown(
                choices=_speaker_choices,
                label="Narrator Voice",
                value=_default_speaker,
            )

            with gr.Accordion("🎚️ Voice Adjustment", open=True):
                speed = gr.Slider(
                    minimum=0.5, maximum=2.0,
                    value=1.0, step=0.1,
                    label="Speaking Rate",
                    info="1.0 = normal speed",
                )
                pitch = gr.Slider(
                    minimum=-5.0, maximum=5.0,
                    value=0.0, step=0.5,
                    label="Pitch Shift (in semitones)",
                    info="0 = normal, positive = higher pitch",
                )

            generate_btn = gr.Button("🎧 Generate Narration", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Narration",
                type="filepath",
                elem_classes=["output-audio"],
            )
            # Hidden until create_download_link reports a generated file.
            download_button = gr.DownloadButton(
                label="Download Audio", visible=False
            )

    with gr.Accordion("🎤 Preview Narrator Voices (Samples Coming Soon)", open=False):
        gr.Markdown("Previews will be available here once sample audios are added.")
        # Placeholder players for the first three voices; kept invisible
        # because no sample audio is attached yet.
        for speaker_id in available_speakers[:3]:
            gr.Audio(
                value=None,
                label=speaker_labels[speaker_id],
                visible=False,
            )

    # Generate the narration, then refresh the download button from the
    # resulting audio path.
    generate_btn.click(
        fn=text_to_speech,
        inputs=[text_input, speaker, speed, pitch],
        outputs=audio_output,
    ).then(
        fn=create_download_link,
        inputs=audio_output,
        outputs=download_button,
    )

    gr.Examples(
        examples=[
            ["The old man sat by the fireplace, his eyes twinkling with memories of adventures past.", "p227", 0.9, 0.0],
            ["In a quiet village nestled between the mountains, a young girl discovered a secret that would change everything.", "p234", 1.0, 0.5],
            ["The detective examined the clue carefully, knowing this small piece of evidence could crack the entire case wide open.", "p231", 1.1, -1.0],
        ],
        inputs=[text_input, speaker, speed, pitch],
        outputs=audio_output,
        fn=text_to_speech,
        cache_examples=False,
    )
|
|
|
|
|
if __name__ == "__main__":
    # Probe for the optional ``sox`` package up front so a missing install is
    # reported at startup; the app itself still launches without it.
    try:
        import sox  # noqa: F401
    except ImportError:
        sox_hint = "Consider installing sox for pitch adjustment: pip install sox"
        print(sox_hint)

    app.launch()