# NOTE(review): this file was recovered from a HuggingFace Spaces page that was
# showing "Runtime error"; the scrape artifacts have been cleaned up below.
import time
from datetime import datetime

import gradio as gr
import numpy as np
import torch
import torchaudio
from transformers import pipeline

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice, load_voices
from tortoise.utils.text import split_and_recombine_text
# --- Speech-to-text initialization ---
# Smallest Whisper checkpoint; the HF pipeline accepts an audio file path
# directly and returns {"text": ...}.
model_id = "openai/whisper-tiny"
pipe = pipeline("automatic-speech-recognition", model=model_id)
# --- TTS voice presets ---
# Names are resolved by tortoise's load_voice() against the voices directory.
# Normalized to a consistent "F"/"M" capitalization: the original list mixed
# "indian_f_1" with "indian_F_2"/"indian_M_1", so the default voice used by
# convert_audio ("indian_F_1") was not a member of this list at all.
# NOTE(review): the on-disk voice folder names must match these strings —
# confirm against the Space's voices/ directory.
VOICE_OPTIONS = [
    "indian_F_1", "indian_F_2", "indian_F_3",
    "indian_M_1", "indian_M_2", "indian_M_3",
]
# --- TTS engine ---
# kv_cache + half precision trade memory for speed; deepspeed accelerates
# inference when available.
# NOTE(review): use_deepspeed=True fails at startup if deepspeed is not
# installed — confirm it is listed in the Space's requirements.
tts = TextToSpeech(half=True, kv_cache=True, use_deepspeed=True)
def convert_audio(filepath, voice="indian_F_1"):
    """Transcribe an uploaded audio file, then re-speak it in *voice*.

    Parameters
    ----------
    filepath : str
        Path to the uploaded audio file (gr.Audio with type="filepath").
    voice : str
        A tortoise voice preset name; one of VOICE_OPTIONS.

    Yields
    ------
    tuple[int, numpy.ndarray]
        (sample_rate, audio_chunk) pairs for Gradio's streaming Audio
        output. Tortoise synthesizes at 24 kHz.
    """
    # Speech-to-text. NOTE(review): the scraped source was truncated here
    # (only a stray ")" survived); this call reconstructs the obvious
    # intent — its absence is the likely cause of the Space's runtime error.
    transcription_output = pipe(filepath)
    transcribed_text = transcription_output["text"]

    # Tortoise works best on short chunks; split long transcripts first.
    texts = split_and_recombine_text(transcribed_text)
    voice_samples, conditioning_latents = load_voice(voice)

    for text in texts:
        # Pass the conditioning latents too — the original loaded them but
        # never forwarded them to tts_with_preset.
        for audio_frame in tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            k=1,
        ):
            # Stream each frame as it is produced. The original accumulated
            # frames into final_audio but never returned it, so the UI
            # received None; yielding matches the streaming Audio output.
            yield (24000, audio_frame.cpu().detach().numpy())
# --- Gradio UI: upload audio -> transcribe -> re-synthesize in chosen voice ---
# NOTE(review): gr.Audio(source=...) was removed in Gradio 4 in favor of
# sources=["upload"]; confirm the pinned gradio version before changing.
interface = gr.Interface(
    fn=convert_audio,
    inputs=[
        gr.Audio(source="upload", type="filepath"),
        # Default fixed from "indian_f_1" (not a valid option given the
        # F/M capitalization used elsewhere) to match convert_audio's default.
        gr.Dropdown(VOICE_OPTIONS, value="indian_F_1", label="Select voice:", type="value"),
    ],
    outputs=gr.Audio(label="streaming audio:", streaming=True, autoplay=True),
    title="STT to TTS",
    description="Convert spoken words into a different voice",
)

interface.launch()