# Source: Hugging Face Space "LonewolfT141/Text_To_Speech_Model"
# Space status at capture time: Sleeping
# File size: 2,701 bytes
# Revision: 1166ec4



# ==================================
# 2) IMPORT LIBRARIES
# ==================================
import os
import tempfile

import gradio as gr
import whisper
from zyphra import ZyphraClient  # Assumes the Zyphra package provides this client

# ==================================
# 3) LOAD WHISPER MODEL
# ==================================
# Loaded once at import time; process_media() reuses this module-level instance
# for every request instead of reloading the "base" checkpoint per call.
model = whisper.load_model("base")

# ==================================
# 4) DEFINE PROCESSING FUNCTION
# ==================================
def process_media(media_file):
    """Translate uploaded media to English and synthesize it as speech.

    The media is transcribed and translated into English with the
    module-level Whisper ``model``. The resulting text is sent to the Zyphra
    TTS service through ``ZyphraClient`` and the returned audio bytes are
    written to a temporary ``.wav`` file.

    The Zyphra API key is read from the ``ZYPHRA_API_KEY`` environment
    variable; it must never be committed to source control.

    Args:
        media_file: The upload coming from the Gradio ``File`` input. Either
            a filesystem path, or an object exposing the path as ``.name``.

    Returns:
        A ``(audio_path, text)`` tuple. On success ``audio_path`` is the path
        of the synthesized audio file and ``text`` is the English
        transcription. On any failure ``audio_path`` is ``None`` and ``text``
        is a message starting with ``"Error: "``; no exception propagates to
        the caller.
    """
    # Nothing uploaded: fail early with a readable message instead of letting
    # Whisper raise on a missing input.
    if media_file is None or media_file == "":
        return None, "Error: no media file was provided."

    # Secrets belong in the environment (e.g. a Space secret), not in the
    # source file. Checked before transcription so a misconfigured deployment
    # fails fast instead of after the expensive Whisper pass.
    api_key = os.environ.get("ZYPHRA_API_KEY")
    if not api_key:
        return None, "Error: the ZYPHRA_API_KEY environment variable is not set."

    # Normalize the upload to a plain path string: path-like values are used
    # directly, file-like wrappers carry their path in ``.name``.
    if isinstance(media_file, (str, os.PathLike)):
        media_path = os.fspath(media_file)
    else:
        media_path = getattr(media_file, "name", media_file)

    try:
        # Transcribe and translate the media into English
        result = model.transcribe(media_path, task="translate")
        english_transcription = result["text"]

        # Do not spend a TTS request on silence / empty text.
        if not english_transcription.strip():
            return None, "Error: no speech was detected in the uploaded media."

        # ==================================
        # Zyphra TTS API CALL using ZyphraClient
        # ==================================
        with ZyphraClient(api_key=api_key) as client:
            # Get audio bytes for the given text; adjust speaking_rate if desired.
            audio_data = client.audio.speech.create(
                text=english_transcription,
                speaking_rate=15,
            )

        # delete=False keeps the file on disk after the handle is closed so
        # Gradio can serve it by path; the ``with`` guarantees the handle is
        # closed even if the write fails.
        # NOTE(review): the file is saved with a .wav suffix — confirm the
        # format Zyphra returns by default matches, or request it explicitly.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            temp_audio.write(audio_data)

        return temp_audio.name, english_transcription

    except Exception as e:
        # Top-level boundary for the UI callback: report the failure in the
        # text output rather than crashing the request.
        print("Error during processing:", e)
        return None, f"Error: {e}"

# ==================================
# 5) BUILD GRADIO INTERFACE
# ==================================
# Input widget: a single upload restricted to audio and video files.
_media_upload = gr.File(label="Upload Audio or Video", file_types=["audio", "video"])

# Output widgets, listed in the same order as process_media's return values:
# the synthesized audio file path first, then the English text.
_result_widgets = [
    gr.Audio(type="filepath", label="Synthesized English Audio"),
    gr.Textbox(label="English Subtitles"),
]

# Blurb shown under the title in the web UI.
_app_description = (
    "Upload an audio or video file in any language. The file is transcribed and translated into "
    "English using Whisper, then converted to speech via the Zyphra TTS service using ZyphraClient."
)

# Wire the processing function to its input and output widgets.
interface = gr.Interface(
    title="Multilingual Media to English TTS Pipeline (Zyphra)",
    description=_app_description,
    fn=process_media,
    inputs=_media_upload,
    outputs=_result_widgets,
)

# ==================================
# 6) LAUNCH THE APP
# ==================================
# Start serving the interface; debug=True is forwarded to Gradio's launch().
# NOTE(review): this runs on import as well as on direct execution, since it is
# not wrapped in an `if __name__ == "__main__":` guard — confirm that is intended.
interface.launch(debug=True)