# LonewolfT141's picture
# Create app.py
# 1166ec4 verified
# ==================================
# 2) IMPORT LIBRARIES
# ==================================
import os
import tempfile

import gradio as gr
import whisper
from zyphra import ZyphraClient  # Assumes the Zyphra package provides this client
# ==================================
# 3) LOAD WHISPER MODEL
# ==================================
# Load the Whisper "base" checkpoint once at import time; process_media()
# reads this module-level object instead of reloading it per call.
model = whisper.load_model("base")
# ==================================
# 4) DEFINE PROCESSING FUNCTION
# ==================================
def process_media(media_file):
    """Translate uploaded media to English text and synthesize it as speech.

    The upload is transcribed and translated into English with the
    module-level Whisper ``model``. The resulting text is sent to the Zyphra
    TTS service through a synchronous ``ZyphraClient``, and the returned audio
    bytes are written to a temporary ``.wav`` file.

    The Zyphra API key is read from the ``ZYPHRA_API_KEY`` environment
    variable.

    Args:
        media_file: The uploaded audio/video file. May be a filesystem path,
            or an object exposing the path as ``.name`` (some Gradio
            versions/configurations hand the callback a wrapper object
            rather than a plain path).

    Returns:
        An ``(audio_path, text)`` tuple. On success ``audio_path`` is the
        path of the synthesized audio file and ``text`` is the English
        transcription. On failure ``audio_path`` is ``None`` and ``text`` is
        a message starting with ``"Error: "``; no exception reaches the
        caller.
    """
    if media_file is None:
        return None, "Error: no media file was provided."

    # NOTE: the key must never be embedded in source control. Any key that
    # was previously committed should be treated as leaked and revoked.
    api_key = os.environ.get("ZYPHRA_API_KEY")
    if not api_key:
        # Fail before the (slow) transcription step if TTS cannot run anyway.
        return None, "Error: the ZYPHRA_API_KEY environment variable is not set."

    # Accept both a plain path and a wrapper object carrying the path in .name.
    media_path = getattr(media_file, "name", media_file)

    try:
        # Transcribe and translate the media into English.
        result = model.transcribe(media_path, task="translate")
        english_transcription = result["text"]

        if not english_transcription.strip():
            # Nothing to synthesize; skip the remote TTS call.
            return None, "Error: no speech was detected in the uploaded media."

        with ZyphraClient(api_key=api_key) as client:
            # Request WAV explicitly so the bytes match the ".wav" suffix
            # used below; adjust speaking_rate if desired.
            audio_data = client.audio.speech.create(
                text=english_transcription,
                speaking_rate=15,
                mime_type="audio/wav",
            )

        # delete=False: the file must outlive this function so that Gradio
        # can read it from the returned path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            temp_audio.write(audio_data)

        return temp_audio.name, english_transcription
    except Exception as e:
        # UI boundary: report the failure in the text output instead of
        # crashing the Gradio callback.
        print("Error during processing:", e)
        return None, f"Error: {str(e)}"
# ==================================
# 5) BUILD GRADIO INTERFACE
# ==================================
# Input component: a single upload restricted to audio and video files.
media_input = gr.File(label="Upload Audio or Video", file_types=["audio", "video"])

# Output components, in the order process_media() returns its values:
# the synthesized audio file path first, then the English text.
audio_output = gr.Audio(type="filepath", label="Synthesized English Audio")
subtitles_output = gr.Textbox(label="English Subtitles")

app_description = (
    "Upload an audio or video file in any language. "
    "The file is transcribed and translated into English using Whisper, "
    "then converted to speech via the Zyphra TTS service using ZyphraClient."
)

# Wire the processing function to its input and output components.
interface = gr.Interface(
    fn=process_media,
    inputs=media_input,
    outputs=[audio_output, subtitles_output],
    title="Multilingual Media to English TTS Pipeline (Zyphra)",
    description=app_description,
)
# ==================================
# 6) LAUNCH THE APP
# ==================================
# Start serving the Gradio interface as soon as this script is executed;
# debug=True is passed through to Gradio's launch().
interface.launch(debug=True)