Spaces:
Sleeping
Sleeping
| # ================================== | |
| # 2) IMPORT LIBRARIES | |
| # ================================== | |
import os
import tempfile

import gradio as gr
import whisper
from zyphra import ZyphraClient  # Assumes the Zyphra package provides this client
| # ================================== | |
| # 3) LOAD WHISPER MODEL | |
| # ================================== | |
# Load the Whisper "base" checkpoint once, at module import, so it is shared
# by every call that transcribes media.
model = whisper.load_model(name="base")
| # ================================== | |
| # 4) DEFINE PROCESSING FUNCTION | |
| # ================================== | |
def process_media(media_file):
    """
    Translate uploaded media into English and synthesize it as speech.

    This function:
    - Transcribes and translates the uploaded audio/video into English using Whisper.
    - Uses ZyphraClient (synchronous) to convert the English text to speech.
    - Returns both the synthesized audio and the English subtitles.

    The Zyphra API key is read from the ``ZYPHRA_API_KEY`` environment
    variable; it must never be embedded in source.

    Args:
        media_file: Path to the uploaded media, or an upload wrapper object
            that exposes the on-disk path as ``.name`` (some Gradio versions
            pass such an object from ``gr.File`` instead of a plain path).

    Returns:
        A ``(audio_path, text)`` tuple. On success ``audio_path`` is the path
        of a temporary file holding the synthesized audio and ``text`` is the
        English transcription. On any failure ``audio_path`` is ``None`` and
        ``text`` is a message starting with ``"Error: "``.
    """
    # Guard: nothing was uploaded.
    if media_file is None:
        return None, "Error: no media file was provided."

    # Fail fast on missing credentials, before the expensive transcription.
    api_key = os.environ.get("ZYPHRA_API_KEY")
    if not api_key:
        return None, "Error: the ZYPHRA_API_KEY environment variable is not set."

    # Whisper needs a path; unwrap upload objects that carry it in `.name`.
    if isinstance(media_file, (str, os.PathLike)):
        media_path = media_file
    else:
        media_path = getattr(media_file, "name", media_file)

    # Broad catch is deliberate: this is the UI boundary, and any failure is
    # reported to the user as text instead of crashing the request.
    try:
        # Transcribe and translate the media into English
        result = model.transcribe(media_path, task="translate")
        english_transcription = result["text"]

        # Avoid sending an empty request to the TTS service.
        if not english_transcription.strip():
            return None, "Error: no speech was detected in the uploaded media."

        # ==================================
        # Zyphra TTS API CALL using ZyphraClient
        # ==================================
        with ZyphraClient(api_key=api_key) as client:
            # Get audio bytes for the given text; adjust speaking_rate if desired.
            audio_data = client.audio.speech.create(
                text=english_transcription,
                speaking_rate=15,
            )

        # Write the returned audio data to a temporary file. delete=False keeps
        # the file on disk after the handle closes so the caller can read it.
        # NOTE(review): the file is saved with a ".wav" suffix but no output
        # format is requested from the service -- confirm its default is WAV.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            temp_audio.write(audio_data)

        return temp_audio.name, english_transcription
    except Exception as e:
        print("Error during processing:", e)
        return None, f"Error: {str(e)}"
| # ================================== | |
| # 5) BUILD GRADIO INTERFACE | |
| # ================================== | |
# Components are created in the same order as the interface consumes them:
# the single file input first, then the two outputs.
_media_input = gr.File(
    label="Upload Audio or Video",
    file_types=["audio", "video"],
)
_audio_output = gr.Audio(type="filepath", label="Synthesized English Audio")
_subtitle_output = gr.Textbox(label="English Subtitles")

_description = (
    "Upload an audio or video file in any language. The file is transcribed and translated into "
    "English using Whisper, then converted to speech via the Zyphra TTS service using ZyphraClient."
)

# Wire process_media to one upload input and two outputs (audio + subtitles).
interface = gr.Interface(
    fn=process_media,
    inputs=_media_input,
    outputs=[_audio_output, _subtitle_output],
    title="Multilingual Media to English TTS Pipeline (Zyphra)",
    description=_description,
)

# ==================================
# 6) LAUNCH THE APP
# ==================================
interface.launch(debug=True)