Spaces:

drixo
/

Translator

Sleeping

App Files Community

drixo committed on Sep 3, 2025

Commit

027eeb4

verified ·

1 Parent(s): 6a574a7

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -24

app.py CHANGED Viewed

@@ -4,8 +4,7 @@ from transformers import (
     MarianTokenizer,
     pipeline,
     AutoProcessor,
-    SpeechT5ForTextToSpeech,
-    SpeechT5HifiGan
 )
 import torch
 import numpy as np
@@ -31,29 +30,21 @@ language_models = {
 asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
 # --------------------------
-# Text-to-Speech (SpeechT5 + vocoder)
 # --------------------------
-tts_model_name = "microsoft/speecht5_tts"
-processor = AutoProcessor.from_pretrained(tts_model_name)
-tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_model_name)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-# For voice cloning you’d normally load speaker embeddings.
-# Here we just use random embedding so you always get a voice.
-speaker_embeddings = torch.randn(1, 512)
 def text_to_speech(text: str):
-    """Convert translated text into speech waveform"""
-    inputs = processor(text=text, return_tensors="pt")
     with torch.no_grad():
-        speech = tts_model.generate_speech(
-            inputs["input_ids"],
-            speaker_embeddings,
-            vocoder=vocoder
-        )
-    # Convert to numpy float32 for Gradio
-    audio = speech.cpu().numpy().astype(np.float32)
-    return (16000, audio)  # ✅ Correct order: (sr, np.ndarray)
 def translate_audio(audio, lang_pair):
     """Full pipeline: STT -> Translate -> TTS"""
@@ -76,13 +67,13 @@ def translate_audio(audio, lang_pair):
     # Step 4: Convert translation to speech
     sr, audio_array = text_to_speech(translated_text)
-    return translated_text, (sr, audio_array)  # ✅ Correct order
 # --------------------------
 # Gradio UI
 # --------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("## 🎤 Speech Translator with Voice Output")
     with gr.Row():
         with gr.Column():
             audio_input = gr.Audio(
@@ -109,4 +100,5 @@ with gr.Blocks() as demo:
         outputs=[text_output, audio_output]
     )
-demo.launch()

     MarianTokenizer,
     pipeline,
     AutoProcessor,
+    AutoModelForTextToWaveform,
 )
 import torch
 import numpy as np
 asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
 # --------------------------
+# Text-to-Speech (VibeVoice-1.5B)
 # --------------------------
+tts_model_name = "microsoft/VibeVoice-1.5B"
+tts_processor = AutoProcessor.from_pretrained(tts_model_name)
+tts_model = AutoModelForTextToWaveform.from_pretrained(tts_model_name)
 def text_to_speech(text: str):
+    """Convert translated text into speech waveform with VibeVoice"""
+    inputs = tts_processor(text=text, return_tensors="pt")
     with torch.no_grad():
+        audio = tts_model.generate(**inputs)  # VibeVoice generates waveform directly
+    # Convert tensor -> numpy
+    audio_np = audio.cpu().numpy().astype(np.float32).squeeze()
+    return (16000, audio_np)  # ✅ return tuple (sample_rate, waveform)
 def translate_audio(audio, lang_pair):
     """Full pipeline: STT -> Translate -> TTS"""
     # Step 4: Convert translation to speech
     sr, audio_array = text_to_speech(translated_text)
+    return translated_text, (sr, audio_array)
 # --------------------------
 # Gradio UI
 # --------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("## 🎤 Speech Translator with VibeVoice Output")
     with gr.Row():
         with gr.Column():
             audio_input = gr.Audio(
         outputs=[text_output, audio_output]
     )
+demo.launch()