Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,8 +4,7 @@ from transformers import (
|
|
| 4 |
MarianTokenizer,
|
| 5 |
pipeline,
|
| 6 |
AutoProcessor,
|
| 7 |
-
|
| 8 |
-
SpeechT5HifiGan
|
| 9 |
)
|
| 10 |
import torch
|
| 11 |
import numpy as np
|
|
@@ -31,29 +30,21 @@ language_models = {
|
|
| 31 |
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
|
| 32 |
|
| 33 |
# --------------------------
|
| 34 |
-
# Text-to-Speech (
|
| 35 |
# --------------------------
|
| 36 |
-
tts_model_name = "microsoft/
|
| 37 |
-
|
| 38 |
-
tts_model =
|
| 39 |
-
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
| 40 |
-
|
| 41 |
-
# For voice cloning you’d normally load speaker embeddings.
|
| 42 |
-
# Here we just use random embedding so you always get a voice.
|
| 43 |
-
speaker_embeddings = torch.randn(1, 512)
|
| 44 |
|
| 45 |
def text_to_speech(text: str):
|
| 46 |
-
"""Convert translated text into speech waveform"""
|
| 47 |
-
inputs =
|
| 48 |
with torch.no_grad():
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
# Convert to numpy float32 for Gradio
|
| 55 |
-
audio = speech.cpu().numpy().astype(np.float32)
|
| 56 |
-
return (16000, audio) # ✅ Correct order: (sr, np.ndarray)
|
| 57 |
|
| 58 |
def translate_audio(audio, lang_pair):
|
| 59 |
"""Full pipeline: STT -> Translate -> TTS"""
|
|
@@ -76,13 +67,13 @@ def translate_audio(audio, lang_pair):
|
|
| 76 |
# Step 4: Convert translation to speech
|
| 77 |
sr, audio_array = text_to_speech(translated_text)
|
| 78 |
|
| 79 |
-
return translated_text, (sr, audio_array)
|
| 80 |
|
| 81 |
# --------------------------
|
| 82 |
# Gradio UI
|
| 83 |
# --------------------------
|
| 84 |
with gr.Blocks() as demo:
|
| 85 |
-
gr.Markdown("## 🎤 Speech Translator with
|
| 86 |
with gr.Row():
|
| 87 |
with gr.Column():
|
| 88 |
audio_input = gr.Audio(
|
|
@@ -109,4 +100,5 @@ with gr.Blocks() as demo:
|
|
| 109 |
outputs=[text_output, audio_output]
|
| 110 |
)
|
| 111 |
|
| 112 |
-
demo.launch()
|
|
|
|
|
|
| 4 |
MarianTokenizer,
|
| 5 |
pipeline,
|
| 6 |
AutoProcessor,
|
| 7 |
+
AutoModelForTextToWaveform,
|
|
|
|
| 8 |
)
|
| 9 |
import torch
|
| 10 |
import numpy as np
|
|
|
|
| 30 |
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
|
| 31 |
|
| 32 |
# --------------------------
|
| 33 |
+
# Text-to-Speech (VibeVoice-1.5B)
|
| 34 |
# --------------------------
|
| 35 |
+
tts_model_name = "microsoft/VibeVoice-1.5B"
|
| 36 |
+
tts_processor = AutoProcessor.from_pretrained(tts_model_name)
|
| 37 |
+
tts_model = AutoModelForTextToWaveform.from_pretrained(tts_model_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
def text_to_speech(text: str):
|
| 40 |
+
"""Convert translated text into speech waveform with VibeVoice"""
|
| 41 |
+
inputs = tts_processor(text=text, return_tensors="pt")
|
| 42 |
with torch.no_grad():
|
| 43 |
+
audio = tts_model.generate(**inputs) # VibeVoice generates waveform directly
|
| 44 |
+
|
| 45 |
+
# Convert tensor -> numpy
|
| 46 |
+
audio_np = audio.cpu().numpy().astype(np.float32).squeeze()
|
| 47 |
+
return (16000, audio_np) # ✅ return tuple (sample_rate, waveform)
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
def translate_audio(audio, lang_pair):
|
| 50 |
"""Full pipeline: STT -> Translate -> TTS"""
|
|
|
|
| 67 |
# Step 4: Convert translation to speech
|
| 68 |
sr, audio_array = text_to_speech(translated_text)
|
| 69 |
|
| 70 |
+
return translated_text, (sr, audio_array)
|
| 71 |
|
| 72 |
# --------------------------
|
| 73 |
# Gradio UI
|
| 74 |
# --------------------------
|
| 75 |
with gr.Blocks() as demo:
|
| 76 |
+
gr.Markdown("## 🎤 Speech Translator with VibeVoice Output")
|
| 77 |
with gr.Row():
|
| 78 |
with gr.Column():
|
| 79 |
audio_input = gr.Audio(
|
|
|
|
| 100 |
outputs=[text_output, audio_output]
|
| 101 |
)
|
| 102 |
|
| 103 |
+
demo.launch()
|
| 104 |
+
|