drixo committed
Commit 027eeb4 · verified · 1 Parent(s): 6a574a7

Update app.py

Files changed (1): app.py (+16 -24)
app.py CHANGED
@@ -4,8 +4,7 @@ from transformers import (
     MarianTokenizer,
     pipeline,
     AutoProcessor,
-    SpeechT5ForTextToSpeech,
-    SpeechT5HifiGan
+    AutoModelForTextToWaveform,
 )
 import torch
 import numpy as np
@@ -31,29 +30,21 @@ language_models = {
 asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
 
 # --------------------------
-# Text-to-Speech (SpeechT5 + vocoder)
+# Text-to-Speech (VibeVoice-1.5B)
 # --------------------------
-tts_model_name = "microsoft/speecht5_tts"
-processor = AutoProcessor.from_pretrained(tts_model_name)
-tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_model_name)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-
-# For voice cloning you’d normally load speaker embeddings.
-# Here we just use random embedding so you always get a voice.
-speaker_embeddings = torch.randn(1, 512)
+tts_model_name = "microsoft/VibeVoice-1.5B"
+tts_processor = AutoProcessor.from_pretrained(tts_model_name)
+tts_model = AutoModelForTextToWaveform.from_pretrained(tts_model_name)
 
 def text_to_speech(text: str):
-    """Convert translated text into speech waveform"""
-    inputs = processor(text=text, return_tensors="pt")
+    """Convert translated text into speech waveform with VibeVoice"""
+    inputs = tts_processor(text=text, return_tensors="pt")
     with torch.no_grad():
-        speech = tts_model.generate_speech(
-            inputs["input_ids"],
-            speaker_embeddings,
-            vocoder=vocoder
-        )
-    # Convert to numpy float32 for Gradio
-    audio = speech.cpu().numpy().astype(np.float32)
-    return (16000, audio)  # ✅ Correct order: (sr, np.ndarray)
+        audio = tts_model.generate(**inputs)  # VibeVoice generates waveform directly
+
+    # Convert tensor -> numpy
+    audio_np = audio.cpu().numpy().astype(np.float32).squeeze()
+    return (16000, audio_np)  # ✅ return tuple (sample_rate, waveform)
 
 def translate_audio(audio, lang_pair):
     """Full pipeline: STT -> Translate -> TTS"""
@@ -76,13 +67,13 @@ def translate_audio(audio, lang_pair):
     # Step 4: Convert translation to speech
     sr, audio_array = text_to_speech(translated_text)
 
-    return translated_text, (sr, audio_array)  # ✅ Correct order
+    return translated_text, (sr, audio_array)
 
 # --------------------------
 # Gradio UI
 # --------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("## 🎤 Speech Translator with Voice Output")
+    gr.Markdown("## 🎤 Speech Translator with VibeVoice Output")
     with gr.Row():
         with gr.Column():
             audio_input = gr.Audio(
@@ -109,4 +100,5 @@ with gr.Blocks() as demo:
         outputs=[text_output, audio_output]
     )
 
-demo.launch()
+demo.launch()
+
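For context on the return-value changes in this diff: Gradio's `gr.Audio` output component accepts a `(sample_rate, numpy_array)` tuple, which is the contract the rewritten `text_to_speech()` satisfies. Below is a minimal sketch (not part of the commit) that verifies that contract with a dummy waveform, so it runs without downloading any model; the helper name `check_audio_tuple` is hypothetical, and the 16000 Hz rate is the value hard-coded in the diff above.

```python
# Illustrative check, not part of the commit: validates that a TTS result
# matches the (sample_rate, np.ndarray) tuple Gradio's gr.Audio expects.
import numpy as np

def check_audio_tuple(result):
    """Validate a (sample_rate, waveform) pair against Gradio's expectations."""
    sr, waveform = result
    assert isinstance(sr, int) and sr > 0, "sample rate must be a positive int"
    assert isinstance(waveform, np.ndarray), "waveform must be a NumPy array"
    assert waveform.dtype == np.float32, "float32 samples, as cast in the diff"
    assert waveform.ndim == 1, "squeeze() should leave a mono 1-D array"

# Dummy one-second silent clip at the hard-coded 16 kHz rate from the diff:
check_audio_tuple((16000, np.zeros(16000, dtype=np.float32)))
```

One caveat worth noting: loading `microsoft/VibeVoice-1.5B` through `AutoModelForTextToWaveform` assumes that checkpoint ships a transformers-compatible config registered with that auto class; if it does not, `from_pretrained` will raise at startup rather than at synthesis time.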