Update app.py
Browse files
app.py
CHANGED
|
@@ -12,25 +12,25 @@ transcriber = pipeline("automatic-speech-recognition", model="facebook/s2t-small
|
|
| 12 |
generator = pipeline("text-generation", model="gpt2")
|
| 13 |
|
| 14 |
# Initialize TTS tokenizer and model
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
# Initialize ASR pipeline
|
| 19 |
-
print("TTS Tokenizer:", tokenizer_tts) # Print the tokenizer for the TTS model
|
| 20 |
|
| 21 |
def transcribe_and_generate_audio(audio):
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
# Transcribe audio
|
| 24 |
-
asr_output = transcriber(
|
| 25 |
|
| 26 |
# Generate text based on ASR output
|
| 27 |
-
generated_text = generator(
|
| 28 |
|
| 29 |
-
# Generate audio from text
|
| 30 |
-
inputs =
|
| 31 |
set_seed(555)
|
| 32 |
with torch.no_grad():
|
| 33 |
-
outputs =
|
| 34 |
waveform = outputs.waveform[0]
|
| 35 |
waveform_path = "output.wav"
|
| 36 |
sf.write(waveform_path, waveform.numpy(), 16000, format='wav')
|
|
@@ -47,4 +47,4 @@ audio_input = gr.Interface(
|
|
| 47 |
)
|
| 48 |
|
| 49 |
# Launch the interface
|
| 50 |
-
audio_input.launch()
|
|
|
|
# Text-generation model used to continue the transcribed speech.
generator = pipeline("text-generation", model="gpt2")

# VITS text-to-speech components (English MMS checkpoint).
# NOTE(review): both names are read at module level by
# transcribe_and_generate_audio, so they must stay as-is.
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
model = VitsModel.from_pretrained("facebook/mms-tts-eng")
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
def transcribe_and_generate_audio(audio):
|
| 19 |
+
sr, y = audio
|
| 20 |
+
y = y.astype(np.float32)
|
| 21 |
+
y /= np.max(np.abs(y))
|
| 22 |
|
| 23 |
# Transcribe audio
|
| 24 |
+
asr_output = transcriber({"sampling_rate": sr, "raw": y})["text"]
|
| 25 |
|
| 26 |
# Generate text based on ASR output
|
| 27 |
+
generated_text = generator(asr_output)[0]['generated_text']
|
| 28 |
|
| 29 |
+
# Generate audio from text
|
| 30 |
+
inputs = tokenizer(text=generated_text, return_tensors="pt")
|
| 31 |
set_seed(555)
|
| 32 |
with torch.no_grad():
|
| 33 |
+
outputs = model(**inputs)
|
| 34 |
waveform = outputs.waveform[0]
|
| 35 |
waveform_path = "output.wav"
|
| 36 |
sf.write(waveform_path, waveform.numpy(), 16000, format='wav')
|
|
|
|
| 47 |
)
|
| 48 |
|
| 49 |
# Launch the interface
|
| 50 |
+
audio_input.launch()
|