Update app.py

app.py CHANGED

@@ -1,7 +1,7 @@
 import torch
 import torchaudio
 import gradio as gr
-import pyaudio
+import soundfile as sf
 import wave
 import numpy as np
 from transformers import WhisperForCTC, WhisperProcessor, AutoModelForSeq2SeqLM, AutoTokenizer
@@ -29,16 +29,16 @@ def transcribe(audio):
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = processor_asr.batch_decode(predicted_ids)
     return transcription[0]
-
-@spaces.GPU(
+
+@spaces.GPU()
 # Text-to-text function
 def generate_response(text):
     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
     outputs = text_model.generate(**inputs)
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return response
-
-@spaces.GPU(
+
+@spaces.GPU()
 # TTS function
 def synthesize_speech(text):
     inputs = tts_processor(text, return_tensors="pt")
@@ -46,58 +46,49 @@ def synthesize_speech(text):
     mel_outputs, mel_outputs_postnet, _, alignments = tts_model.inference(inputs.input_ids)
     audio = tts_model.infer(mel_outputs_postnet)
     return audio
-
-@spaces.GPU(
+
+@spaces.GPU()
 # Real-time processing function
 def real_time_pipeline():
-    # [two lines of PyAudio setup lost in extraction]
+    # Adjust this part to handle live recording using soundfile and play back using simpleaudio
+    import simpleaudio as sa
+    import tempfile
+    import time
 
     wake_word = "hello mate"
     wake_word_detected = False
 
     print("Listening for wake word...")
 
+    with tempfile.NamedTemporaryFile(delete=False) as tmp_wav_file:
+        tmp_wav_path = tmp_wav_file.name
+
     try:
         while True:
-            # [six lines of PyAudio capture code lost in extraction]
-            # Save the audio to a temporary file for ASR
-            wf = wave.open("temp.wav", 'wb')
-            wf.setnchannels(1)
-            wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
-            wf.setframerate(16000)
-            wf.writeframes(b''.join(frames))
-            wf.close()
+            # Capture audio here (this is a simplified example, you need actual audio capture logic)
+            time.sleep(2)  # Simulate 2 seconds of audio capture
+
+            # Save the captured audio to the temp file for ASR
+            data, sample_rate = sf.read(tmp_wav_path)
+            sf.write(tmp_wav_path, data, sample_rate)
 
             # Step 1: Transcribe audio to text
-            transcription = transcribe("temp.wav")
+            transcription = transcribe(tmp_wav_path).lower()
 
             if wake_word in transcription:
                 wake_word_detected = True
                 print("Wake word detected. Processing audio...")
 
                 while wake_word_detected:
-                    # [six lines of PyAudio capture code lost in extraction]
-                    # Save the audio to a temporary file for ASR
-                    wf = wave.open("temp.wav", 'wb')
-                    wf.setnchannels(1)
-                    wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
-                    wf.setframerate(16000)
-                    wf.writeframes(b''.join(frames))
-                    wf.close()
+                    # Capture audio here (this is a simplified example, you need actual audio capture logic)
+                    time.sleep(2)  # Simulate 2 seconds of audio capture
+
+                    # Save the captured audio to the temp file for ASR
+                    data, sample_rate = sf.read(tmp_wav_path)
+                    sf.write(tmp_wav_path, data, sample_rate)
 
                     # Step 1: Transcribe audio to text
-                    transcription = transcribe("temp.wav")
+                    transcription = transcribe(tmp_wav_path)
 
                     # Step 2: Generate response using text-to-text model
                     response = generate_response(transcription)
@@ -109,26 +100,12 @@ def real_time_pipeline():
                     output_path = "output.wav"
                     torchaudio.save(output_path, synthesized_audio.squeeze(1), 22050)
 
-                    # Play the synthesized audio
-                    # [three lines of PyAudio playback setup lost in extraction]
-                    rate=wf.getframerate(),
-                    output=True)
-
-                    data = wf.readframes(1024)
-                    while data:
-                        stream_out.write(data)
-                        data = wf.readframes(1024)
-                    stream_out.stop_stream()
-                    stream_out.close()
-                    wf.close()
+                    # Play the synthesized audio using simpleaudio
+                    wave_obj = sa.WaveObject.from_wave_file(output_path)
+                    play_obj = wave_obj.play()
+                    play_obj.wait_done()
     except KeyboardInterrupt:
         print("Stopping...")
-    finally:
-        stream.stop_stream()
-        stream.close()
-        p.terminate()
 
 # Gradio interface
 gr_interface = gr.Interface(
@@ -140,4 +117,5 @@ gr_interface = gr.Interface(
     description="ASR + Text-to-Text Model + TTS with Human-like Voice and Emotions"
 )
 
-# [final line lost in extraction]
+
+iface.launch(inline=False)
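
A few review notes on the new version.

The @spaces.GPU() decorators (fixed here from the unclosed @spaces.GPU() come from the Hugging Face ZeroGPU spaces package, but with import pyaudio replaced by import soundfile as sf, no import spaces is visible in the import block. Unless it is imported elsewhere in the file, app.py would also need:

    import spaces  # provides the @spaces.GPU() decorator on ZeroGPU Spaces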
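
The unchanged import line still pulls in WhisperForCTC, which is not a class transformers provides: Whisper is an encoder-decoder model, so transcription goes through generate() rather than an argmax over CTC logits. A minimal sketch of the usual transformers Whisper path; the checkpoint name is an assumption, since the diff does not show which one the Space loads:

    import soundfile as sf
    from transformers import WhisperForConditionalGeneration, WhisperProcessor

    # Assumed checkpoint for illustration; the diff does not show which one is used.
    processor_asr = WhisperProcessor.from_pretrained("openai/whisper-small")
    asr_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

    def transcribe(audio_path):
        speech, sample_rate = sf.read(audio_path)
        # Whisper expects 16 kHz mono audio; resample first if the file differs.
        inputs = processor_asr(speech, sampling_rate=sample_rate, return_tensors="pt")
        predicted_ids = asr_model.generate(inputs.input_features)
        return processor_asr.batch_decode(predicted_ids, skip_special_tokens=True)[0]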
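
synthesize_speech feeds the model's own postnet mels back into tts_model.infer(...) to produce a waveform; in a Tacotron2-style stack that last step normally belongs to a separate vocoder. One self-contained alternative is the SpeechT5 pipeline in transformers, sketched below; the checkpoints and the zeroed speaker embedding are illustrative assumptions, not what this Space necessarily loads:

    import torch
    from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

    # Assumed checkpoints for illustration.
    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    def synthesize_speech(text):
        inputs = tts_processor(text=text, return_tensors="pt")
        # A real app would load an x-vector speaker embedding; zeros are only a placeholder.
        speaker_embeddings = torch.zeros((1, 512))
        # generate_speech runs the acoustic model and the HiFi-GAN vocoder in one call.
        return tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)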
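
Both capture blocks only simulate recording with time.sleep(2) and then read back a temp file that nothing has written yet, so sf.read(tmp_wav_path) will fail on the first pass. On hosted Spaces hardware there is no server-side microphone, so browser-side capture through gr.Audio is the more realistic route; for a local run, one way to make the placeholder real is the sounddevice package (an assumption, it is not part of this diff):

    import sounddevice as sd
    import soundfile as sf

    SAMPLE_RATE = 16000  # same rate the removed PyAudio code recorded at

    def capture_chunk(path, seconds=2):
        # Record a fixed-length mono chunk from the default input device.
        recording = sd.rec(int(seconds * SAMPLE_RATE), samplerate=SAMPLE_RATE, channels=1)
        sd.wait()  # block until the recording finishes
        sf.write(path, recording, SAMPLE_RATE)
        return path

With a helper like this, each time.sleep(2) placeholder could become capture_chunk(tmp_wav_path).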
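
Finally, the new closing line calls iface.launch(inline=False), but the interface object defined above is bound to gr_interface, so the script would exit with a NameError. The intended ending is presumably:

    gr_interface.launch(inline=False)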