Update app.py

app.py CHANGED

@@ -1,7 +1,7 @@
 import torch
 import torchaudio
 import gradio as gr
-import pyaudio
+import soundfile as sf
 import wave
 import numpy as np
 from transformers import WhisperForCTC, WhisperProcessor, AutoModelForSeq2SeqLM, AutoTokenizer
@@ -29,16 +29,16 @@ def transcribe(audio):
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = processor_asr.batch_decode(predicted_ids)
     return transcription[0]
-
-@spaces.GPU(
+
+@spaces.GPU()
 # Text-to-text function
 def generate_response(text):
     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
     outputs = text_model.generate(**inputs)
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return response
-
-@spaces.GPU(
+
+@spaces.GPU()
 # TTS function
 def synthesize_speech(text):
     inputs = tts_processor(text, return_tensors="pt")
@@ -46,58 +46,49 @@ def synthesize_speech(text):
     mel_outputs, mel_outputs_postnet, _, alignments = tts_model.inference(inputs.input_ids)
     audio = tts_model.infer(mel_outputs_postnet)
     return audio
-
-@spaces.GPU(
+
+@spaces.GPU()
 # Real-time processing function
 def real_time_pipeline():
-    # [two lines of PyAudio setup lost in extraction]
+    # Adjust this part to handle live recording using soundfile and play back using simpleaudio
+    import simpleaudio as sa
+    import tempfile
+    import time
 
     wake_word = "hello mate"
     wake_word_detected = False
 
     print("Listening for wake word...")
 
+    with tempfile.NamedTemporaryFile(delete=False) as tmp_wav_file:
+        tmp_wav_path = tmp_wav_file.name
+
     try:
         while True:
-            # [six lines of PyAudio capture code lost in extraction]
-            # Save the audio to a temporary file for ASR
-            wf = wave.open("temp.wav", 'wb')
-            wf.setnchannels(1)
-            wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
-            wf.setframerate(16000)
-            wf.writeframes(b''.join(frames))
-            wf.close()
+            # Capture audio here (this is a simplified example, you need actual audio capture logic)
+            time.sleep(2)  # Simulate 2 seconds of audio capture
+
+            # Save the captured audio to the temp file for ASR
+            data, sample_rate = sf.read(tmp_wav_path)
+            sf.write(tmp_wav_path, data, sample_rate)
 
             # Step 1: Transcribe audio to text
-            transcription = transcribe("temp.wav")
+            transcription = transcribe(tmp_wav_path).lower()
 
             if wake_word in transcription:
                 wake_word_detected = True
                 print("Wake word detected. Processing audio...")
 
                 while wake_word_detected:
-                    # [six lines of PyAudio capture code lost in extraction]
-                    # Save the audio to a temporary file for ASR
-                    wf = wave.open("temp.wav", 'wb')
-                    wf.setnchannels(1)
-                    wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
-                    wf.setframerate(16000)
-                    wf.writeframes(b''.join(frames))
-                    wf.close()
+                    # Capture audio here (this is a simplified example, you need actual audio capture logic)
+                    time.sleep(2)  # Simulate 2 seconds of audio capture
+
+                    # Save the captured audio to the temp file for ASR
+                    data, sample_rate = sf.read(tmp_wav_path)
+                    sf.write(tmp_wav_path, data, sample_rate)
 
                     # Step 1: Transcribe audio to text
-                    transcription = transcribe("temp.wav")
+                    transcription = transcribe(tmp_wav_path)
 
                     # Step 2: Generate response using text-to-text model
                     response = generate_response(transcription)
@@ -109,26 +100,12 @@ def real_time_pipeline():
                     output_path = "output.wav"
                     torchaudio.save(output_path, synthesized_audio.squeeze(1), 22050)
 
-                    # Play the synthesized audio
-                    # [three lines of PyAudio playback setup lost in extraction]
-                    rate=wf.getframerate(),
-                    output=True)
-
-                    data = wf.readframes(1024)
-                    while data:
-                        stream_out.write(data)
-                        data = wf.readframes(1024)
-                    stream_out.stop_stream()
-                    stream_out.close()
-                    wf.close()
+                    # Play the synthesized audio using simpleaudio
+                    wave_obj = sa.WaveObject.from_wave_file(output_path)
+                    play_obj = wave_obj.play()
+                    play_obj.wait_done()
     except KeyboardInterrupt:
         print("Stopping...")
-    finally:
-        stream.stop_stream()
-        stream.close()
-        p.terminate()
 
 # Gradio interface
 gr_interface = gr.Interface(
@@ -140,4 +117,5 @@ gr_interface = gr.Interface(
     description="ASR + Text-to-Text Model + TTS with Human-like Voice and Emotions"
 )
 
-# [final line lost in extraction]
+
+iface.launch(inline=False)
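
A few review notes on the new version.

The @spaces.GPU() decorators (fixed here from the unclosed @spaces.GPU() come from the Hugging Face ZeroGPU spaces package, but with import pyaudio replaced by import soundfile as sf, no import spaces is visible in the import block. Unless it is imported elsewhere in the file, app.py would also need:

    import spaces  # provides the @spaces.GPU() decorator on ZeroGPU Spaces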
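
The unchanged import line still pulls in WhisperForCTC, which is not a class transformers provides: Whisper is an encoder-decoder model, so transcription goes through generate() rather than an argmax over CTC logits. A minimal sketch of the usual transformers Whisper path; the checkpoint name is an assumption, since the diff does not show which one the Space loads:

    import soundfile as sf
    from transformers import WhisperForConditionalGeneration, WhisperProcessor

    # Assumed checkpoint for illustration; the diff does not show which one is used.
    processor_asr = WhisperProcessor.from_pretrained("openai/whisper-small")
    asr_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

    def transcribe(audio_path):
        speech, sample_rate = sf.read(audio_path)
        # Whisper expects 16 kHz mono audio; resample first if the file differs.
        inputs = processor_asr(speech, sampling_rate=sample_rate, return_tensors="pt")
        predicted_ids = asr_model.generate(inputs.input_features)
        return processor_asr.batch_decode(predicted_ids, skip_special_tokens=True)[0]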
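
synthesize_speech feeds the model's own postnet mels back into tts_model.infer(...) to produce a waveform; in a Tacotron2-style stack that last step normally belongs to a separate vocoder. One self-contained alternative is the SpeechT5 pipeline in transformers, sketched below; the checkpoints and the zeroed speaker embedding are illustrative assumptions, not what this Space necessarily loads:

    import torch
    from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

    # Assumed checkpoints for illustration.
    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    def synthesize_speech(text):
        inputs = tts_processor(text=text, return_tensors="pt")
        # A real app would load an x-vector speaker embedding; zeros are only a placeholder.
        speaker_embeddings = torch.zeros((1, 512))
        # generate_speech runs the acoustic model and the HiFi-GAN vocoder in one call.
        return tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)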
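
Both capture blocks only simulate recording with time.sleep(2) and then read back a temp file that nothing has written yet, so sf.read(tmp_wav_path) will fail on the first pass. On hosted Spaces hardware there is no server-side microphone, so browser-side capture through gr.Audio is the more realistic route; for a local run, one way to make the placeholder real is the sounddevice package (an assumption, it is not part of this diff):

    import sounddevice as sd
    import soundfile as sf

    SAMPLE_RATE = 16000  # same rate the removed PyAudio code recorded at

    def capture_chunk(path, seconds=2):
        # Record a fixed-length mono chunk from the default input device.
        recording = sd.rec(int(seconds * SAMPLE_RATE), samplerate=SAMPLE_RATE, channels=1)
        sd.wait()  # block until the recording finishes
        sf.write(path, recording, SAMPLE_RATE)
        return path

With a helper like this, each time.sleep(2) placeholder could become capture_chunk(tmp_wav_path).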
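
Finally, the new closing line calls iface.launch(inline=False), but the interface object defined above is bound to gr_interface, so the script would exit with a NameError. The intended ending is presumably:

    gr_interface.launch(inline=False)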