Spaces:

agnixcode
/

voice_chatbot

Sleeping

App Files Community

Dua Rajper committed on Mar 3, 2025

Commit

ae8b1c6

verified ·

1 Parent(s): a47c2e6

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -29

app.py CHANGED Viewed

@@ -1,12 +1,15 @@
 import os
 import streamlit as st
 from groq import Groq
-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline  # Import pipeline
 from espnet2.bin.tts_inference import Text2Speech
 import soundfile as sf
 from pydub import AudioSegment
 import io
 from dotenv import load_dotenv
 # Load environment variables from .env file
 load_dotenv()
@@ -30,7 +33,8 @@ def load_models():
         "automatic-speech-recognition",
         model=stt_model,
         tokenizer=processor.tokenizer,
-        feature_extractor=processor.feature_extractor
     )
     # Text-to-Speech
@@ -40,33 +44,69 @@ def load_models():
 stt_pipe, tts_model = load_models()
 # Streamlit app
 st.title("Voice-Enabled Chatbot")
-# Audio input
-audio_file = st.file_uploader("Upload your voice input", type=['wav'])
-if audio_file is not None:
-    audio_bytes = audio_file.read()
-    audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
-    audio.export("temp.wav", format="wav")
-    speech, _ = sf.read("temp.wav")
-    text = stt_pipe(speech)['text']
-    st.write("Transcribed Text:", text)
-    # Generate response using Groq API
-    try:
-        chat_completion = groq_client.chat.completions.create(
-            messages=[{"role": "user", "content": text}],
-            model="mixtral-8x7b-32768",
-            temperature=0.5,
-            max_tokens=1024
-        )
-        response = chat_completion.choices[0].message.content
-        st.write("Generated Response:", response)
-        # Convert response to speech
-        speech, *_ = tts_model(response)
-        sf.write("response.wav", speech, 22050)
-        st.audio("response.wav")
-    except Exception as e:
-        st.error(f"Error generating response: {e}")

 import os
 import streamlit as st
 from groq import Groq
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
 from espnet2.bin.tts_inference import Text2Speech
 import soundfile as sf
 from pydub import AudioSegment
 import io
 from dotenv import load_dotenv
+from streamlit_webrtc import webrtc_streamer, WebRtcMode, AudioProcessorBase
+import av
+import numpy as np
 # Load environment variables from .env file
 load_dotenv()
         "automatic-speech-recognition",
         model=stt_model,
         tokenizer=processor.tokenizer,
+        feature_extractor=processor.feature_extractor,
+        return_timestamps=True  # Enable timestamps for long-form audio
     )
     # Text-to-Speech
 stt_pipe, tts_model = load_models()
+# Audio recorder
+class AudioRecorder(AudioProcessorBase):
+    def __init__(self):
+        self.audio_frames = []
+    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
+        self.audio_frames.append(frame.to_ndarray())
+        return frame
 # Streamlit app
 st.title("Voice-Enabled Chatbot")
+# Audio recorder
+st.write("Record your voice:")
+webrtc_ctx = webrtc_streamer(
+    key="audio-recorder",
+    mode=WebRtcMode.SENDONLY,
+    audio_processor_factory=AudioRecorder,
+    media_stream_constraints={"audio": True, "video": False},
+)
+if webrtc_ctx.audio_processor:
+    st.write("Recording... Press 'Stop' to finish recording.")
+    # Save recorded audio to a WAV file
+    if st.button("Stop and Process Recording"):
+        audio_frames = webrtc_ctx.audio_processor.audio_frames
+        if audio_frames:
+            # Combine audio frames into a single array
+            audio_data = np.concatenate(audio_frames)
+            # Save as WAV file
+            sf.write("recorded_audio.wav", audio_data, samplerate=16000)
+            st.success("Recording saved as recorded_audio.wav")
+            # Process the recorded audio
+            speech, _ = sf.read("recorded_audio.wav")
+            output = stt_pipe(speech)  # Transcribe with timestamps
+            # Display the full transcribed text
+            st.write("Transcribed Text:", output['text'])
+            # Display the text with timestamps (optional)
+            if 'chunks' in output:
+                st.write("Transcribed Text with Timestamps:")
+                for chunk in output['chunks']:
+                    st.write(f"{chunk['timestamp'][0]:.2f} - {chunk['timestamp'][1]:.2f}: {chunk['text']}")
+            # Generate response using Groq API
+            try:
+                chat_completion = groq_client.chat.completions.create(
+                    messages=[{"role": "user", "content": output['text']}],
+                    model="mixtral-8x7b-32768",
+                    temperature=0.5,
+                    max_tokens=1024
+                )
+                response = chat_completion.choices[0].message.content
+                st.write("Generated Response:", response)
+                # Convert response to speech
+                speech, *_ = tts_model(response, spembs=tts_model.spembs[0])  # Use the first speaker embedding
+                sf.write("response.wav", speech, 22050)
+                st.audio("response.wav")
+            except Exception as e:
+                st.error(f"Error generating response: {e}")
+        else:
+            st.error("No audio recorded. Please try again.")