Spaces:

agnixcode
/

voice_chatbot

Sleeping

App Files Community

Dua Rajper committed on Mar 4, 2025

Commit

a238dc9

verified ·

1 Parent(s): 45f3399

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -59

app.py CHANGED Viewed

@@ -20,9 +20,8 @@ if not GROQ_API_KEY:
 groq_client = Groq(api_key=GROQ_API_KEY)
 # Load models
-@st.cache_resource  # Use st.cache_resource for caching models
 def load_models():
-    # Speech-to-Text
     processor = AutoProcessor.from_pretrained("openai/whisper-small")
     stt_model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small")
     stt_pipe = pipeline(
@@ -30,9 +29,8 @@ def load_models():
         model=stt_model,
         tokenizer=processor.tokenizer,
         feature_extractor=processor.feature_extractor,
-        return_timestamps=True  # Enable timestamps for long-form audio
     )
-    # Text-to-Speech
     tts_model = Text2Speech.from_pretrained("espnet/espnet_tts_vctk_espnet_spk_voxceleb12_rawnet")
     return stt_pipe, tts_model
@@ -50,59 +48,85 @@ class AudioRecorder(AudioProcessorBase):
 # Streamlit app
 st.title("Voice-Enabled Chatbot")
-# Audio recorder
-st.write("Record your voice:")
-webrtc_ctx = webrtc_streamer(
-    key="audio-recorder",
-    mode=WebRtcMode.SENDONLY,
-    audio_processor_factory=AudioRecorder,
-    media_stream_constraints={"audio": True, "video": False},
-)
-if webrtc_ctx.audio_processor:
-    st.write("Recording... Press 'Stop' to finish recording.")
-    # Save recorded audio to a WAV file
-    if st.button("Stop and Process Recording"):
-        audio_frames = webrtc_ctx.audio_processor.audio_frames
-        if audio_frames:
-            # Combine audio frames into a single array
-            audio_data = np.concatenate(audio_frames)
-            # Save as WAV file
-            sf.write("recorded_audio.wav", audio_data, samplerate=16000)
-            st.success("Recording saved as recorded_audio.wav")
-            # Process the recorded audio
-            speech, _ = sf.read("recorded_audio.wav")
-            output = stt_pipe(speech)  # Transcribe with timestamps
-            # Debug: Print the transcribed text
-            st.write("Transcribed Text:", output['text'])
-            # Display the text with timestamps (optional)
-            if 'chunks' in output:
-                st.write("Transcribed Text with Timestamps:")
-                for chunk in output['chunks']:
-                    st.write(f"{chunk['timestamp'][0]:.2f} - {chunk['timestamp'][1]:.2f}: {chunk['text']}")
-            # Generate response using Groq API
-            try:
-                # Debug: Print the input text
-                st.write("Input Text:", output['text'])
-                chat_completion = groq_client.chat.completions.create(
-                    messages=[{"role": "user", "content": output['text']}],
-                    model="mixtral-8x7b-32768",
-                    temperature=0.5,
-                    max_tokens=1024,
-                )
-                # Debug: Print the API response
-                st.write("API Response:", chat_completion)
-                # Extract the generated response
-                response = chat_completion.choices[0].message.content
-                st.write("Generated Response:", response)
-                # Convert response to speech
-                speech, *_ = tts_model(response, spembs=tts_model.spembs[0])  # Use the first speaker embedding
-                # Debug: Print the TTS output
-                st.write("TTS Output:", speech)
-                # Save and play the speech
-                sf.write("response.wav", speech, 22050)
-                st.audio("response.wav")
-            except Exception as e:
-                st.error(f"Error generating response: {e}")
-        else:
-            st.error("No audio recorded. Please try again.")

 groq_client = Groq(api_key=GROQ_API_KEY)
 # Load models
+@st.cache_resource
 def load_models():
     processor = AutoProcessor.from_pretrained("openai/whisper-small")
     stt_model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small")
     stt_pipe = pipeline(
         model=stt_model,
         tokenizer=processor.tokenizer,
         feature_extractor=processor.feature_extractor,
+        return_timestamps=True
     )
     tts_model = Text2Speech.from_pretrained("espnet/espnet_tts_vctk_espnet_spk_voxceleb12_rawnet")
     return stt_pipe, tts_model
 # Streamlit app
 st.title("Voice-Enabled Chatbot")
+# Audio upload
+uploaded_file = st.file_uploader("Upload a WAV file", type=["wav"])
+if uploaded_file is not None:
+    # Save uploaded file
+    with open("uploaded_audio.wav", "wb") as f:
+        f.write(uploaded_file.getbuffer())
+    st.success("File uploaded successfully!")
+    # Process the uploaded audio
+    speech, _ = sf.read("uploaded_audio.wav")
+    output = stt_pipe(speech)
+    st.write("Transcribed Text:", output['text'])
+    if 'chunks' in output:
+        st.write("Transcribed Text with Timestamps:")
+        for chunk in output['chunks']:
+            st.write(f"{chunk['timestamp'][0]:.2f} - {chunk['timestamp'][1]:.2f}: {chunk['text']}")
+    try:
+        st.write("Input Text:", output['text'])
+        chat_completion = groq_client.chat.completions.create(
+            messages=[{"role": "user", "content": output['text']}],
+            model="mixtral-8x7b-32768",
+            temperature=0.5,
+            max_tokens=2048,  # Increased max_tokens
+        )
+        st.write("API Response:", chat_completion)
+        response = chat_completion.choices[0].message.content
+        st.write("Generated Response:", response)
+        speech, *_ = tts_model(response, spembs=tts_model.spembs[0])
+        st.write("TTS Output:", speech)
+        sf.write("response.wav", speech, 22050)
+        st.audio("response.wav")
+    except Exception as e:
+        st.error(f"Error generating response: {e}")
+else:
+    # Audio recorder
+    st.write("Record your voice:")
+    webrtc_ctx = webrtc_streamer(
+        key="audio-recorder",
+        mode=WebRtcMode.SENDONLY,
+        audio_processor_factory=AudioRecorder,
+        media_stream_constraints={"audio": True, "video": False},
+    )
+    if webrtc_ctx.audio_processor:
+        st.write("Recording... Press 'Stop' to finish recording.")
+        if st.button("Stop and Process Recording"):
+            audio_frames = webrtc_ctx.audio_processor.audio_frames
+            if audio_frames:
+                audio_data = np.concatenate(audio_frames)
+                sf.write("recorded_audio.wav", audio_data, samplerate=16000)
+                st.success("Recording saved as recorded_audio.wav")
+                speech, _ = sf.read("recorded_audio.wav")
+                output = stt_pipe(speech)
+                st.write("Transcribed Text:", output['text'])
+                if 'chunks' in output:
+                    st.write("Transcribed Text with Timestamps:")
+                    for chunk in output['chunks']:
+                        st.write(f"{chunk['timestamp'][0]:.2f} - {chunk['timestamp'][1]:.2f}: {chunk['text']}")
+                try:
+                    st.write("Input Text:", output['text'])
+                    chat_completion = groq_client.chat.completions.create(
+                        messages=[{"role": "user", "content": output['text']}],
+                        model="mixtral-8x7b-32768",
+                        temperature=0.5,
+                        max_tokens=2048,  # Increased max_tokens
+                    )
+                    st.write("API Response:", chat_completion)
+                    response = chat_completion.choices[0].message.content
+                    st.write("Generated Response:", response)
+                    speech, *_ = tts_model(response, spembs=tts_model.spembs[0])
+                    st.write("TTS Output:", speech)
+                    sf.write("response.wav", speech, 22050)
+                    st.audio("response.wav")
+                except Exception as e:
+                    st.error(f"Error generating response: {e}")
+            else:
+                st.error("No audio recorded. Please try again.")