Dua Rajper committed on
Commit
ae8b1c6
·
verified ·
1 Parent(s): a47c2e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -29
app.py CHANGED
@@ -1,12 +1,15 @@
1
  import os
2
  import streamlit as st
3
  from groq import Groq
4
- from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline # Import pipeline
5
  from espnet2.bin.tts_inference import Text2Speech
6
  import soundfile as sf
7
  from pydub import AudioSegment
8
  import io
9
  from dotenv import load_dotenv
 
 
 
10
 
11
  # Load environment variables from .env file
12
  load_dotenv()
@@ -30,7 +33,8 @@ def load_models():
30
  "automatic-speech-recognition",
31
  model=stt_model,
32
  tokenizer=processor.tokenizer,
33
- feature_extractor=processor.feature_extractor
 
34
  )
35
 
36
  # Text-to-Speech
@@ -40,33 +44,69 @@ def load_models():
40
 
41
  stt_pipe, tts_model = load_models()
42
 
 
 
 
 
 
 
 
 
 
43
  # Streamlit app
44
  st.title("Voice-Enabled Chatbot")
45
 
46
- # Audio input
47
- audio_file = st.file_uploader("Upload your voice input", type=['wav'])
48
- if audio_file is not None:
49
- audio_bytes = audio_file.read()
50
- audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
51
- audio.export("temp.wav", format="wav")
52
- speech, _ = sf.read("temp.wav")
53
- text = stt_pipe(speech)['text']
54
- st.write("Transcribed Text:", text)
55
-
56
- # Generate response using Groq API
57
- try:
58
- chat_completion = groq_client.chat.completions.create(
59
- messages=[{"role": "user", "content": text}],
60
- model="mixtral-8x7b-32768",
61
- temperature=0.5,
62
- max_tokens=1024
63
- )
64
- response = chat_completion.choices[0].message.content
65
- st.write("Generated Response:", response)
66
-
67
- # Convert response to speech
68
- speech, *_ = tts_model(response)
69
- sf.write("response.wav", speech, 22050)
70
- st.audio("response.wav")
71
- except Exception as e:
72
- st.error(f"Error generating response: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import streamlit as st
3
  from groq import Groq
4
+ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
5
  from espnet2.bin.tts_inference import Text2Speech
6
  import soundfile as sf
7
  from pydub import AudioSegment
8
  import io
9
  from dotenv import load_dotenv
10
+ from streamlit_webrtc import webrtc_streamer, WebRtcMode, AudioProcessorBase
11
+ import av
12
+ import numpy as np
13
 
14
  # Load environment variables from .env file
15
  load_dotenv()
 
33
  "automatic-speech-recognition",
34
  model=stt_model,
35
  tokenizer=processor.tokenizer,
36
+ feature_extractor=processor.feature_extractor,
37
+ return_timestamps=True # Enable timestamps for long-form audio
38
  )
39
 
40
  # Text-to-Speech
 
44
 
45
  stt_pipe, tts_model = load_models()
46
 
47
+ # Audio recorder
48
+ class AudioRecorder(AudioProcessorBase):
49
+ def __init__(self):
50
+ self.audio_frames = []
51
+
52
+ def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
53
+ self.audio_frames.append(frame.to_ndarray())
54
+ return frame
55
+
56
  # Streamlit app
57
  st.title("Voice-Enabled Chatbot")
58
 
59
+ # Audio recorder
60
+ st.write("Record your voice:")
61
+ webrtc_ctx = webrtc_streamer(
62
+ key="audio-recorder",
63
+ mode=WebRtcMode.SENDONLY,
64
+ audio_processor_factory=AudioRecorder,
65
+ media_stream_constraints={"audio": True, "video": False},
66
+ )
67
+
68
+ if webrtc_ctx.audio_processor:
69
+ st.write("Recording... Press 'Stop' to finish recording.")
70
+
71
+ # Save recorded audio to a WAV file
72
+ if st.button("Stop and Process Recording"):
73
+ audio_frames = webrtc_ctx.audio_processor.audio_frames
74
+ if audio_frames:
75
+ # Combine audio frames into a single array
76
+ audio_data = np.concatenate(audio_frames)
77
+ # Save as WAV file
78
+ sf.write("recorded_audio.wav", audio_data, samplerate=16000)
79
+ st.success("Recording saved as recorded_audio.wav")
80
+
81
+ # Process the recorded audio
82
+ speech, _ = sf.read("recorded_audio.wav")
83
+ output = stt_pipe(speech) # Transcribe with timestamps
84
+
85
+ # Display the full transcribed text
86
+ st.write("Transcribed Text:", output['text'])
87
+
88
+ # Display the text with timestamps (optional)
89
+ if 'chunks' in output:
90
+ st.write("Transcribed Text with Timestamps:")
91
+ for chunk in output['chunks']:
92
+ st.write(f"{chunk['timestamp'][0]:.2f} - {chunk['timestamp'][1]:.2f}: {chunk['text']}")
93
+
94
+ # Generate response using Groq API
95
+ try:
96
+ chat_completion = groq_client.chat.completions.create(
97
+ messages=[{"role": "user", "content": output['text']}],
98
+ model="mixtral-8x7b-32768",
99
+ temperature=0.5,
100
+ max_tokens=1024
101
+ )
102
+ response = chat_completion.choices[0].message.content
103
+ st.write("Generated Response:", response)
104
+
105
+ # Convert response to speech
106
+ speech, *_ = tts_model(response, spembs=tts_model.spembs[0]) # Use the first speaker embedding
107
+ sf.write("response.wav", speech, 22050)
108
+ st.audio("response.wav")
109
+ except Exception as e:
110
+ st.error(f"Error generating response: {e}")
111
+ else:
112
+ st.error("No audio recorded. Please try again.")