Krish-05 committed on
Commit
bd7abe9
·
verified ·
1 Parent(s): 7caf3da

Update streamlit_app.py

Browse files
Files changed (1) hide show
  1. streamlit_app.py +315 -339
streamlit_app.py CHANGED
@@ -5,384 +5,360 @@ import time
5
  import logging
6
  import numpy as np
7
  import sys
8
- import io
9
- import soundfile as sf
10
- import queue
11
- import pkg_resources # Import pkg_resources for version checking
12
-
13
- from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
14
- import av # Required for audio frames processing
15
- from streamlit.components.v1 import html # Import html for custom JS
16
 
17
  # Configure logging
18
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
19
  logger = logging.getLogger(__name__)
20
 
21
- # --- Configuration ---
22
- FASTAPI_HOST = "localhost"
23
- FASTAPI_PORT = 7860
24
- FASTAPI_OLLAMA_URL = f"http://{FASTAPI_HOST}:{FASTAPI_PORT}/ask"
25
- FASTAPI_STT_URL = f"http://{FASTAPI_HOST}:{FASTAPI_PORT}/transcribe/"
26
-
27
- # --- Package Version Verification (for debugging/info) ---
28
- logger.info("--- Checking installed package versions at runtime ---")
29
- try:
30
- st_version = pkg_resources.get_distribution("streamlit").version
31
- logger.info(f"Streamlit version: {st_version}")
32
- except pkg_resources.DistributionNotFound:
33
- logger.warning("Streamlit not found at runtime.")
34
  try:
35
- requests_version = pkg_resources.get_distribution("requests").version
36
- logger.info(f"Requests version: {requests_version}")
37
- except pkg_resources.DistributionNotFound:
38
- logger.warning("Requests not found at runtime.")
39
- try:
40
- webrtc_version = pkg_resources.get_distribution("streamlit-webrtc").version
41
- logger.info(f"streamlit-webrtc version: {webrtc_version}")
42
- except pkg_resources.DistributionNotFound:
43
- logger.warning("streamlit-webrtc not found at runtime.")
44
- try:
45
- # CORRECTED: Use pkg_resources consistently
46
- transformers_version = pkg_resources.get_distribution("transformers").version
47
- logger.info(f"transformers version: {transformers_version}")
48
- except pkg_resources.DistributionNotFound:
49
- logger.warning("transformers not found (expected for current app logic).")
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  except Exception as e:
51
- logger.error(f"Error getting transformers version: {e}")
52
- logger.info("--- Finished checking package versions ---")
53
 
54
- # --- Streamlit Page Setup ---
 
 
 
 
 
 
55
  st.set_page_config(page_title="Ollama AI Assistant", page_icon="🤖", layout="wide")
56
 
57
- # --- Session State Initialization ---
 
58
  if 'chat_history' not in st.session_state:
59
  st.session_state.chat_history = [
60
  {"role": "assistant", "message": "Hello! How can I assist you today?"}
61
  ]
62
- if 'microphone_active' not in st.session_state:
63
- st.session_state.microphone_active = False
 
 
64
  if 'transcribed_text' not in st.session_state:
65
- st.session_state.transcribed_text = ""
66
- if 'audio_buffer' not in st.session_state:
67
- st.session_state.audio_buffer = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
- # --- App Header & Chat Display ---
70
  st.title("🤖 Ollama AI Assistant")
71
- st.caption("Start chatting with our AI assistant. Type your message below or use the speaker icon.")
72
- st.markdown("---")
 
 
73
  for chat in st.session_state.chat_history:
 
74
  with st.chat_message(chat["role"], avatar="🤖" if chat["role"] == "assistant" else "👤"):
75
  st.write(chat["message"])
76
 
77
- # --- WebRTC Streamer (Always Rendered, but audio processing is conditional) ---
78
- webrtc_ctx = webrtc_streamer(
79
- key="microphone_input_permanent", # Use a fixed key
80
- mode=WebRtcMode.SENDONLY,
81
- rtc_configuration=RTCConfiguration({"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}),
82
- audio_receiver_size=2048, # A larger buffer for hold-to-speak
83
- media_stream_constraints={"video": False, "audio": True}
84
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
- # Function to transcribe and update text area
87
- def transcribe_and_update_text_area(audio_data_list, sample_rate=48000):
88
- if audio_data_list:
89
- try:
90
- combined_audio = np.concatenate(audio_data_list)
91
- byte_io = io.BytesIO()
92
- sf.write(byte_io, combined_audio, sample_rate, format='WAV')
93
- byte_io.seek(0)
 
 
 
 
 
94
 
95
- files = {"audio_file": ("audio.wav", byte_io.getvalue(), "audio/wav")}
96
- logger.info(f"Sending {len(combined_audio)} samples (at {sample_rate} Hz) to FastAPI STT endpoint.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
- with st.spinner("Transcribing audio..."):
99
- response = requests.post(FASTAPI_STT_URL, files=files)
100
- response.raise_for_status()
101
- transcribed_data = response.json()
102
- st.session_state.transcribed_text = transcribed_data.get("transcribed_text", "Could not transcribe.")
103
- logger.info(f"Transcribed text received: {st.session_state.transcribed_text[:100]}...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  except requests.exceptions.ConnectionError:
105
- st.session_state.transcribed_text = (f"Error: Could not connect to the STT server. "
106
- f"Please ensure it is running at {FASTAPI_STT_URL}.")
107
- logger.error(f"ConnectionError to STT FastAPI at {FASTAPI_STT_URL}")
 
 
108
  except requests.exceptions.RequestException as e:
 
109
  error_details = e.response.text if e.response is not None else str(e)
110
- st.session_state.transcribed_text = (f"An error occurred during STT request. "
111
- f"Details: {error_details}")
112
- logger.error(f"Request error to STT FastAPI: {e}", exc_info=True)
 
 
113
  except Exception as e:
114
- st.session_state.transcribed_text = f"An unexpected error occurred during STT: {e}"
115
- logger.exception("An unexpected error occurred during STT transcription.")
116
- finally:
117
- st.session_state.audio_buffer = []
118
- st.session_state.microphone_active = False
119
- st.rerun()
120
- else:
121
- logger.info("No audio data to transcribe.")
122
- st.session_state.microphone_active = False
 
123
  st.rerun()
 
 
 
 
124
 
125
- # --- Custom JavaScript for Hold-to-Speak Button ---
126
- SPEAKER_BUTTON_HTML = """
127
- <style>
128
- .speaker-button-container {
129
- display: flex;
130
- align-items: center;
131
- justify-content: flex-end;
132
- padding-top: 10px;
133
- }
134
- .speaker-button {
135
- background-color: #4CAF50;
136
- color: white;
137
- padding: 10px 15px;
138
- border: none;
139
- border-radius: 5px;
140
- font-size: 16px;
141
- cursor: pointer;
142
- transition: background-color 0.3s ease;
143
- display: flex;
144
- align-items: center;
145
- gap: 8px;
146
- }
147
- .speaker-button:hover {
148
- background-color: #45a049;
149
- }
150
- .speaker-button:active {
151
- background-color: #3e8e41;
152
- }
153
- .speaker-button.active {
154
- background-color: #f44336;
155
- }
156
- .speaker-button.active:hover {
157
- background-color: #da190b;
158
- }
159
- @keyframes pulse {
160
- 0% { box-shadow: 0 0 0 0 rgba(244, 67, 54, 0.7); }
161
- 70% { box-shadow: 0 0 0 10px rgba(244, 67, 54, 0); }
162
- 100% { box-shadow: 0 0 0 0 rgba(244, 67, 54, 0); }
163
- }
164
- .speaker-button.active {
165
- animation: pulse 1.5s infinite;
166
- }
167
- </style>
168
- <div class="speaker-button-container">
169
- <button id="speakerButton" class="speaker-button">
170
- <i class="fa fa-microphone" style="font-size:24px"></i>
171
- <span id="buttonText">Hold to Speak</span>
172
- </button>
173
- </div>
174
- <script>
175
- const speakerButton = document.getElementById('speakerButton');
176
- const buttonText = document.getElementById('buttonText');
177
- let isRecording = false;
178
-
179
- function sendMessageToStreamlit(action) {
180
- window.parent.postMessage({
181
- streamlit: true,
182
- type: 'FROM_IFRAME',
183
- data: { action: action }
184
- }, '*');
185
- }
186
 
187
- window.addEventListener('message', event => {
188
- if (event.data.type === 'streamlit:setComponentValue' && event.data.key === 'speaker_button_state') {
189
- const state = event.data.value;
190
- if (state.active === true && !isRecording) {
191
- speakerButton.classList.add('active');
192
- buttonText.textContent = 'Recording... Release to Transcribe';
193
- isRecording = true;
194
- } else if (state.active === false && isRecording) {
195
- speakerButton.classList.remove('active');
196
- buttonText.textContent = 'Hold to Speak';
197
- isRecording = false;
198
- }
199
- }
200
- });
201
-
202
- speakerButton.addEventListener('mousedown', () => {
203
- if (!isRecording) {
204
- sendMessageToStreamlit('start_recording');
205
- speakerButton.classList.add('active');
206
- buttonText.textContent = 'Recording... Release to Transcribe';
207
- isRecording = true;
208
- }
209
- });
210
-
211
- speakerButton.addEventListener('mouseup', () => {
212
- if (isRecording) {
213
- sendMessageToStreamlit('stop_recording');
214
- speakerButton.classList.remove('active');
215
- buttonText.textContent = 'Processing...';
216
- isRecording = false;
217
- }
218
- });
219
-
220
- speakerButton.addEventListener('mouseleave', () => {
221
- if (isRecording) {
222
- sendMessageToStreamlit('stop_recording');
223
- speakerButton.classList.remove('active');
224
- buttonText.textContent = 'Processing...';
225
- isRecording = false;
226
- }
227
- });
228
-
229
- speakerButton.addEventListener('contextmenu', e => e.preventDefault());
230
 
231
- speakerButton.addEventListener('touchstart', (e) => {
232
- e.preventDefault();
233
- if (!isRecording) {
234
- sendMessageToStreamlit('start_recording');
235
- speakerButton.classList.add('active');
236
- buttonText.textContent = 'Recording... Release to Transcribe';
237
- isRecording = true;
238
- }
239
- }, { passive: false });
240
-
241
- speakerButton.addEventListener('touchend', (e) => {
242
- e.preventDefault();
243
- if (isRecording) {
244
- sendMessageToStreamlit('stop_recording');
245
- speakerButton.classList.remove('active');
246
- buttonText.textContent = 'Processing...';
247
- isRecording = false;
248
- }
249
- }, { passive: false });
250
-
251
- speakerButton.addEventListener('touchcancel', (e) => {
252
- e.preventDefault();
253
- if (isRecording) {
254
- sendMessageToStreamlit('stop_recording');
255
- speakerButton.classList.remove('active');
256
- buttonText.textContent = 'Processing...';
257
- isRecording = false;
258
- }
259
- }, { passive: false });
260
-
261
- </script>
262
- <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
263
- """
264
-
265
- # --- Input Area ---
266
- col1, col2 = st.columns([0.8, 0.2])
267
 
268
- with col1:
269
- with st.form("chat_form", clear_on_submit=True):
270
- user_prompt = st.text_area(
271
- "Type your message here...",
272
- height=100,
273
- placeholder="e.g., Explain quantum computing in simple terms.",
274
- label_visibility="collapsed",
275
- key="user_input_text_area",
276
- value=st.session_state.transcribed_text
277
- )
278
 
279
- # Clear transcribed text if user starts typing in the text_area
280
- if user_prompt != st.session_state.transcribed_text and st.session_state.transcribed_text != "":
281
- st.session_state.transcribed_text = ""
282
- logger.info("Transcribed text cleared because user started typing/editing.")
 
 
 
 
 
 
 
283
 
284
- submitted = st.form_submit_button("Send")
285
- if submitted:
286
- st.session_state.microphone_active = False # Ensure mic is off on send
287
-
288
- if user_prompt:
289
- logger.info(f"User submitted prompt: {user_prompt[:100]}...")
290
- st.session_state.chat_history.append({"role": "user", "message": user_prompt})
291
- st.session_state.transcribed_text = ""
292
-
293
- with st.chat_message("assistant", avatar="🤖"):
294
- response_placeholder = st.empty()
295
- response_placeholder.write("Thinking...")
296
-
297
- full_response = ""
298
- byte_buffer = b""
299
- try:
300
- payload = {"text": user_prompt}
301
- headers = {"Content-Type": "application/json"}
302
- with requests.post(FASTAPI_OLLAMA_URL, json=payload, headers=headers, stream=True) as response:
303
- response.raise_for_status()
304
- for chunk in response.iter_content(chunk_size=1):
305
- if chunk:
306
- byte_buffer += chunk
307
- try:
308
- decoded_text = byte_buffer.decode("utf-8", errors="strict")
309
- full_response += decoded_text
310
- response_placeholder.markdown(full_response + "▌")
311
- byte_buffer = b""
312
- except UnicodeDecodeError:
313
- pass
314
- except Exception as e:
315
- full_response += chunk.decode("utf-8", errors="replace")
316
- response_placeholder.markdown(full_response + "▌")
317
- byte_buffer = b""
318
-
319
- if byte_buffer:
320
- full_response += byte_buffer.decode("utf-8", errors="replace")
321
-
322
- response_placeholder.markdown(full_response)
323
- except requests.exceptions.ConnectionError:
324
- full_response = (f"Error: Could not connect to the FastAPI server. "
325
- f"Please ensure it is running at {FASTAPI_OLLAMA_URL}.")
326
- response_placeholder.error(full_response)
327
- logger.error(f"Connection error to FastAPI LLM at {FASTAPI_OLLAMA_URL}", exc_info=True)
328
- except requests.exceptions.RequestException as e:
329
- error_details = e.response.text if e.response is not None else str(e)
330
- status_code = e.response.status_code if e.response is not None else "N/A"
331
- full_response = (f"An error occurred during the request to FastAPI. "
332
- f"Status code: {status_code}\nDetails: {error_details}")
333
- response_placeholder.error(full_response)
334
- logger.error(f"Request error to FastAPI LLM: {e}", exc_info=True)
335
- except Exception as e:
336
- full_response = f"An unexpected error occurred: {e}"
337
- response_placeholder.error(full_response)
338
- logger.exception("An unexpected error occurred during LLM processing.")
339
 
340
- st.session_state.chat_history.append({"role": "assistant", "message": full_response})
341
- st.rerun()
342
- else:
343
- st.warning("Please enter a prompt before clicking 'Send'.")
344
-
345
- with col2:
346
- speaker_button_event = html(SPEAKER_BUTTON_HTML, height=70, scrolling=False)
347
 
348
- # Communicate current recording state to the JavaScript component
349
- st.write(f"<script>window.parent.postMessage({{ type: 'streamlit:setComponentValue', key: 'speaker_button_state', value: {{ active: {str(st.session_state.microphone_active).lower()} }} }}, '*');</script>", unsafe_allow_html=True)
350
-
351
- # Process messages from the custom JavaScript button
352
- if isinstance(speaker_button_event, dict) and "action" in speaker_button_event: # CORRECTED LINE
353
- if speaker_button_event["action"] == "start_recording":
354
- if webrtc_ctx.state.playing and not st.session_state.microphone_active:
355
- st.session_state.microphone_active = True
356
- st.session_state.audio_buffer = []
357
- st.session_state.transcribed_text = ""
358
- logger.info("JS: Start recording signal received. Microphone active.")
359
- st.rerun()
360
- elif not webrtc_ctx.state.playing:
361
- st.warning("Please allow microphone access in your browser.")
362
- logger.warning("JS: Start recording signal received, but WebRTC context is not playing.")
363
- elif st.session_state.microphone_active:
364
- logger.info("JS: Start recording signal received, but microphone already active.")
365
 
366
- elif speaker_button_event["action"] == "stop_recording":
367
- if st.session_state.microphone_active:
368
- logger.info("JS: Stop recording signal received. Transcribing...")
369
- transcribe_and_update_text_area(st.session_state.audio_buffer)
370
- else:
371
- logger.info("JS: Stop recording signal received, but microphone was not active.")
372
-
373
- # --- Real-time audio buffering from webrtc_ctx ---
374
- if webrtc_ctx.state.playing and st.session_state.microphone_active:
375
- try:
376
- audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=0.01)
377
- for frame in audio_frames:
378
- audio_array = frame.to_ndarray(format="flt").flatten()
379
- st.session_state.audio_buffer.append(audio_array)
380
- except queue.Empty:
381
- pass
382
- except Exception as e:
383
- logger.error(f"Error getting audio frames from webrtc_ctx: {e}", exc_info=True)
384
 
385
 
386
  # --- Footer ---
387
  st.markdown("---")
388
- st.caption("Powered by Ollama, Hugging Face (STT), FastAPI, and Streamlit.")
 
5
  import logging
6
  import numpy as np
7
  import sys
8
+ import io # New: For handling audio bytes
9
+ from pydub import AudioSegment # New: For converting audio formats (requires ffmpeg)
10
+ from streamlit_webrtc import WebRtcMode, webrtc_streamer, AudioProcessorBase, ClientSettings # New: For microphone access
 
 
 
 
 
11
 
12
  # Configure logging
13
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
14
  logger = logging.getLogger(__name__)
15
 
16
+ # --- Debugging: Display installed package versions ---
 
 
 
 
 
 
 
 
 
 
 
 
17
  try:
18
+ import pkg_resources
19
+ st.sidebar.write(f"Streamlit version: {pkg_resources.get_distribution('streamlit').version}")
20
+ st.sidebar.write(f"Requests version: {pkg_resources.get_distribution('requests').version}")
21
+
22
+ try:
23
+ webrtc_version = pkg_resources.get_distribution("streamlit-webrtc").version
24
+ st.sidebar.write(f"streamlit-webrtc version: {webrtc_version}")
25
+ except pkg_resources.DistributionNotFound:
26
+ st.sidebar.write("streamlit-webrtc not found (expected for current app logic).")
27
+ except Exception as e:
28
+ st.sidebar.write(f"Could not get streamlit-webrtc version: {e}")
29
+ try:
30
+ # Check for faster-whisper and pydub
31
+ fw_version = pkg_resources.get_distribution("faster-whisper").version
32
+ st.sidebar.write(f"faster-whisper version: {fw_version}")
33
+ except pkg_resources.DistributionNotFound:
34
+ st.sidebar.write("faster-whisper not found (expected for current app logic).")
35
+ except Exception as e:
36
+ st.sidebar.write(f"Could not get faster-whisper version: {e}")
37
+ try:
38
+ pd_version = pkg_resources.get_distribution("pydub").version
39
+ st.sidebar.write(f"pydub version: {pd_version}")
40
+ except pkg_resources.DistributionNotFound:
41
+ st.sidebar.write("pydub not found (expected for current app logic).")
42
+ except Exception as e:
43
+ st.sidebar.write(f"Could not get pydub version: {e}")
44
+
45
+ # Not expecting transformers here, removed for clarity.
46
  except Exception as e:
47
+ st.sidebar.write(f"Could not get package versions: {e}")
48
+ # --- End Debugging Section ---
49
 
50
+ # Configuration for the FastAPI backend
51
+ FASTAPI_HOST = "localhost"
52
+ FASTAPI_PORT = 7860
53
+ FASTAPI_LLM_URL = f"http://{FASTAPI_HOST}:{FASTAPI_PORT}/ask" # For LLM requests
54
+ FASTAPI_STT_URL = f"http://{FASTAPI_HOST}:{FASTAPI_PORT}/transcribe_audio" # For STT requests
55
+
56
+ # Set Streamlit page configuration
57
  st.set_page_config(page_title="Ollama AI Assistant", page_icon="🤖", layout="wide")
58
 
59
+ # --- Session state for chat history ---
60
+ # Initialize chat history if it doesn't exist in session state
61
  if 'chat_history' not in st.session_state:
62
  st.session_state.chat_history = [
63
  {"role": "assistant", "message": "Hello! How can I assist you today?"}
64
  ]
65
+ logger.info("Chat history initialized.")
66
+
67
+ # --- Session state for STT and WebRTC ---
68
+ # This controls the microphone recording lifecycle
69
  if 'transcribed_text' not in st.session_state:
70
+ st.session_state.transcribed_text = "" # Stores the last transcribed text
71
+ if 'webrtc_state' not in st.session_state:
72
+ st.session_state.webrtc_state = "idle" # idle, listening, processing_audio
73
+
74
+ # --- Custom Audio Processor for VAD and Audio Buffering ---
75
class VADAudioProcessor(AudioProcessorBase):
    """Buffer raw microphone audio and run a simple volume-based VAD.

    Each incoming chunk of raw 16-bit PCM is appended to an in-memory
    buffer; an RMS volume estimate drives a naive Voice Activity
    Detection state machine (``voice_detected`` / ``silent_frames_count``).
    The processor never stops the stream itself — the Streamlit main loop
    inspects its counters and decides when to stop recording.

    NOTE(review): the ``process(bytes) -> bytes`` signature assumes the
    hosting framework delivers raw PCM bytes; streamlit-webrtc's
    ``AudioProcessorBase`` normally uses ``recv(av.AudioFrame)`` — confirm
    against the installed streamlit-webrtc version.
    """

    def __init__(self):
        # Raw 16-bit PCM accumulates here; drained by the app after recording.
        self.audio_buffer = io.BytesIO()
        self.silent_frames_count = 0   # consecutive below-threshold frames
        self.voice_detected = False    # becomes True once volume exceeds threshold
        self.frame_rate = 16000        # assumed sample rate — TODO confirm WebRTC actually delivers 16 kHz
        self.samples_width = 2         # 16-bit audio (2 bytes per sample)
        self.threshold = 500           # RMS gate; tune for mic sensitivity / room noise
        self.max_silent_frames = 30    # silence frames before the app auto-stops (~0.3 s at 10 ms/frame)
        self.total_frames_processed = 0
        logger.info("VADAudioProcessor initialized.")

    def _calculate_volume(self, audio_chunk: bytes) -> float:
        """Return the RMS (Root Mean Square) volume of a 16-bit PCM chunk.

        Samples are widened to float64 before squaring. The previous
        implementation squared the int16 array in place, which wraps
        around for any sample with |value| > 181 (181**2 > 32767), so the
        RMS was garbage for normal speech levels and the VAD threshold
        comparison was effectively random.
        """
        audio_array = np.frombuffer(audio_chunk, dtype=np.int16)
        if audio_array.size == 0:
            return 0.0
        # Widen BEFORE squaring to avoid int16 overflow.
        samples = audio_array.astype(np.float64)
        return float(np.sqrt(np.mean(samples * samples)))

    def process(self, audio_chunk: bytes) -> bytes:
        """Buffer one incoming audio chunk and update the VAD state.

        Returns the chunk unchanged (pass-through); side effects are the
        buffer append and the ``voice_detected`` / ``silent_frames_count``
        bookkeeping the app's main loop reads.
        """
        self.audio_buffer.write(audio_chunk)
        self.total_frames_processed += 1

        volume = self._calculate_volume(audio_chunk)

        if volume > self.threshold:
            # Any loud frame marks voice and resets the silence run.
            self.voice_detected = True
            self.silent_frames_count = 0
        elif self.voice_detected:
            # Only count silence after speech has started, so leading
            # silence never triggers an auto-stop.
            self.silent_frames_count += 1

        # Collection only — stopping is decided by the Streamlit app loop.
        return audio_chunk
124
 
125
+ # --- App Header ---
126
  st.title("🤖 Ollama AI Assistant")
127
+ st.caption("Start chatting with our AI assistant. Type your message or use the microphone.")
128
+
129
+ # --- Chat Display ---
130
+ st.markdown("---") # Separator for visual clarity
131
  for chat in st.session_state.chat_history:
132
+ # Use Streamlit's chat_message container for distinct roles
133
  with st.chat_message(chat["role"], avatar="🤖" if chat["role"] == "assistant" else "👤"):
134
  st.write(chat["message"])
135
 
136
+ # --- Input Area ---
137
+ # Use a form to handle user input and submission
138
+ with st.form("chat_form", clear_on_submit=True):
139
+ # Store the user's prompt in session state so it can be pre-filled by STT
140
+ user_prompt_key = "user_input_text_area" # A unique key for the text area
141
+ user_prompt = st.text_area(
142
+ "Type your message here...",
143
+ height=100,
144
+ placeholder="e.g., Explain quantum computing in simple terms.",
145
+ label_visibility="collapsed", # Hide the default label for a cleaner look
146
+ key=user_prompt_key,
147
+ value=st.session_state.transcribed_text # Pre-fill with transcribed text from STT
148
+ )
149
+
150
+ col1, col2 = st.columns([1, 1])
151
+ with col1:
152
+ submitted = st.form_submit_button("Send")
153
+ with col2:
154
+ # Microphone button logic
155
+ record_button_label = "Stop Listening" if st.session_state.webrtc_state == "listening" else "Start Listening"
156
+ microphone_button = st.form_submit_button(record_button_label, key="microphone_button")
157
 
158
+ # Handle microphone button press to control WebRTC state
159
+ if microphone_button:
160
+ if st.session_state.webrtc_state == "idle":
161
+ # Transition to 'listening' state
162
+ st.session_state.webrtc_state = "listening"
163
+ st.session_state.transcribed_text = "" # Clear any previous transcription
164
+ st.info("Listening... Tap 'Stop Listening' or wait for silence to auto-stop.")
165
+ st.rerun() # Rerun to activate the WebRTC streamer
166
+ elif st.session_state.webrtc_state == "listening":
167
+ # User manually clicked 'Stop Listening', transition to 'processing_audio'
168
+ st.session_state.webrtc_state = "processing_audio"
169
+ st.info("Stopping recording and processing audio...")
170
+ st.rerun() # Rerun to trigger audio processing
171
 
172
+ # Process the prompt when the 'Send' button is submitted and prompt is not empty
173
+ if submitted and user_prompt:
174
+ logger.info(f"User submitted prompt: {user_prompt[:50]}...") # Log the submitted prompt
175
+ # Add user's message to chat history immediately
176
+ st.session_state.chat_history.append({"role": "user", "message": user_prompt})
177
+ st.session_state.transcribed_text = "" # Clear transcribed text after it's sent to LLM
178
+
179
+ # Display a "Thinking..." message while waiting for the AI response
180
+ with st.chat_message("assistant", avatar="🤖"):
181
+ response_placeholder = st.empty() # Create an empty placeholder for streaming content
182
+ response_placeholder.write("Thinking...") # Initial message
183
+ logger.info("Displaying 'Thinking...' message.")
184
+
185
+ full_response = "" # Initialize an empty string to build the full response
186
+ byte_buffer = b"" # Initialize a buffer for incomplete UTF-8 characters for streaming
187
+ try:
188
+ # Prepare the request payload for FastAPI LLM endpoint
189
+ payload = {"text": user_prompt}
190
+ headers = {"Content-Type": "application/json"}
191
+ logger.info(f"Sending LLM request to FastAPI at {FASTAPI_LLM_URL}")
192
 
193
+ # Make a streaming POST request to the FastAPI endpoint
194
+ with requests.post(FASTAPI_LLM_URL, json=payload, headers=headers, stream=True) as response:
195
+ logger.info(f"Received LLM response from FastAPI with status code: {response.status_code}")
196
+ response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
197
+
198
+ # Iterate over the response content as it streams (byte by byte)
199
+ for chunk in response.iter_content(chunk_size=1):
200
+ if chunk: # Filter out potential empty keep-alive chunks
201
+ byte_buffer += chunk # Append new bytes to the buffer
202
+ try:
203
+ # Attempt to decode the entire buffer using 'strict' error handling
204
+ decoded_text = byte_buffer.decode("utf-8", errors="strict")
205
+ full_response += decoded_text
206
+ response_placeholder.markdown(full_response + "▌") # Update display, add cursor
207
+ byte_buffer = b"" # Clear the buffer if decoding was successful
208
+ except UnicodeDecodeError:
209
+ # This is expected if a multi-byte character is split across chunks.
210
+ # Do nothing, just wait for the next chunk to complete the character.
211
+ pass
212
+ except Exception as e:
213
+ # Catch any other unexpected decoding errors
214
+ logger.error(f"Error decoding stream chunk: {e} - Raw bytes: {chunk}")
215
+ try:
216
+ full_response += chunk.decode("utf-8", errors="replace")
217
+ except Exception as decode_err:
218
+ logger.error(f"Failed to decode even with replace errors: {decode_err}")
219
+ full_response += "[Decoding Error]" # Indicate a severe decoding issue
220
+ response_placeholder.markdown(full_response + "▌")
221
+ byte_buffer = b"" # Clear buffer to try and recover
222
+
223
+ # After the loop, if there are any remaining bytes in the buffer, try to decode them
224
+ if byte_buffer:
225
+ try:
226
+ full_response += byte_buffer.decode("utf-8", errors="replace")
227
+ logger.warning("Remaining bytes in buffer decoded with replacement.")
228
+ except Exception as e:
229
+ logger.error(f"Failed to decode final buffer bytes: {e}")
230
+ full_response += "[Final Decoding Error]"
231
+ response_placeholder.markdown(full_response) # Final update without cursor
232
+ logger.info("Streaming complete. Full LLM response received.")
233
+
234
  except requests.exceptions.ConnectionError:
235
+ # Handle cases where Streamlit cannot connect to FastAPI
236
+ full_response = (f"Error: Could not connect to the FastAPI server. "
237
+ f"Please ensure it is running at {FASTAPI_LLM_URL}.")
238
+ response_placeholder.error(full_response) # Display error in the placeholder
239
+ logger.error(f"ConnectionError: Could not connect to FastAPI at {FASTAPI_LLM_URL}")
240
  except requests.exceptions.RequestException as e:
241
+ # Handle other request-related errors (e.g., HTTP errors from raise_for_status)
242
  error_details = e.response.text if e.response is not None else str(e)
243
+ status_code = e.response.status_code if e.response is not None else "N/A"
244
+ full_response = (f"An error occurred during the request to FastAPI. "
245
+ f"Status code: {status_code}\nDetails: {error_details}")
246
+ response_placeholder.error(full_response) # Display error in the placeholder
247
+ logger.error(f"Request error to FastAPI: {e}", exc_info=True)
248
  except Exception as e:
249
+ # Catch any other unexpected errors during the request or processing
250
+ full_response = f"An unexpected error occurred: {e}"
251
+ response_placeholder.error(full_response) # Display error in the placeholder
252
+ logger.exception("An unexpected error occurred during API request.") # Logs traceback
253
+
254
+ # After the streaming is complete (or an error occurred), add the final response
255
+ # to the chat history. This ensures it persists across reruns.
256
+ st.session_state.chat_history.append({"role": "assistant", "message": full_response})
257
+ logger.info("Final LLM response added to chat history.")
258
+ # Rerun the app to display the updated chat history with the final response
259
  st.rerun()
260
+ elif submitted and not user_prompt:
261
+ # Warn user if no prompt is entered for the 'Send' button
262
+ st.warning("Please enter a prompt before clicking 'Send'.")
263
+ logger.warning("User attempted to send an empty text prompt.")
264
 
265
+ # --- WebRTC Streamer for Microphone Input ---
266
+ webrtc_ctx = None
267
+ if st.session_state.webrtc_state in ["listening", "processing_audio"]:
268
+ logger.info(f"Initiating webrtc_streamer with state: {st.session_state.webrtc_state}")
269
+ webrtc_ctx = webrtc_streamer(
270
+ key="ollama-audio-input", # Unique key for this component
271
+ mode=WebRtcMode.SENDONLY, # Only send audio from browser to Python
272
+ audio_html_attrs={
273
+ "autoPlay": "true",
274
+ "controls": "",
275
+ "muted": "muted", # Mute local playback to avoid echo
276
+ },
277
+ # Use our custom processor to handle audio frames and VAD
278
+ in_audio_frames_processor_factory=VADAudioProcessor,
279
+ client_settings=ClientSettings(
280
+ rtc_configuration={"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}, # STUN server for NAT traversal
281
+ media_stream_constraints={"audio": True, "video": False}, # Only request audio stream
282
+ ),
283
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
 
285
+ # Display status messages while recording
286
+ if webrtc_ctx.state.playing and st.session_state.webrtc_state == "listening":
287
+ st.info("Microphone active. Speak clearly now...")
288
+ elif not webrtc_ctx.state.playing and st.session_state.webrtc_state == "listening":
289
+ st.warning("Waiting for microphone permissions... Please grant access if prompted.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
 
291
+ # Check VAD status from the audio processor
292
+ if webrtc_ctx.audio_processor:
293
+ processor: VADAudioProcessor = webrtc_ctx.audio_processor
294
+ # If voice was detected, and now prolonged silence is detected
295
+ if processor.voice_detected and processor.silent_frames_count >= processor.max_silent_frames:
296
+ logger.info("VAD detected prolonged silence. Transitioning to processing audio.")
297
+ # Set state to processing, which will cause a rerun and stop the streamer
298
+ if st.session_state.webrtc_state == "listening": # Only auto-stop if currently listening
299
+ st.session_state.webrtc_state = "processing_audio"
300
+ st.info("Silence detected. Processing audio for transcription...")
301
+ st.rerun() # Trigger a rerun to process the audio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
 
303
+ # --- Audio Processing and STT Call after Recording Stops ---
304
+ # This block runs when we transition to 'processing_audio' state and the WebRTC session is truly stopped.
305
+ if st.session_state.webrtc_state == "processing_audio" and (webrtc_ctx is None or not webrtc_ctx.state.playing):
306
+ logger.info("WebRTC session stopped (or never started in processing_audio state). Attempting to get audio.")
307
+ # Ensure we have an audio processor instance from the stopped session
308
+ if webrtc_ctx and webrtc_ctx.audio_processor:
309
+ processor: VADAudioProcessor = webrtc_ctx.audio_processor
310
+ if processor.audio_buffer.tell() > 0: # Check if any audio data was recorded
311
+ recorded_audio_bytes = processor.audio_buffer.getvalue()
312
+ logger.info(f"Recorded audio buffer size: {len(recorded_audio_bytes)} bytes.")
313
 
314
+ # Convert raw 16-bit PCM (from WebRTC) to WAV format using pydub
315
+ try:
316
+ audio = AudioSegment(
317
+ recorded_audio_bytes,
318
+ sample_width=processor.samples_width,
319
+ frame_rate=processor.frame_rate,
320
+ channels=1 # WebRTC typically provides mono audio
321
+ )
322
+ wav_io = io.BytesIO()
323
+ audio.export(wav_io, format="wav") # Export to WAV format
324
+ wav_io.seek(0) # Rewind the buffer to the beginning for reading
325
 
326
+ st.info("Sending recorded audio to STT backend for transcription...")
327
+ # Send the WAV audio bytes to the FastAPI STT endpoint
328
+ files = {'audio_file': ('audio.wav', wav_io.getvalue(), 'audio/wav')}
329
+ response = requests.post(FASTAPI_STT_URL, files=files)
330
+ response.raise_for_status() # Raise HTTPError for bad responses
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
 
332
+ transcription_result = response.json()
333
+ transcribed_text = transcription_result.get("transcribed_text", "").strip()
334
+ st.session_state.transcribed_text = transcribed_text # Store transcribed text
 
 
 
 
335
 
336
+ logger.info(f"Transcription received: {transcribed_text[:100]}...")
337
+ if transcribed_text:
338
+ st.success("Transcription complete!")
339
+ else:
340
+ st.warning("No clear speech detected or transcription resulted in empty text.")
341
+ except requests.exceptions.RequestException as e:
342
+ st.error(f"Error sending audio to STT backend: {e}")
343
+ logger.error(f"STT Backend error: {e}", exc_info=True)
344
+ st.session_state.transcribed_text = "" # Clear on error
345
+ except Exception as e:
346
+ st.error(f"An unexpected error occurred during audio processing or STT: {e}")
347
+ logger.exception("Unexpected error in STT processing.")
348
+ st.session_state.transcribed_text = "" # Clear on error
349
+ else:
350
+ st.warning("No audio was recorded during the session.")
351
+ st.session_state.transcribed_text = ""
 
352
 
353
+ # Reset WebRTC state to idle after processing is complete
354
+ st.session_state.webrtc_state = "idle"
355
+ st.rerun() # Rerun to update the text area with transcription and reset UI
356
+ elif st.session_state.webrtc_state == "processing_audio":
357
+ st.warning("WebRTC context or audio processor was not available for transcription. Retrying or check permissions.")
358
+ st.session_state.webrtc_state = "idle" # Reset for next attempt
359
+ st.rerun()
 
 
 
 
 
 
 
 
 
 
 
360
 
361
 
362
  # --- Footer ---
363
  st.markdown("---")
364
+ st.caption("Powered by Ollama, FastAPI, Streamlit, and WebRTC.")