mgbam committed on
Commit
cd82c0d
·
verified ·
1 Parent(s): 78b49d0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +217 -68
app.py CHANGED
@@ -1,68 +1,217 @@
1
- import gradio as gr
2
- from utils import init_openai, transcribe_audio, chat_with_gpt
3
-
4
# 1. Initialize OpenAI key
init_openai()

# 2. System prompt steering the assistant's persona.
SYSTEM_PROMPT = (
    "You are RentBot, a friendly virtual leasing assistant. "
    "Answer inquiries about rental listings, schedule showings, and "
    "provide clear instructions to potential tenants."
)

def handle_interaction(audio) -> tuple[str, str, str]:
    """
    Transcribe recorded audio with Whisper, chat with GPT, and return
    (transcript, reply, status).

    Args:
        audio: Filesystem path of the recording — gr.Audio(type="filepath")
            delivers a plain str path (or None when nothing was recorded).

    Returns:
        (transcript, reply, status) strings for the three output widgets.
    """
    # Bug fix: the previous version did audio.save("input.wav"), but a
    # filepath-typed gr.Audio value is a str and has no .save() method,
    # so every request raised AttributeError. Use the path directly.
    if audio is None:
        return "", "", "⚠️ No audio recorded"
    transcript = transcribe_audio(audio)
    reply = chat_with_gpt(SYSTEM_PROMPT, transcript)
    return transcript, reply, "✅ Completed"
24
-
25
# 3. Custom CSS for card styling
custom_css = """
#input-panel, #output-panel {
  background: white;
  border-radius: 12px;
  padding: 1rem;
  box-shadow: 0 2px 8px rgba(0,0,0,0.1);
}
.status {
  margin-top: 0.5rem;
  color: #555;
  text-align: center;
}
#ask-btn {
  width: 100%;
  margin-top: 0.5rem;
}
"""

# 4. Build the Blocks app (no gr.Box needed).
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🏠 RentBot 24/7")

    with gr.Row():
        # Left card: microphone input, trigger button, and status line.
        with gr.Column(elem_id="input-panel"):
            gr.Markdown("### 🎙️ Ask RentBot")
            audio_input = gr.Audio(sources="microphone", type="filepath", label="")
            ask_btn = gr.Button("Tap to Speak & Ask", variant="primary", elem_id="ask-btn")
            status_txt = gr.Textbox(
                label="Status",
                value="Ready to record",
                interactive=False,
                elem_classes="status",
            )

        # Right card: transcript and the bot's reply.
        with gr.Column(elem_id="output-panel"):
            gr.Markdown("### 💬 Conversation")
            transcript_out = gr.Textbox(
                label="Transcribed Text",
                interactive=False,
                placeholder="Transcript appears here…",
            )
            reply_out = gr.Textbox(
                label="RentBot Reply",
                interactive=False,
                placeholder="Reply appears here…",
            )

    # Route the button click through the STT → GPT pipeline.
    ask_btn.click(
        fn=handle_interaction,
        inputs=audio_input,
        outputs=[transcript_out, reply_out, status_txt],
    )

# 5. Launch
demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # rentbot/app.py
2
+ import os
3
+ import base64
4
+ import json
5
+ import asyncio
6
+ import numpy as np
7
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
8
+ from dotenv import load_dotenv
9
+
10
+ from audio_utils import ulaw_to_pcm16
11
+ from stt_handler import transcribe_audio_chunk
12
+ from llm_handler import get_llm_response
13
+ from tts_handler import text_to_speech_stream
14
+ from tool_handler import execute_tool_call
15
+
16
# Load environment variables from a local .env file, if present.
load_dotenv()

app = FastAPI()
20
+
21
# Tunable audio parameters for buffering the inbound media stream.
SILENCE_THRESHOLD_SECONDS = 0.7
AUDIO_RATE = 8000  # Hz for Twilio media streams
AUDIO_BUFFER_SIZE = int(SILENCE_THRESHOLD_SECONDS * AUDIO_RATE)

# In-memory session storage (for demonstration).
# In production, use Redis or another persistent store.
sessions = {}
29
+
30
@app.websocket("/rentbot")
async def websocket_endpoint(ws: WebSocket):
    """
    Twilio Media Streams endpoint.

    Accepts the WebSocket, decodes inbound base64 µ-law frames to PCM16,
    buffers them, and hands sufficiently long chunks to process_user_audio
    as background tasks. Cleans up the session on disconnect/stop.
    """
    await ws.accept()
    stream_sid = None
    # Accumulate decoded fragments in a list and concatenate on demand;
    # the previous np.append-per-frame approach copied the whole buffer
    # on every 20 ms frame (O(n^2) over the utterance).
    pcm_chunks = []
    buffered_samples = 0

    def _drain() -> np.ndarray:
        """Concatenate and reset the buffered audio."""
        nonlocal pcm_chunks, buffered_samples
        merged = (np.concatenate(pcm_chunks) if pcm_chunks
                  else np.array([], dtype=np.int16))
        pcm_chunks = []
        buffered_samples = 0
        return merged

    def _busy(sid: str) -> bool:
        """True while a previous processing task is still running."""
        task = sessions[sid]["processing_task"]
        return task is not None and not task.done()

    try:
        async for message in ws.iter_text():
            try:
                data = json.loads(message)
            except json.JSONDecodeError:
                continue  # ignore malformed frames instead of killing the call
            event = data.get('event')

            if event == 'start':
                stream_sid = data['start']['streamSid']
                sessions[stream_sid] = {
                    # Fall back to a minimal prompt when the env var is unset;
                    # a None content field would be rejected downstream.
                    "messages": [{
                        "role": "system",
                        "content": os.getenv("SYSTEM_PROMPT")
                                   or "You are RentBot, a helpful leasing assistant.",
                    }],
                    "processing_task": None,
                }
                print(f"New stream started: {stream_sid}")

                # Send an initial greeting
                initial_greeting = "Hi! I'm RentBot, your leasing assistant. How can I help you today?"
                sessions[stream_sid]["messages"].append(
                    {"role": "assistant", "content": initial_greeting})

                async def send_initial_greeting(sid=stream_sid, text=initial_greeting):
                    # Defaults bind the current values to avoid late-binding
                    # surprises if another 'start' ever arrives.
                    async for audio_chunk in text_to_speech_stream(iter([text])):
                        payload = base64.b64encode(audio_chunk).decode('utf-8')
                        await ws.send_json({
                            "event": "media",
                            "streamSid": sid,
                            "media": {"payload": payload},
                        })
                    # Mark the end of the bot's speech.
                    await ws.send_json({"event": "mark", "streamSid": sid,
                                        "mark": {"name": "bot_turn_end"}})

                # Keep a strong reference: the event loop holds only weak
                # refs to tasks, so an unreferenced fire-and-forget task can
                # be garbage-collected mid-flight.
                sessions[stream_sid]["greeting_task"] = asyncio.create_task(
                    send_initial_greeting())

            elif event == 'media':
                if not stream_sid:
                    continue
                # Decode the base64 µ-law audio and add to the buffer.
                chunk_pcm = ulaw_to_pcm16(base64.b64decode(data['media']['payload']))
                pcm_chunks.append(chunk_pcm)
                buffered_samples += len(chunk_pcm)

                # Enough continuous audio buffered: process it, unless the
                # previous turn is still being handled.
                if buffered_samples >= AUDIO_BUFFER_SIZE:
                    if _busy(stream_sid):
                        continue  # let the in-flight task finish first
                    sessions[stream_sid]["processing_task"] = asyncio.create_task(
                        process_user_audio(ws, stream_sid, _drain()))

            elif event == 'mark':
                # A mark usually means the user paused; flush any leftover
                # audio if there is a meaningful amount and nothing in flight.
                if not stream_sid:
                    continue
                if buffered_samples > 1000 and not _busy(stream_sid):
                    sessions[stream_sid]["processing_task"] = asyncio.create_task(
                        process_user_audio(ws, stream_sid, _drain()))

            elif event == 'stop':
                print(f"Stream stopped: {stream_sid}")
                break

    except WebSocketDisconnect:
        print(f"WebSocket disconnected for stream {stream_sid}")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        if stream_sid and stream_sid in sessions:
            # Cancel any in-flight work before dropping the session.
            for key in ("processing_task", "greeting_task"):
                task = sessions[stream_sid].get(key)
                if task:
                    task.cancel()
            del sessions[stream_sid]
            print(f"Session cleaned up for stream {stream_sid}")
114
+
115
class _LLMStream:
    """
    Async-iterator wrapper over an LLM text-chunk generator that also
    captures a terminal (assistant_message, tool_calls) payload.

    Async generators cannot `return` a value (it is a SyntaxError), so a
    producer that wants to hand back a final result must raise
    StopAsyncIteration(payload) itself; we record it here. The previous
    implementation read `e.value`, which StopAsyncIteration does not have.
    """

    def __init__(self, generator):
        self._generator = generator
        self.final_result = None  # set once the stream is exhausted

    def __aiter__(self):
        # Must be a plain (non-async) method returning the iterator object:
        # `async for` rejects an awaitable returned from __aiter__ on
        # modern CPython.
        return self

    async def __anext__(self):
        try:
            return await self._generator.__anext__()
        except StopAsyncIteration as exc:
            self.final_result = exc.args[0] if exc.args else None
            raise


async def _collect_llm_turn(messages):
    """Run one LLM turn; return (text_chunks, assistant_message, tool_calls)."""
    stream = _LLMStream(get_llm_response(messages))
    chunks = []
    async for chunk in stream:
        chunks.append(chunk)
    assistant_message, tool_calls = stream.final_result or (None, None)
    return chunks, assistant_message, tool_calls


async def process_user_audio(ws: WebSocket, stream_sid: str, audio_chunk: np.ndarray):
    """The main logic loop: STT -> LLM -> (Tool/TTS)."""
    print(f"[{stream_sid}] Processing audio chunk of size {len(audio_chunk)}...")

    # 1. Speech-to-Text
    user_text = await transcribe_audio_chunk(audio_chunk)
    if not user_text:
        print(f"[{stream_sid}] No text transcribed.")
        return

    print(f"[{stream_sid}] User said: {user_text}")
    history = sessions[stream_sid]["messages"]
    history.append({"role": "user", "content": user_text})

    # 2. LLM turn. Collect the full text BEFORE starting TTS: the previous
    # version fed TTS a generator over a list the LLM task was still
    # filling concurrently, which raced and usually produced no audio.
    text_chunks, assistant_message, tool_calls = await _collect_llm_turn(history)

    # Record the assistant turn exactly once (the old code appended it a
    # second time whenever tool calls were present).
    if assistant_message:
        history.append(assistant_message)

    await stream_and_send_audio(ws, stream_sid, iter(text_chunks))

    # 3. Execute tool calls, then ask the LLM for a follow-up response
    # that incorporates the tool results.
    if tool_calls:
        for tool_call in tool_calls:
            print(f"[{stream_sid}] Executing tool: {tool_call.function.name}")
            history.append(execute_tool_call(tool_call))

        # 4. Final response after tool execution. The old code both
        # iterated the generator AND `await`ed it — awaiting an async
        # generator is a TypeError; only iteration is valid.
        final_chunks, final_message, _ = await _collect_llm_turn(history)
        if final_message:
            history.append(final_message)

        # Stream the final response audio.
        await stream_and_send_audio(ws, stream_sid, iter(final_chunks))
196
+
197
+
198
async def stream_and_send_audio(ws: WebSocket, stream_sid: str, text_iterator):
    """Stream text to TTS and send the resulting audio back over the WebSocket."""
    async for audio_chunk in text_to_speech_stream(text_iterator):
        if not audio_chunk:
            continue  # skip empty TTS fragments
        await ws.send_json({
            "event": "media",
            "streamSid": stream_sid,
            "media": {"payload": base64.b64encode(audio_chunk).decode('utf-8')},
        })

    # Mark the end of the bot's turn to let Twilio know it can listen for
    # the user again.
    await ws.send_json({"event": "mark", "streamSid": stream_sid,
                        "mark": {"name": "bot_turn_end"}})
    print(f"[{stream_sid}] Finished sending bot's audio turn.")
212
+
213
+
214
if __name__ == "__main__":
    # Development entry point; production deployments should run the app
    # under an external ASGI server instead.
    import uvicorn

    print("Starting RentBot server...")
    uvicorn.run(app, host="0.0.0.0", port=8000)