Spaces:

mgbam
/

rentbot

Runtime error

App Files Files Community

mgbam committed on Jul 18, 2025

Commit

cd82c0d

verified ·

1 Parent(s): 78b49d0

Update app.py

Browse files

Files changed (1) hide show

app.py +217 -68

app.py CHANGED Viewed

@@ -1,68 +1,217 @@
-import gradio as gr
-from utils import init_openai, transcribe_audio, chat_with_gpt
-# 1. Initialize OpenAI key
-init_openai()
-# 2. System prompt
-SYSTEM_PROMPT = (
-    "You are RentBot, a friendly virtual leasing assistant. "
-    "Answer inquiries about rental listings, schedule showings, and "
-    "provide clear instructions to potential tenants."
-)
-def handle_interaction(audio) -> tuple[str, str, str]:
-    """
-    Saves audio, transcribes with Whisper, chats with GPT, and returns
-    (transcript, reply, status).
-    """
-    audio_path = "input.wav"
-    audio.save(audio_path)
-    transcript = transcribe_audio(audio_path)
-    reply = chat_with_gpt(SYSTEM_PROMPT, transcript)
-    return transcript, reply, "✅ Completed"
-# 3. Custom CSS for card styling
-custom_css = """
-#input-panel, #output-panel {
-  background: white;
-  border-radius: 12px;
-  padding: 1rem;
-  box-shadow: 0 2px 8px rgba(0,0,0,0.1);
-}
-.status {
-  margin-top: 0.5rem;
-  color: #555;
-  text-align: center;
-}
-#ask-btn {
-  width: 100%;
-  margin-top: 0.5rem;
-}
-"""
-# 4. Build the Blocks app without gr.Box
-with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## 🏠 RentBot 24/7")  # Title
-    with gr.Row():
-        # Input section
-        with gr.Column(elem_id="input-panel"):
-            gr.Markdown("### 🎙️ Ask RentBot")
-            audio_input = gr.Audio(sources="microphone", type="filepath", label="")
-            ask_btn = gr.Button("Tap to Speak & Ask", variant="primary", elem_id="ask-btn")
-            status_txt = gr.Textbox(label="Status", value="Ready to record", interactive=False, elem_classes="status")
-        # Output section
-        with gr.Column(elem_id="output-panel"):
-            gr.Markdown("### 💬 Conversation")
-            transcript_out = gr.Textbox(label="Transcribed Text", interactive=False, placeholder="Transcript appears here…")
-            reply_out = gr.Textbox(label="RentBot Reply", interactive=False, placeholder="Reply appears here…")
-    # Wire up the button click event
-    ask_btn.click(fn=handle_interaction,
-                  inputs=audio_input,
-                  outputs=[transcript_out, reply_out, status_txt])
-# 5. Launch
-demo.launch(server_name="0.0.0.0", server_port=7860, share=True)

+# rentbot/app.py
+import os
+import base64
+import json
+import asyncio
+import numpy as np
+from fastapi import FastAPI, WebSocket, WebSocketDisconnect
+from dotenv import load_dotenv
+from audio_utils import ulaw_to_pcm16
+from stt_handler import transcribe_audio_chunk
+from llm_handler import get_llm_response
+from tts_handler import text_to_speech_stream
+from tool_handler import execute_tool_call
+# Pull settings from a local .env file into the process environment before
+# anything below reads them.
+load_dotenv()
+app = FastAPI()
+# --- Audio configuration ---
+# Sample rate (Hz) of Twilio media streams.
+AUDIO_RATE = 8000
+# Length, in seconds, of the audio window that triggers processing.
+SILENCE_THRESHOLD_SECONDS = 0.7
+# Number of buffered samples at which the endpoint hands audio off for
+# processing. Despite the constant's name, this is a fixed-size window,
+# not a silence detector.
+AUDIO_BUFFER_SIZE = int(AUDIO_RATE * SILENCE_THRESHOLD_SECONDS)
+# --- Session state ---
+# Maps a stream SID to {"messages": [...], "processing_task": Task | None}.
+# Kept in process memory for demonstration only; a production deployment
+# should use Redis or another persistent store.
+sessions = {}
+# Fallback used when the SYSTEM_PROMPT environment variable is unset, so the
+# system message never carries a None content.
+_DEFAULT_SYSTEM_PROMPT = (
+    "You are RentBot, a friendly virtual leasing assistant. "
+    "Answer inquiries about rental listings, schedule showings, and "
+    "provide clear instructions to potential tenants."
+)
+# Minimum number of leftover samples worth processing when a 'mark' arrives.
+_MIN_FLUSH_SAMPLES = 1000
+def _is_busy(session):
+    """Return True while the session's previous processing task is still running."""
+    task = session["processing_task"]
+    return task is not None and not task.done()
+def _log_task_failure(task):
+    """Done-callback that reports an exception raised inside a background task.
+    Without this, a failure in a fire-and-forget task is only reported by
+    asyncio as "Task exception was never retrieved" when the task is
+    garbage-collected.
+    """
+    if task.cancelled():
+        return
+    exc = task.exception()
+    if exc is not None:
+        print(f"Background task failed: {exc!r}")
+@app.websocket("/rentbot")
+async def websocket_endpoint(ws: WebSocket):
+    """Handle one media-stream WebSocket connection.
+    Reads JSON messages from the socket and dispatches on their 'event' field:
+    - 'start': creates the session, records the stream SID and speaks a greeting.
+    - 'media': decodes base64 mu-law audio into the PCM buffer and, once
+      AUDIO_BUFFER_SIZE samples are buffered and no processing task is running,
+      hands the buffer to process_user_audio in a background task.
+    - 'mark': flushes leftover audio (more than _MIN_FLUSH_SAMPLES samples)
+      under the same "not busy" condition.
+    - 'stop': ends the loop.
+    Messages with any other (or no) 'event' are ignored. On exit, outstanding
+    background tasks are cancelled and the session is removed.
+    """
+    await ws.accept()
+    stream_sid = None
+    greeting_task = None
+    audio_buffer = np.array([], dtype=np.int16)
+    def start_processing():
+        """Hand the buffered audio to a background task and reset the buffer."""
+        nonlocal audio_buffer
+        task = asyncio.create_task(process_user_audio(ws, stream_sid, audio_buffer))
+        task.add_done_callback(_log_task_failure)
+        sessions[stream_sid]["processing_task"] = task
+        audio_buffer = np.array([], dtype=np.int16)
+    try:
+        # Audio can only be sent once a stream SID is known, so everything
+        # (including the greeting) waits for the first 'start' message.
+        async for message in ws.iter_text():
+            data = json.loads(message)
+            event = data.get('event')
+            if event == 'start':
+                stream_sid = data['start']['streamSid']
+                system_prompt = os.getenv("SYSTEM_PROMPT") or _DEFAULT_SYSTEM_PROMPT
+                sessions[stream_sid] = {
+                    "messages": [{"role": "system", "content": system_prompt}],
+                    "processing_task": None
+                }
+                print(f"New stream started: {stream_sid}")
+                initial_greeting = "Hi! I'm RentBot, your leasing assistant. How can I help you today?"
+                sessions[stream_sid]["messages"].append({"role": "assistant", "content": initial_greeting})
+                # Keep a reference to the task: the event loop only holds weak
+                # references, and the task must be cancellable on disconnect.
+                greeting_task = asyncio.create_task(
+                    stream_and_send_audio(ws, stream_sid, iter([initial_greeting]))
+                )
+                greeting_task.add_done_callback(_log_task_failure)
+            elif event == 'media':
+                if not stream_sid:
+                    continue
+                # Decode the base64 mu-law audio and add it to the buffer.
+                chunk_ulaw = base64.b64decode(data['media']['payload'])
+                chunk_pcm = ulaw_to_pcm16(chunk_ulaw)
+                audio_buffer = np.append(audio_buffer, chunk_pcm)
+                # While a previous task is running the buffer keeps growing and
+                # is handed off with the first chunk that arrives after it ends.
+                if len(audio_buffer) >= AUDIO_BUFFER_SIZE and not _is_busy(sessions[stream_sid]):
+                    start_processing()
+            elif event == 'mark':
+                # NOTE(review): Twilio documents inbound 'mark' messages as
+                # playback-complete notifications for marks this server sent,
+                # not as a signal that the caller paused. Confirm that flushing
+                # leftover audio here is the intended heuristic.
+                if not stream_sid:
+                    continue
+                if len(audio_buffer) > _MIN_FLUSH_SAMPLES and not _is_busy(sessions[stream_sid]):
+                    start_processing()
+            elif event == 'stop':
+                print(f"Stream stopped: {stream_sid}")
+                break
+    except WebSocketDisconnect:
+        print(f"WebSocket disconnected for stream {stream_sid}")
+    except Exception as e:
+        # Top-level boundary for this connection: report and fall through to cleanup.
+        print(f"An error occurred: {e!r}")
+    finally:
+        if greeting_task is not None:
+            greeting_task.cancel()
+        session = sessions.pop(stream_sid, None)
+        if session and session["processing_task"]:
+            session["processing_task"].cancel()
+        print(f"Session cleaned up for stream {stream_sid}")
+async def process_user_audio(ws: WebSocket, stream_sid: str, audio_chunk: np.ndarray):
+    """Run one conversational turn: STT -> LLM -> optional tool calls -> TTS.
+    Args:
+        ws: WebSocket the synthesized audio is sent back on.
+        stream_sid: Key of the session in ``sessions`` and the stream ID used
+            in outgoing events.
+        audio_chunk: Buffered caller audio to transcribe.
+    The user's text, the assistant's reply, any tool results and the follow-up
+    reply are appended to the session's message history. Returns early when the
+    session no longer exists or when nothing was transcribed.
+    """
+    session = sessions.get(stream_sid)
+    if session is None:
+        # The stream was stopped and cleaned up before this task got to run.
+        print(f"[{stream_sid}] Session no longer exists; dropping audio chunk.")
+        return
+    messages = session["messages"]
+    print(f"[{stream_sid}] Processing audio chunk of size {len(audio_chunk)}...")
+    # 1. Speech-to-Text
+    user_text = await transcribe_audio_chunk(audio_chunk)
+    if not user_text:
+        print(f"[{stream_sid}] No text transcribed.")
+        return
+    print(f"[{stream_sid}] User said: {user_text}")
+    messages.append({"role": "user", "content": user_text})
+    # 2. LLM reply. The text is collected in full before TTS starts: the TTS
+    # helper is given a plain iterator everywhere else in this file, and an
+    # iterator over a list that is still being filled would end immediately.
+    text_chunks, assistant_message, tool_calls = await _collect_llm_response(
+        get_llm_response(messages)
+    )
+    # Record the assistant turn exactly once, whether it carries text, a tool
+    # request, or both.
+    if assistant_message and (assistant_message.get("content") or tool_calls):
+        messages.append(assistant_message)
+    await stream_and_send_audio(ws, stream_sid, iter(text_chunks))
+    if not tool_calls:
+        return
+    # 3. Execute the requested tools and record their results.
+    for tool_call in tool_calls:
+        print(f"[{stream_sid}] Executing tool: {tool_call.function.name}")
+        # NOTE(review): execute_tool_call is called synchronously; if it does
+        # blocking I/O it will stall the event loop.
+        messages.append(execute_tool_call(tool_call))
+    # 4. Ask the LLM for a final reply that takes the tool results into account.
+    final_text_chunks, final_assistant_message, _ = await _collect_llm_response(
+        get_llm_response(messages)
+    )
+    if final_assistant_message:
+        messages.append(final_assistant_message)
+    await stream_and_send_audio(ws, stream_sid, iter(final_text_chunks))
+async def _collect_llm_response(response_generator):
+    """Drain an LLM response stream into (text_chunks, assistant_message, tool_calls).
+    An async generator cannot return a value and StopAsyncIteration carries
+    none, so the final message cannot be read off the end of the iteration.
+    NOTE(review): this assumes get_llm_response yields ``str`` text chunks and
+    may yield one ``(assistant_message, tool_calls)`` tuple as its summary --
+    confirm against llm_handler. If no such tuple is seen, an assistant message
+    is built from the collected text so the history is still updated.
+    """
+    text_chunks = []
+    assistant_message = None
+    tool_calls = None
+    async for item in response_generator:
+        if isinstance(item, str):
+            text_chunks.append(item)
+        elif isinstance(item, tuple) and len(item) == 2:
+            assistant_message, tool_calls = item
+    if assistant_message is None and text_chunks:
+        assistant_message = {"role": "assistant", "content": "".join(text_chunks)}
+    return text_chunks, assistant_message, tool_calls
+async def stream_and_send_audio(ws: WebSocket, stream_sid: str, text_iterator):
+    """Synthesize speech for the given text and forward it over the WebSocket.
+    Each non-empty audio chunk produced by text_to_speech_stream is
+    base64-encoded and sent as a 'media' event for ``stream_sid``. After the
+    last chunk, a 'mark' event named "bot_turn_end" is sent to flag the end of
+    the bot's turn.
+    """
+    audio_stream = text_to_speech_stream(text_iterator)
+    async for audio_chunk in audio_stream:
+        if not audio_chunk:
+            # Skip empty chunks rather than sending an empty payload.
+            continue
+        encoded_audio = base64.b64encode(audio_chunk).decode('utf-8')
+        media_event = {
+            "event": "media",
+            "streamSid": stream_sid,
+            "media": {"payload": encoded_audio}
+        }
+        await ws.send_json(media_event)
+    # Trailing marker that identifies the end of this bot turn.
+    end_of_turn_event = {"event": "mark", "streamSid": stream_sid, "mark": {"name": "bot_turn_end"}}
+    await ws.send_json(end_of_turn_event)
+    print(f"[{stream_sid}] Finished sending bot's audio turn.")
+if __name__ == "__main__":
+    # Script entry point: uvicorn is imported here so it is only needed when
+    # the module is run directly rather than imported.
+    import uvicorn
+    print("Starting RentBot server...")
+    # Listen on all interfaces, port 8000.
+    uvicorn.run(app, port=8000, host="0.0.0.0")