Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,7 +7,7 @@ from openai import OpenAI
|
|
| 7 |
from websockets import connect, Data, ClientConnection
|
| 8 |
from dotenv import load_dotenv
|
| 9 |
|
| 10 |
-
#
|
| 11 |
load_dotenv()
|
| 12 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
| 13 |
ASSISTANT_ID = os.getenv("ASSISTANT_ID")
|
|
@@ -17,7 +17,7 @@ HEADERS = {"Authorization": f"Bearer {OPENAI_API_KEY}", "OpenAI-Beta": "realtime
|
|
| 17 |
WS_URI = "wss://api.openai.com/v1/realtime?intent=transcription"
|
| 18 |
connections = {}
|
| 19 |
|
| 20 |
-
#
|
| 21 |
class WebSocketClient:
|
| 22 |
def __init__(self, uri, headers, client_id):
|
| 23 |
self.uri, self.headers, self.client_id = uri, headers, client_id
|
|
@@ -45,7 +45,10 @@ class WebSocketClient:
|
|
| 45 |
buf = io.BytesIO(); sf.write(buf, int16, sr, format='WAV', subtype='PCM_16')
|
| 46 |
audio = AudioSegment.from_file(buf, format="wav").set_frame_rate(24000)
|
| 47 |
out = io.BytesIO(); audio.export(out, format="wav"); out.seek(0)
|
| 48 |
-
await self.websocket.send(json.dumps({
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
async def receive_messages(self):
|
| 51 |
async for msg in self.websocket:
|
|
@@ -74,7 +77,7 @@ def clear_transcript(cid):
|
|
| 74 |
if cid in connections: connections[cid].transcript = ""
|
| 75 |
return ""
|
| 76 |
|
| 77 |
-
#
|
| 78 |
def handle_chat(user_input, history, thread_id, image_url):
|
| 79 |
if not OPENAI_API_KEY or not ASSISTANT_ID:
|
| 80 |
return "❌ Missing secrets!", history, thread_id, image_url
|
|
@@ -97,7 +100,10 @@ def handle_chat(user_input, history, thread_id, image_url):
|
|
| 97 |
if msg.role == "assistant":
|
| 98 |
content = msg.content[0].text.value
|
| 99 |
history.append((user_input, content))
|
| 100 |
-
match = re.search(
|
|
|
|
|
|
|
|
|
|
| 101 |
if match: image_url = match.group(0)
|
| 102 |
break
|
| 103 |
|
|
@@ -106,7 +112,7 @@ def handle_chat(user_input, history, thread_id, image_url):
|
|
| 106 |
except Exception as e:
|
| 107 |
return f"❌ {e}", history, thread_id, image_url
|
| 108 |
|
| 109 |
-
#
|
| 110 |
with gr.Blocks(theme=gr.themes.Soft()) as app:
|
| 111 |
gr.Markdown("# 📄 Document AI Assistant")
|
| 112 |
|
|
@@ -115,31 +121,37 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
|
|
| 115 |
thread_state = gr.State()
|
| 116 |
image_state = gr.State()
|
| 117 |
client_id = gr.State()
|
|
|
|
| 118 |
|
| 119 |
-
with gr.Row():
|
| 120 |
with gr.Column(scale=1):
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
inputs=[user_prompt, chat_state, thread_state, image_state],
|
| 138 |
outputs=[user_prompt, chat, thread_state, image_state])
|
| 139 |
-
|
| 140 |
image_state.change(fn=lambda x: x, inputs=image_state, outputs=image_display)
|
| 141 |
voice_input.stream(fn=send_audio, inputs=[voice_input, client_id], outputs=voice_transcript, stream_every=0.5)
|
| 142 |
clear_btn.click(fn=clear_transcript, inputs=[client_id], outputs=voice_transcript)
|
| 143 |
-
app.load(create_ws, outputs=[client_id])
|
| 144 |
|
| 145 |
app.launch()
|
|
|
|
| 7 |
from websockets import connect, Data, ClientConnection
|
| 8 |
from dotenv import load_dotenv
|
| 9 |
|
| 10 |
+
# ============ Load Secrets ============
|
| 11 |
load_dotenv()
|
| 12 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
| 13 |
ASSISTANT_ID = os.getenv("ASSISTANT_ID")
|
|
|
|
| 17 |
WS_URI = "wss://api.openai.com/v1/realtime?intent=transcription"
|
| 18 |
connections = {}
|
| 19 |
|
| 20 |
+
# ============ WebSocket Client ============
|
| 21 |
class WebSocketClient:
|
| 22 |
def __init__(self, uri, headers, client_id):
|
| 23 |
self.uri, self.headers, self.client_id = uri, headers, client_id
|
|
|
|
| 45 |
buf = io.BytesIO(); sf.write(buf, int16, sr, format='WAV', subtype='PCM_16')
|
| 46 |
audio = AudioSegment.from_file(buf, format="wav").set_frame_rate(24000)
|
| 47 |
out = io.BytesIO(); audio.export(out, format="wav"); out.seek(0)
|
| 48 |
+
await self.websocket.send(json.dumps({
|
| 49 |
+
"type": "input_audio_buffer.append",
|
| 50 |
+
"audio": base64.b64encode(out.read()).decode()
|
| 51 |
+
}))
|
| 52 |
|
| 53 |
async def receive_messages(self):
|
| 54 |
async for msg in self.websocket:
|
|
|
|
| 77 |
if cid in connections: connections[cid].transcript = ""
|
| 78 |
return ""
|
| 79 |
|
| 80 |
+
# ============ Chat Assistant ============
|
| 81 |
def handle_chat(user_input, history, thread_id, image_url):
|
| 82 |
if not OPENAI_API_KEY or not ASSISTANT_ID:
|
| 83 |
return "❌ Missing secrets!", history, thread_id, image_url
|
|
|
|
| 100 |
if msg.role == "assistant":
|
| 101 |
content = msg.content[0].text.value
|
| 102 |
history.append((user_input, content))
|
| 103 |
+
match = re.search(
|
| 104 |
+
r'https://raw\.githubusercontent\.com/AndrewLORTech/surgical-pathology-manual/main/[\w\-/]*\.png',
|
| 105 |
+
content
|
| 106 |
+
)
|
| 107 |
if match: image_url = match.group(0)
|
| 108 |
break
|
| 109 |
|
|
|
|
| 112 |
except Exception as e:
|
| 113 |
return f"❌ {e}", history, thread_id, image_url
|
| 114 |
|
| 115 |
+
# ============ Gradio UI ============
|
| 116 |
with gr.Blocks(theme=gr.themes.Soft()) as app:
|
| 117 |
gr.Markdown("# 📄 Document AI Assistant")
|
| 118 |
|
|
|
|
| 121 |
thread_state = gr.State()
|
| 122 |
image_state = gr.State()
|
| 123 |
client_id = gr.State()
|
| 124 |
+
voice_enabled = gr.State(False)
|
| 125 |
|
| 126 |
+
with gr.Row(equal_height=True):
|
| 127 |
with gr.Column(scale=1):
|
| 128 |
+
image_display = gr.Image(label="🖼️ Document", type="filepath", show_download_button=False)
|
| 129 |
+
|
| 130 |
+
with gr.Column(scale=1.4):
|
| 131 |
+
chat = gr.Chatbot(label="💬 Chat", height=460)
|
| 132 |
+
|
| 133 |
+
with gr.Row():
|
| 134 |
+
user_prompt = gr.Textbox(placeholder="Ask your question...", show_label=False, scale=6)
|
| 135 |
+
mic_toggle_btn = gr.Button("🎙️", scale=1)
|
| 136 |
+
send_btn = gr.Button("Send", variant="primary", scale=2)
|
| 137 |
+
|
| 138 |
+
with gr.Accordion("🎤 Voice Transcription", open=False) as voice_section:
|
| 139 |
+
with gr.Row():
|
| 140 |
+
voice_input = gr.Audio(label="Mic", streaming=True)
|
| 141 |
+
voice_transcript = gr.Textbox(label="Transcript", lines=2, interactive=False)
|
| 142 |
+
clear_btn = gr.Button("🧹 Clear Transcript")
|
| 143 |
+
|
| 144 |
+
# FUNCTIONAL CONNECTIONS
|
| 145 |
+
def toggle_voice(curr):
|
| 146 |
+
return not curr, gr.update(visible=not curr)
|
| 147 |
+
|
| 148 |
+
mic_toggle_btn.click(fn=toggle_voice, inputs=voice_enabled, outputs=[voice_enabled, voice_section])
|
| 149 |
+
send_btn.click(fn=handle_chat,
|
| 150 |
inputs=[user_prompt, chat_state, thread_state, image_state],
|
| 151 |
outputs=[user_prompt, chat, thread_state, image_state])
|
|
|
|
| 152 |
image_state.change(fn=lambda x: x, inputs=image_state, outputs=image_display)
|
| 153 |
voice_input.stream(fn=send_audio, inputs=[voice_input, client_id], outputs=voice_transcript, stream_every=0.5)
|
| 154 |
clear_btn.click(fn=clear_transcript, inputs=[client_id], outputs=voice_transcript)
|
| 155 |
+
app.load(fn=create_ws, outputs=[client_id])
|
| 156 |
|
| 157 |
app.launch()
|