Spaces:

IAMTFRMZA
/

documentaitestv3

Sleeping

App Files Files Community

IAMTFRMZA committed on Apr 17, 2025

Commit

d13b654

verified ·

1 Parent(s): f310bae

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -74

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import gradio as gr
 import os, time, re, json, base64, asyncio, threading, uuid, io
 import numpy as np
@@ -17,6 +18,7 @@ HEADERS = {"Authorization": f"Bearer {OPENAI_API_KEY}", "OpenAI-Beta": "realtime
 WS_URI = "wss://api.openai.com/v1/realtime?intent=transcription"
 connections = {}
 class WebSocketClient:
     def __init__(self, uri, headers, client_id):
         self.uri = uri
@@ -69,6 +71,7 @@ class WebSocketClient:
             if data["type"] == "conversation.item.input_audio_transcription.delta":
                 self.transcript += data["delta"]
 def create_ws():
     cid = str(uuid.uuid4())
     client = WebSocketClient(WS_URI, HEADERS, cid)
@@ -80,16 +83,15 @@ def send_audio(chunk, cid):
     if not cid or cid not in connections:
         return "Connecting..."
     sr, arr = chunk
-    if len(connections[cid].transcript) > 1000:
-        connections[cid].transcript = ""
     connections[cid].enqueue_audio_chunk(sr, arr)
-    return connections[cid].transcript.strip()
 def clear_transcript(cid):
     if cid in connections:
         connections[cid].transcript = ""
     return ""
 def handle_chat(user_input, history, thread_id, image_url):
     if not OPENAI_API_KEY or not ASSISTANT_ID:
         return "❌ Missing secrets!", history, thread_id, image_url
@@ -111,8 +113,7 @@ def handle_chat(user_input, history, thread_id, image_url):
         for msg in reversed(msgs.data):
             if msg.role == "assistant":
                 content = msg.content[0].text.value
-                history.append({"role": "user", "content": user_input})
-                history.append({"role": "assistant", "content": content})
                 match = re.search(
                     r'https://raw\.githubusercontent\.com/AndrewLORTech/surgical-pathology-manual/main/[\w\-/]*\.png',
                     content
@@ -125,98 +126,45 @@ def handle_chat(user_input, history, thread_id, image_url):
     except Exception as e:
         return f"❌ {e}", history, thread_id, image_url
-def send_transcript_to_assistant(transcript, history, thread_id, image_url, cid):
-    if not transcript.strip():
-        return gr.update(), history, thread_id, image_url
-    if cid in connections:
-        connections[cid].transcript = ""
-    return handle_chat(transcript, history, thread_id, image_url)
-def clear_chat_and_transcript(client_id):
-    if client_id in connections:
-        connections[client_id].transcript = ""
-    return [], "", None, None
-# Fix image viewer fallback
-def update_image_display(image_url):
-    if image_url and isinstance(image_url, str) and image_url.startswith("http"):
-        return image_url
-    return None
-# UI
 with gr.Blocks(theme=gr.themes.Soft()) as app:
     gr.Markdown("# 📄 Document AI Assistant")
-    gr.HTML("""
-    <style>
-    #ask-btn, #clear-chat-btn, #record-audio button {
-        font-size: 16px !important;
-        padding: 12px 28px !important;
-        border-radius: 6px;
-        margin-top: 10px;
-        background-color: #4b5563 !important;
-        color: white !important;
-        border: 1px solid #9ca3af !important;
-    }
-    #ask-btn:hover, #clear-chat-btn:hover, #record-audio button:hover {
-        background-color: #6b7280 !important;
-        color: #fff !important;
-    }
-    button {
-        margin-right: 8px;
-    }
-    #record-audio button svg {
-        margin-right: 6px;
-    }
-    #record-audio label {
-        display: none;
-    }
-    </style>
-    """)
     chat_state = gr.State([])
     thread_state = gr.State()
     image_state = gr.State()
     client_id = gr.State()
     with gr.Row(equal_height=True):
         with gr.Column(scale=1):
             image_display = gr.Image(label="🖼️ Document", type="filepath", show_download_button=False)
-        with gr.Column(scale=2):
-            chat = gr.Chatbot(label="💬 Chat", height=460, type="messages")
             with gr.Row():
                 user_prompt = gr.Textbox(placeholder="Ask your question...", show_label=False, scale=6)
                 send_btn = gr.Button("Send", variant="primary", scale=2)
             with gr.Accordion("🎤 Voice Transcription", open=False) as voice_section:
-                gr.Markdown("**🎙️ Tap below to record your voice**")
-                voice_input = gr.Audio(label="", streaming=True, elem_id="record-audio")
-                voice_transcript = gr.Textbox(label="Transcript", lines=2, interactive=False)
                 with gr.Row():
-                    ask_btn = gr.Button("🟢 Ask", elem_id="ask-btn")
-                    clear_chat_btn = gr.Button("🧹 Clear Chat", elem_id="clear-chat-btn")
     # Functional bindings
     send_btn.click(fn=handle_chat,
                    inputs=[user_prompt, chat_state, thread_state, image_state],
                    outputs=[user_prompt, chat, thread_state, image_state])
-    image_state.change(fn=update_image_display, inputs=image_state, outputs=image_display)
     voice_input.stream(fn=send_audio, inputs=[voice_input, client_id], outputs=voice_transcript, stream_every=0.5)
-    ask_btn.click(fn=send_transcript_to_assistant,
-                  inputs=[voice_transcript, chat_state, thread_state, image_state, client_id],
-                  outputs=[user_prompt, chat, thread_state, image_state])
-    clear_chat_btn.click(fn=clear_chat_and_transcript,
-                         inputs=[client_id],
-                         outputs=[chat, voice_transcript, thread_state, image_state])
     app.load(fn=create_ws, outputs=[client_id])
-app.launch()

+# top of the file
 import gradio as gr
 import os, time, re, json, base64, asyncio, threading, uuid, io
 import numpy as np
 WS_URI = "wss://api.openai.com/v1/realtime?intent=transcription"
 connections = {}
+# WebSocket Client
 class WebSocketClient:
     def __init__(self, uri, headers, client_id):
         self.uri = uri
             if data["type"] == "conversation.item.input_audio_transcription.delta":
                 self.transcript += data["delta"]
+# Real-time transcription connection manager
 def create_ws():
     cid = str(uuid.uuid4())
     client = WebSocketClient(WS_URI, HEADERS, cid)
     if not cid or cid not in connections:
         return "Connecting..."
     sr, arr = chunk
     connections[cid].enqueue_audio_chunk(sr, arr)
+    return connections[cid].transcript
 def clear_transcript(cid):
     if cid in connections:
         connections[cid].transcript = ""
     return ""
+# ============ Chat Assistant ============
 def handle_chat(user_input, history, thread_id, image_url):
     if not OPENAI_API_KEY or not ASSISTANT_ID:
         return "❌ Missing secrets!", history, thread_id, image_url
         for msg in reversed(msgs.data):
             if msg.role == "assistant":
                 content = msg.content[0].text.value
+                history.append((user_input, content))
                 match = re.search(
                     r'https://raw\.githubusercontent\.com/AndrewLORTech/surgical-pathology-manual/main/[\w\-/]*\.png',
                     content
     except Exception as e:
         return f"❌ {e}", history, thread_id, image_url
+# ============ Gradio UI ============
 with gr.Blocks(theme=gr.themes.Soft()) as app:
     gr.Markdown("# 📄 Document AI Assistant")
     chat_state = gr.State([])
     thread_state = gr.State()
     image_state = gr.State()
     client_id = gr.State()
+    voice_enabled = gr.State(False)
     with gr.Row(equal_height=True):
         with gr.Column(scale=1):
             image_display = gr.Image(label="🖼️ Document", type="filepath", show_download_button=False)
+        with gr.Column(scale=1.4):
+            chat = gr.Chatbot(label="💬 Chat", height=460)
             with gr.Row():
                 user_prompt = gr.Textbox(placeholder="Ask your question...", show_label=False, scale=6)
+                mic_toggle_btn = gr.Button("🎙️", scale=1)
                 send_btn = gr.Button("Send", variant="primary", scale=2)
             with gr.Accordion("🎤 Voice Transcription", open=False) as voice_section:
                 with gr.Row():
+                    voice_input = gr.Audio(label="Mic", streaming=True)
+                    voice_transcript = gr.Textbox(label="Transcript", lines=2, interactive=False)
+                clear_btn = gr.Button("🧹 Clear Transcript")
     # Functional bindings
+    def toggle_voice(curr):
+        return not curr, gr.update(visible=not curr)
+    mic_toggle_btn.click(fn=toggle_voice, inputs=voice_enabled, outputs=[voice_enabled, voice_section])
     send_btn.click(fn=handle_chat,
                    inputs=[user_prompt, chat_state, thread_state, image_state],
                    outputs=[user_prompt, chat, thread_state, image_state])
+    image_state.change(fn=lambda x: x, inputs=image_state, outputs=image_display)
     voice_input.stream(fn=send_audio, inputs=[voice_input, client_id], outputs=voice_transcript, stream_every=0.5)
+    clear_btn.click(fn=clear_transcript, inputs=[client_id], outputs=voice_transcript)
     app.load(fn=create_ws, outputs=[client_id])
+app.launch()