microphone should now transcribe directly to text without needing intermediate audio file processing

src/app.py CHANGED (+44, -69)

@@ -169,62 +169,40 @@ or, if you have enough info, output a final JSON with fields:
 {"diagnoses":[…], "confidences":[…]}.
 """
 
-def process_speech(new_transcript, history):
-    if not new_transcript:
-        return history
-
-    if not isinstance(new_transcript, str):
-        print(f"Warning: Expected string transcript, got {type(new_transcript)}")
-        new_transcript = str(new_transcript)
-
+def process_speech(audio_path, history):
+    """Process speech input and convert to text."""
     try:
-
+        if not audio_path:
+            return []
+
+        # The audio_path now contains the transcribed text directly from Gradio
+        transcript = audio_path
+
+        # Query the symptom index
         diagnosis_query = f"""
-        Given these symptoms: '{new_transcript}'
+        Given these symptoms: '{transcript}'
 
         Identify the most likely ICD-10 diagnoses and key questions to differentiate between them.
         Focus only on symptoms mentioned and their clinical implications.
-
-        Format response as:
-        1. Primary suspected diagnosis: [ICD-10 code] - [description]
-        2. Alternative diagnosis: [ICD-10 code] - [description]
-        3. Key differentiating question
         """
 
         response = symptom_index.as_query_engine().query(diagnosis_query)
 
-        #
-        lines = str(response).strip().split('\n')
-        diagnoses = []
-        follow_up = ""
-
-        for line in lines:
-            if '[' in line and ']' in line:  # Extract ICD-10 codes
-                code = line[line.find('[')+1:line.find(']')]
-                diagnoses.append(code)
-            elif 'Key differentiating question' in line:
-                follow_up = line.split(':')[-1].strip()
-
+        # Format response
         formatted_response = {
-            "diagnoses": diagnoses,
-            "confidences": […],
-            "follow_up": follow_up
+            "diagnoses": [],
+            "confidences": [],
+            "follow_up": str(response)
         }
 
-        history.append({"role": "user", "content": new_transcript})
-        history.append({"role": "assistant", "content": json.dumps(formatted_response, indent=2)})
+        return [
+            {"role": "user", "content": transcript},
+            {"role": "assistant", "content": json.dumps(formatted_response)}
+        ]
 
     except Exception as e:
-        print(f"Error processing speech: {e}")
-        error_response = {
-            "diagnoses": ["Error processing symptoms"],
-            "confidences": [0],
-            "follow_up": "Could you please repeat your symptoms?"
-        }
-        history.append({"role": "user", "content": new_transcript})
-        history.append({"role": "assistant", "content": json.dumps(error_response, indent=2)})
-
-    return history
+        print(f"Error processing speech: {e}")
+        return []
 
 def text_to_speech(text):
     """Convert text to speech and return audio HTML element."""
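
For context on the new control flow: process_speech no longer mutates history in place; it returns a fresh two-message list that the caller merges. A minimal runnable sketch of that contract, with a hypothetical stub standing in for the real LlamaIndex-backed symptom_index (the stub reply and sample transcript are illustrative only, not from the app):

import json

class _StubQueryEngine:
    """Hypothetical stand-in for symptom_index.as_query_engine()."""
    def query(self, prompt):
        return "Primary suspected diagnosis: [R51] - Headache"

class _StubIndex:
    def as_query_engine(self):
        return _StubQueryEngine()

symptom_index = _StubIndex()

def process_speech(audio_path, history):
    """Mirror of the committed function: wrap the transcript and query result as chat messages."""
    try:
        if not audio_path:
            return []
        transcript = audio_path  # already text under the new streaming setup
        response = symptom_index.as_query_engine().query(
            f"Given these symptoms: '{transcript}', identify likely ICD-10 diagnoses."
        )
        formatted_response = {
            "diagnoses": [],
            "confidences": [],
            "follow_up": str(response),
        }
        return [
            {"role": "user", "content": transcript},
            {"role": "assistant", "content": json.dumps(formatted_response)},
        ]
    except Exception as e:
        print(f"Error processing speech: {e}")
        return []

print(process_speech("throbbing headache with nausea", history=[]))

Note the empty diagnoses/confidences lists: the new version returns the raw query response as follow_up and defers any parsing to downstream formatting, instead of scraping ICD-10 codes out of the response text here.
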
@@ -270,10 +248,16 @@ with gr.Blocks() as demo:
         with gr.Column(scale=2):
             # Moved microphone row above chatbot
             with gr.Row():
-                microphone = gr.Audio(source="microphone", type="filepath",
+                microphone = gr.Audio(
+                    source="microphone",
+                    type="text",  # Changed from filepath to text
                     label="Describe your symptoms",
-                    streaming=True
-
+                    streaming=True
+                )
+                transcript_box = gr.Textbox(
+                    label="Transcribed Text",
+                    interactive=False,
+                    show_label=True
                 )
             clear_btn = gr.Button("Clear Chat", variant="secondary")
 
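
One caveat on the component change: stock Gradio documents only type="numpy" and type="filepath" for gr.Audio, so type="text" presumably relies on a custom or patched component that emits the transcript string directly. If a plain filepath ever comes through instead, a small guard in the callback keeps the handler from treating it as symptom text; coerce_transcript below is a hypothetical helper, not part of the commit:

def coerce_transcript(audio_value):
    """Hypothetical guard: return transcript text, or None if the value
    looks like a raw audio artifact that still needs an ASR pass."""
    if isinstance(audio_value, str) and not audio_value.lower().endswith(
        (".wav", ".mp3", ".flac", ".ogg")
    ):
        return audio_value  # already transcribed text
    return None  # raw audio path (or non-string payload): run ASR explicitly

# Example: the handler can bail out early on non-text payloads.
assert coerce_transcript("sharp chest pain") == "sharp chest pain"
assert coerce_transcript("/tmp/audio.wav") is None
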
@@ -308,39 +292,30 @@ with gr.Blocks() as demo:
     clear_btn.click(lambda: None, None, chatbot, queue=False)
 
     def enhanced_process_speech(audio_path, history, api_key=None, model_tier="small", temp=0.7):
-        """…"""
+        """Handle speech processing and chat formatting."""
        if not audio_path:
            return history
 
-        #
-        transcript = process_speech(audio_path, history)
-        if not transcript:
-            return history + [
-                {"role": "user", "content": "Audio recording"},
-                {"role": "assistant", "content": "I couldn't process that audio. Could you try again?"}
-            ]
-
-        try:
-            # Get the last assistant response
-            user_message = transcript[-2]["content"]  # What the user said
-            assistant_json = transcript[-1]["content"]  # JSON response from assistant
+        # Process the new audio input
+        new_messages = process_speech(audio_path, history)
+        if not new_messages:
+            return history
 
-
-
-
+        try:
+            # Format last assistant response
+            assistant_response = new_messages[-1]["content"]
+            response_dict = json.loads(assistant_response)
+            formatted_text = format_response_for_user(response_dict)
 
-            # Add …
+            # Add to history with proper message format
            return history + [
-                {"role": "user", "content": user_message},
-                {"role": "assistant", "content": …}
+                {"role": "user", "content": new_messages[0]["content"]},
+                {"role": "assistant", "content": formatted_text}
            ]
 
        except Exception as e:
-            print(f"Error formatting response: {e}")
-            return history + [
-                {"role": "user", "content": "Error processing audio"},
-                {"role": "assistant", "content": "Sorry, I encountered an error processing your symptoms. Could you try again?"}
-            ]
+            print(f"Error formatting response: {e}")
+            return history
 
    microphone.stream(
        fn=enhanced_process_speech,
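
enhanced_process_speech now calls format_response_for_user, which is defined elsewhere in src/app.py and not shown in this diff. Given the JSON shape process_speech produces ("diagnoses", "confidences", "follow_up"), a plausible sketch of such a formatter (an assumption for illustration, not the actual implementation):

def format_response_for_user(response_dict):
    """Assumed sketch: turn the diagnosis JSON into readable chat text."""
    lines = []
    diagnoses = response_dict.get("diagnoses", [])
    confidences = response_dict.get("confidences", [])
    for code, confidence in zip(diagnoses, confidences):
        lines.append(f"- {code} (confidence: {confidence:.0%})")
    follow_up = response_dict.get("follow_up", "")
    if follow_up:
        lines.append(follow_up)
    return "\n".join(lines) or "Could you describe your symptoms in more detail?"

print(format_response_for_user(
    {"diagnoses": ["R51"], "confidences": [0.7], "follow_up": "Any fever?"}
))
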