Spaces:

Agents-MCP-Hackathon
/

MedCodeMCP

Sleeping

gpaasch committed on Jun 8, 2025

Commit

00bcf43

1 Parent(s): 95321db

1. Removed unsafe `.get()` calls
2. Added proper type checking
3. Better handling of different result formats
4. More robust error handling
5. Cleaner string handling

Files changed (2) hide show

requirements.txt +5 -1
src/app.py +174 -55

requirements.txt CHANGED Viewed

@@ -1,6 +1,9 @@
 # Core dependencies
 gradio[full]>=5.33.0
-gradio[mcp]>=5.33.0
 # LLM and embeddings
 llama-index>=0.9.0
@@ -10,6 +13,7 @@ sentence-transformers>=2.2.0
 # Audio processing
 ffmpeg-python
 # System utilities
 psutil

 # Core dependencies
 gradio[full]>=5.33.0
+transformers>=4.37.0
+torch>=2.2.0
+torchaudio>=2.2.0
+numpy>=1.24.0
 # LLM and embeddings
 llama-index>=0.9.0
 # Audio processing
 ffmpeg-python
+librosa>=0.10.1
 # System utilities
 psutil

src/app.py CHANGED Viewed

@@ -13,6 +13,9 @@ import torch
 from gtts import gTTS
 import io
 import base64
 # Model options mapped to their requirements
 MODEL_OPTIONS = {
@@ -36,6 +39,26 @@ MODEL_OPTIONS = {
     }
 }
 def get_system_specs() -> Dict[str, float]:
     """Get system specifications."""
     # Get RAM
@@ -169,35 +192,68 @@ or, if you have enough info, output a final JSON with fields:
 {"diagnoses":[…], "confidences":[…]}.
 """
-def process_speech(audio_path, history):
     """Process speech input and convert to text."""
     try:
-        if not audio_path:
             return []
-        # Extract just the transcribed text if it's a tuple
-        transcript = audio_path[1] if isinstance(audio_path, tuple) else audio_path
-        # Query the symptom index
-        diagnosis_query = f"""
-        Given these symptoms: '{transcript}'
-        Identify the most likely ICD-10 diagnoses and key questions.
-        Focus on clinical implications.
-        """
-        response = symptom_index.as_query_engine().query(diagnosis_query)
-        return [
-            {"role": "user", "content": transcript},
-            {"role": "assistant", "content": json.dumps({
-                "diagnoses": [],
-                "confidences": [],
-                "follow_up": str(response)
-            })}
-        ]
     except Exception as e:
-        print(f"Error processing speech: {e}")
         return []
 def update_transcription(audio_path):
@@ -240,8 +296,10 @@ with gr.Blocks(
             # Moved microphone row above chatbot
             with gr.Row():
                 microphone = gr.Audio(
-                    label="Describe your symptoms",
-                    streaming=True
                 )
                 transcript_box = gr.Textbox(
                     label="Transcribed Text",
@@ -296,49 +354,110 @@ with gr.Blocks(
         return result.strip()
     def enhanced_process_speech(audio_path, history, api_key=None, model_tier="small", temp=0.7):
-        """Handle speech processing and chat formatting."""
         if not audio_path:
             return history
-        # Process the new audio input
-        new_messages = process_speech(audio_path, history)
-        if not new_messages:
-            return history
         try:
-            # Format last assistant response
-            assistant_response = new_messages[-1]["content"]
-            response_dict = json.loads(assistant_response)
-            formatted_text = format_response_for_user(response_dict)
-            # Add to history with proper message format
-            return history + [
-                {"role": "user", "content": new_messages[0]["content"]},
-                {"role": "assistant", "content": formatted_text}
-            ]
         except Exception as e:
-            print(f"Error formatting response: {e}")
             return history
     microphone.stream(
         fn=enhanced_process_speech,
-        inputs=[
-            microphone,
-            chatbot,
-            api_key,
-            model_selector,
-            temperature
-        ],
         outputs=chatbot,
-        show_progress="hidden"
     )
-    microphone.stream(  # Add real-time transcription updates
-        fn=update_transcription,
         inputs=[microphone],
         outputs=transcript_box,
-        show_progress="hidden"
     )
     clear_btn.click(

 from gtts import gTTS
 import io
 import base64
+import numpy as np
+from transformers.pipelines import pipeline  # Changed from transformers import pipeline
+from transformers import WhisperProcessor
 # Model options mapped to their requirements
 MODEL_OPTIONS = {
     }
 }
# Initialize the Whisper speech-recognition pipeline once at import time.
# Long inputs are processed in 30 s chunks with a 5 s stride.
transcriber = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base.en",
    chunk_length_s=30,
    stride_length_s=5,
    return_timestamps=True,
    device="cpu",  # Explicitly set to CPU since we're seeing GPU warnings
    torch_dtype=torch.float32,
    # NOTE: `task` / `language` are intentionally not passed. The checkpoint
    # is English-only (".en"), and recent transformers releases raise
    # ValueError ("Cannot specify `task` or `language` for an English-only
    # model") when they are supplied. `return_timestamps` is already set on
    # the pipeline above, so it is not repeated here.
    generate_kwargs={"use_cache": True},
)
# Create processor for proper attention mask
processor = WhisperProcessor.from_pretrained("openai/whisper-base.en")
 def get_system_specs() -> Dict[str, float]:
     """Get system specifications."""
     # Get RAM
 {"diagnoses":[…], "confidences":[…]}.
 """
def _normalize_audio(audio_array):
    """Return *audio_array* as mono float32 samples scaled by their peak.

    Returns ``None`` when there is nothing worth transcribing: an empty
    buffer, pure silence (peak of 0) or a non-finite peak. Dividing by a
    zero peak would otherwise fill the buffer with NaNs.
    """
    samples = np.asarray(audio_array)
    if samples.ndim > 1:
        # Average across axis 1 to collapse multi-channel input to mono.
        samples = samples.mean(axis=1)
    samples = samples.astype(np.float32)
    if samples.size == 0:
        return None
    peak = float(np.max(np.abs(samples)))
    if peak == 0.0 or not np.isfinite(peak):
        return None
    return samples / peak


def _extract_transcript(result):
    """Return the stripped transcript held in *result*, or ``None``.

    Accepts a dict carrying a string under ``"text"``, a bare string, or a
    non-empty list/tuple whose first element is one of those.
    """
    if isinstance(result, (list, tuple)) and result:
        result = result[0]
    if isinstance(result, dict):
        result = result.get("text")
    return result.strip() if isinstance(result, str) else None


def process_speech(audio_data, history):
    """Transcribe one audio clip and query the symptom index with the text.

    Args:
        audio_data: ``(sample_rate, samples)`` tuple. Any other non-empty
            value is reported as an invalid format.
        history: Not read by this function; kept so the signature is stable.

    Returns:
        ``[user_message, assistant_message]`` on success, where the assistant
        content is a JSON string with ``diagnoses``, ``confidences`` and
        ``follow_up`` keys. An empty list when there is no usable audio, no
        transcript, or any step fails (failures are printed, not raised).
    """
    try:
        if not audio_data:
            return []
        if not (isinstance(audio_data, tuple) and len(audio_data) == 2):
            print(f"Invalid audio format: {type(audio_data)}")
            return []
        sample_rate, audio_array = audio_data
        samples = _normalize_audio(audio_array)
    except Exception as e:
        print(f"Processing error: {e}")
        return []

    if samples is None:
        # Silent or empty clip: skip the model call entirely.
        return []

    try:
        result = transcriber(
            {"sampling_rate": sample_rate, "raw": samples},
            batch_size=8,
        )
        transcript = _extract_transcript(result)
        if transcript is None:
            print(f"Unexpected transcriber result type: {type(result)}")
            return []
        if not transcript:
            print("No transcription generated")
            return []

        # The literal's inner whitespace is part of the prompt text and is
        # kept exactly as it was, hence the deeper indentation.
        diagnosis_query = f"""
                Given these symptoms: '{transcript}'
                Identify the most likely ICD-10 diagnoses and key questions.
                Focus on clinical implications.
                """
        response = symptom_index.as_query_engine().query(diagnosis_query)
        return [
            {"role": "user", "content": transcript},
            {"role": "assistant", "content": json.dumps({
                "diagnoses": [],
                "confidences": [],
                "follow_up": str(response)
            })}
        ]
    except Exception as e:
        print(f"Transcription error: {e}")
        return []
 def update_transcription(audio_path):
             # Moved microphone row above chatbot
             with gr.Row():
                 microphone = gr.Audio(
+                    sources=["microphone"],
+                    streaming=True,
+                    type="numpy",
+                    label="Describe your symptoms"
                 )
                 transcript_box = gr.Textbox(
                     label="Transcribed Text",
         return result.strip()
     def enhanced_process_speech(audio_path, history, api_key=None, model_tier="small", temp=0.7):
         """Transcribe a streamed audio chunk and append the exchange to the chat.

         Args:
             audio_path: ``(sample_rate, samples)`` tuple (despite the name,
                 not a file path).
             history: Current chat messages; returned unchanged whenever the
                 chunk cannot be turned into a transcript.
             api_key, model_tier, temp: Accepted so the event wiring can pass
                 them, but not read by this handler.

         Returns:
             ``history`` plus a user message (the transcript) and an assistant
             message, or ``history`` itself on any early exit or error.
         """
         # One guard for None, empty and wrongly-shaped input. A truthy
         # non-tuple value must yield ``history``, never an implicit None.
         if not isinstance(audio_path, tuple) or len(audio_path) != 2:
             return history
         try:
             sample_rate, audio_array = audio_path

             # Audio preprocessing: mono, float32, peak-normalized.
             samples = np.asarray(audio_array)
             if samples.ndim > 1:
                 samples = samples.mean(axis=1)
             samples = samples.astype(np.float32)
             peak = float(np.max(np.abs(samples))) if samples.size else 0.0
             if peak == 0.0 or not np.isfinite(peak):
                 # Silent or empty chunk: dividing by the peak would produce
                 # NaNs, and there is nothing to transcribe anyway.
                 return history
             samples = samples / peak

             # Get transcription from Whisper
             result = transcriber(
                 {"sampling_rate": sample_rate, "raw": samples},
                 batch_size=8,
                 return_timestamps=True,
             )

             # A list/tuple result is unwrapped to its first element so that a
             # list of dicts yields the text rather than a dict repr.
             first = result[0] if isinstance(result, (list, tuple)) and result else result
             if isinstance(first, dict):
                 transcript = first.get("text") or ""
             elif isinstance(first, str):
                 transcript = first
             else:
                 print(f"Unexpected transcriber result type: {type(result)}")
                 return history
             transcript = str(transcript).strip()
             if not transcript:
                 return history

             # Process the symptoms. The literal's inner whitespace is part of
             # the prompt text and is kept at its original relative depth.
             diagnosis_query = f"""
                 Based on these symptoms: '{transcript}'
                 Provide relevant ICD-10 codes and diagnostic questions.
                 """
             response = symptom_index.as_query_engine().query(diagnosis_query)

             # Format and return chat messages
             return history + [
                 {"role": "user", "content": transcript},
                 {"role": "assistant", "content": format_response_for_user({
                     "diagnoses": [],
                     "confidences": [],
                     "follow_up": str(response)
                 })}
             ]
         except Exception as e:
             print(f"Streaming error: {e}")
             return history
     # Chat handler: each streamed microphone chunk may append to the chatbot.
     microphone.stream(
         fn=enhanced_process_speech,
         inputs=[microphone, chatbot, api_key, model_selector, temperature],
         outputs=chatbot,
         show_progress="hidden",
         api_name=False,
         queue=True  # Enable queuing for better stream handling
     )

     # Update transcription handler
     def update_live_transcription(audio):
         """Return the transcript of one streamed chunk for the text box.

         Args:
             audio: ``(sample_rate, samples)`` tuple.

         Returns:
             The stripped transcript, or ``""`` when the input is missing or
             malformed, the chunk is silent or empty, the transcriber result
             has no usable text, or an error occurs (errors are printed).
         """
         if not isinstance(audio, tuple) or len(audio) != 2:
             return ""
         try:
             sample_rate, audio_array = audio
             samples = np.asarray(audio_array)
             if samples.ndim > 1:
                 samples = samples.mean(axis=1)
             samples = samples.astype(np.float32)
             peak = float(np.max(np.abs(samples))) if samples.size else 0.0
             if peak == 0.0 or not np.isfinite(peak):
                 # Silent or empty chunk: normalizing would divide by zero.
                 return ""
             samples = samples / peak

             result = transcriber(
                 {"sampling_rate": sample_rate, "raw": samples}
             )

             # A list/tuple result is unwrapped to its first element so that a
             # list of dicts yields the text rather than a dict repr.
             first = result[0] if isinstance(result, (list, tuple)) and result else result
             if isinstance(first, dict):
                 return str(first.get("text") or "").strip()
             if isinstance(first, str):
                 return first.strip()
             return ""
         except Exception as e:
             print(f"Transcription error: {e}")
             return ""

     # NOTE(review): this handler and the chat handler above each call the
     # transcriber on every chunk; consider sharing one transcription.
     microphone.stream(
         fn=update_live_transcription,
         inputs=[microphone],
         outputs=transcript_box,
         show_progress="hidden",
         queue=True
     )
     clear_btn.click(