attention mask is not set

src/app.py CHANGED (+83 -29)
@@ -41,12 +41,11 @@ MODEL_OPTIONS = {
 }
 }
 
-# Initialize Whisper
-# Create components separately
+# Initialize Whisper components
 feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base.en")
 tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base.en")
-processor = WhisperProcessor(feature_extractor, tokenizer)
 
+# Configure transcription pipeline with only necessary components
 transcriber = pipeline(
     "automatic-speech-recognition",
     model="openai/whisper-base.en",
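Note on the removed WhisperProcessor: the transformers pipeline factory loads its own feature extractor and tokenizer from the model id, so the manually constructed processor was redundant. A minimal standalone sketch (not part of app.py) illustrating that the components are already reachable on the pipeline object:

# Minimal sketch, not part of app.py: pipeline() builds the Whisper
# components that the removed WhisperProcessor wrapped by hand.
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
print(type(asr.feature_extractor).__name__)  # WhisperFeatureExtractor
print(type(asr.tokenizer).__name__)          # WhisperTokenizer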
@@ -54,9 +53,48 @@ transcriber = pipeline(
     stride_length_s=5,
     device="cpu",
     torch_dtype=torch.float32,
-    # Remove feature_extractor and tokenizer parameters as they're included in the model
     generate_kwargs={
-        "use_cache": True
+        "use_cache": True,
+        "return_timestamps": True
+    }
+)
+
+# Audio preprocessing function
+def prepare_audio_features(audio_array, sample_rate):
+    """Prepare audio features with proper format."""
+    # Convert stereo to mono
+    if audio_array.ndim > 1:
+        audio_array = audio_array.mean(axis=1)
+    audio_array = audio_array.astype(np.float32)
+
+    # Normalize audio
+    audio_array /= np.max(np.abs(audio_array))
+
+    # Resample to 16kHz if needed
+    if sample_rate != 16000:
+        resampler = T.Resample(orig_freq=sample_rate, new_freq=16000)
+        audio_tensor = torch.FloatTensor(audio_array)
+        audio_tensor = resampler(audio_tensor)
+        audio_array = audio_tensor.numpy()
+
+    # Return proper dictionary format for pipeline
+    return {
+        "raw": audio_array,
+        "sampling_rate": 16000
+    }
+
+# Update transcriber configuration
+transcriber = pipeline(
+    "automatic-speech-recognition",
+    model="openai/whisper-base.en",
+    chunk_length_s=30,
+    stride_length_s=5,
+    device="cpu",
+    torch_dtype=torch.float32,
+    feature_extractor=feature_extractor,
+    generate_kwargs={
+        "use_cache": True,
+        "return_timestamps": True
     }
 )
 
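Two observations on this hunk. First, prepare_audio_features returns the {"raw", "sampling_rate"} dict that the ASR pipeline accepts directly. Second, the hunk leaves two transcriber = pipeline(...) assignments in the module; the second binding (with chunk_length_s=30 and the explicit feature_extractor) overwrites the first at import time, so only that configuration is live. A small usage sketch for the helper, assuming np, torch, and T (torchaudio.transforms) are imported in app.py as the diff implies; the synthetic signal is illustrative only:

import numpy as np

# Synthetic 1-second stereo clip at 44.1 kHz (illustrative input only)
fake_audio = np.random.uniform(-1.0, 1.0, size=(44100, 2))

inputs = prepare_audio_features(fake_audio, sample_rate=44100)
assert inputs["sampling_rate"] == 16000   # always declared as 16 kHz
assert inputs["raw"].ndim == 1            # stereo collapsed to mono
print(inputs["raw"].shape)                # ~16000 samples after resampling

# The dict feeds the pipeline directly:
# result = transcriber(inputs)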
@@ -298,14 +336,23 @@ with gr.Blocks(
     and patients understand potential diagnoses based on described symptoms.
 
     ### How it works:
-    1. Click the record button and describe your symptoms
+    1. Either click the record button and describe your symptoms or type them into the textbox
     2. The AI will analyze your description and suggest possible diagnoses
     3. Answer follow-up questions to refine the diagnosis
     """)
 
     with gr.Row():
         with gr.Column(scale=2):
-            #
+            # Add text input above microphone
+            with gr.Row():
+                text_input = gr.Textbox(
+                    label="Type your symptoms",
+                    placeholder="Or type your symptoms here...",
+                    lines=3
+                )
+                submit_btn = gr.Button("Submit", variant="primary")
+
+            # Existing microphone row
             with gr.Row():
                 microphone = gr.Audio(
                     sources=["microphone"],
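The new text path adds text_input and submit_btn, but their event wiring sits outside this diff. A hypothetical sketch of what the handler could look like; process_text_input and chatbot are assumed names for illustration, not code from app.py:

# Hypothetical wiring sketch; handler and component names are assumptions.
def process_text_input(text, history):
    """Append the typed symptoms to the chat history."""
    if not text or not text.strip():
        return history
    return history + [(text.strip(), None)]

# Assumed wiring, placed alongside the app's other event handlers:
# submit_btn.click(
#     fn=process_text_input,
#     inputs=[text_input, chatbot],
#     outputs=chatbot,
# )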
@@ -371,7 +418,6 @@ with gr.Blocks(
             return history
 
         try:
-            # Process audio stream
             if isinstance(audio_path, tuple) and len(audio_path) == 2:
                 sample_rate, audio_array = audio_path
 
@@ -381,26 +427,33 @@ with gr.Blocks(
                 audio_array = audio_array.astype(np.float32)
                 audio_array /= np.max(np.abs(audio_array))
 
+                # Ensure correct sampling rate
+                if sample_rate != 16000:
+                    resampler = T.Resample(
+                        orig_freq=sample_rate,
+                        new_freq=16000
+                    )
+                    audio_tensor = torch.FloatTensor(audio_array)
+                    audio_tensor = resampler(audio_tensor)
+                    audio_array = audio_tensor.numpy()
+                    sample_rate = 16000
+
+                # Format input dictionary exactly as required
+                transcriber_input = {
+                    "raw": audio_array,
+                    "sampling_rate": sample_rate
+                }
+
                 # Get transcription from Whisper
-                result = transcriber(
-                    {"sampling_rate": sample_rate, "raw": audio_array},
-                    batch_size=8,
-                    return_timestamps=True
-                )
+                result = transcriber(transcriber_input)
 
-                #
+                # Extract text from result
                 transcript = ""
                 if isinstance(result, dict):
-                    transcript = result.get("text", "")
+                    transcript = result.get("text", "").strip()
                 elif isinstance(result, str):
-                    transcript = result
-                elif isinstance(result, (list, tuple)) and len(result) > 0:
-                    transcript = str(result[0])
-                else:
-                    print(f"Unexpected transcriber result type: {type(result)}")
-                    return history
-
-                transcript = transcript.strip()
+                    transcript = result.strip()
+
                 if not transcript:
                     return history
 
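One caveat on the normalization this hunk keeps: audio_array /= np.max(np.abs(audio_array)) produces NaNs when the buffer is silent (all zeros). A defensive variant, offered as a suggestion rather than something in the diff:

import numpy as np

def safe_normalize(audio_array: np.ndarray) -> np.ndarray:
    """Scale peak amplitude to 1.0; leave silent audio untouched."""
    peak = np.max(np.abs(audio_array))
    return audio_array / peak if peak > 0.0 else audio_array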
@@ -470,19 +523,20 @@ with gr.Blocks(
 
         try:
             sample_rate, audio_array = audio
-            input_features = process_audio(audio_array, sample_rate)
 
-            result = transcriber(input_features)
+            # Process audio and get proper format
+            inputs = prepare_audio_features(audio_array, sample_rate)
+
+            # Pass to transcriber
+            result = transcriber(inputs)
 
-            #
+            # Extract text from result
             if isinstance(result, dict):
                 return result.get("text", "").strip()
             elif isinstance(result, str):
                 return result.strip()
-            elif isinstance(result, (list, tuple)) and len(result) > 0:
-                return str(result[0]).strip()
             return ""
 
         except Exception as e:
             print(f"Transcription error: {str(e)}")
             return ""
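End-to-end smoke test sketch for the new path (prepare_audio_features feeding the transcriber). The WAV path and the soundfile dependency are assumptions for illustration; neither appears in the diff:

import soundfile as sf

# Assumed local test clip; replace with any mono or stereo WAV.
audio_array, sample_rate = sf.read("sample.wav")

inputs = prepare_audio_features(audio_array, sample_rate)
result = transcriber(inputs)

text = result.get("text", "").strip() if isinstance(result, dict) else str(result).strip()
print(text or "<no speech detected>")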