audio processing pipeline
src/app.py  +56 -21

src/app.py CHANGED
@@ -15,7 +15,9 @@ import io
 import base64
 import numpy as np
 from transformers.pipelines import pipeline  # Changed from transformers import pipeline
-from transformers import WhisperProcessor
+from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor
+import torchaudio
+import torchaudio.transforms as T
 
 # Model options mapped to their requirements
 MODEL_OPTIONS = {
@@ -40,25 +42,24 @@ MODEL_OPTIONS = {
 }
 
 # Initialize Whisper with proper configuration
+# Create components separately
+feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base.en")
+tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base.en")
+processor = WhisperProcessor(feature_extractor, tokenizer)
+
 transcriber = pipeline(
     "automatic-speech-recognition",
     model="openai/whisper-base.en",
     chunk_length_s=30,
     stride_length_s=5,
-
-    device="cpu",  # Explicitly set to CPU since we're seeing GPU warnings
+    device="cpu",
     torch_dtype=torch.float32,
+    # Remove feature_extractor and tokenizer parameters as they're included in the model
     generate_kwargs={
-        "
-        "language": "en",
-        "use_cache": True,
-        "return_timestamps": True
+        "use_cache": True
     }
 )
 
-# Create processor for proper attention mask
-processor = WhisperProcessor.from_pretrained("openai/whisper-base.en")
-
 def get_system_specs() -> Dict[str, float]:
     """Get system specifications."""
     # Get RAM
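For reference, a minimal sketch of how a transcriber configured this way is called: the transformers ASR pipeline accepts a dict with "raw" (a 1-D float32 NumPy array) and "sampling_rate" keys, the same input format the hunks below rely on. The one-second silent array is a made-up stand-in, not part of this commit.

    import numpy as np

    # Stand-in input: one second of silence at 16 kHz (the rate Whisper expects)
    dummy_audio = np.zeros(16000, dtype=np.float32)

    result = transcriber({"raw": dummy_audio, "sampling_rate": 16000})
    print(result["text"])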
@@ -207,12 +208,23 @@ def process_speech(audio_data, history):
     audio_array = audio_array.astype(np.float32)
     audio_array /= np.max(np.abs(audio_array))
 
+    # Ensure correct sampling rate
+    if sample_rate != 16000:
+        resampler = T.Resample(sample_rate, 16000)
+        audio_tensor = torch.FloatTensor(audio_array)
+        audio_tensor = resampler(audio_tensor)
+        audio_array = audio_tensor.numpy()
+        sample_rate = 16000
+
     # Transcribe with error handling
     try:
-        result = transcriber(
-            {"sampling_rate": sample_rate, "raw": audio_array}
-        )
-
+        # Format dictionary correctly with required keys
+        inputs = {
+            "raw": audio_array,
+            "sampling_rate": sample_rate
+        }
+
+        result = transcriber(inputs)
 
         # Handle different result types
         if isinstance(result, dict) and "text" in result:
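The hard-coded 16000 above is Whisper's training sample rate. As an alternative sketch (not what this commit does), the expected rate can be read off the feature extractor created earlier in this file instead of being hard-coded; sampling_rate is a standard attribute of WhisperFeatureExtractor.

    # Alternative resampling sketch, assuming feature_extractor from this file
    target_rate = feature_extractor.sampling_rate  # 16000 for Whisper checkpoints
    if sample_rate != target_rate:
        audio_tensor = T.Resample(sample_rate, target_rate)(torch.FloatTensor(audio_array))
        audio_array = audio_tensor.numpy()
        sample_rate = target_rate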
@@ -422,6 +434,34 @@ with gr.Blocks(
         queue=True  # Enable queuing for better stream handling
     )
 
+    def process_audio(audio_array, sample_rate):
+        """Pre-process audio for Whisper."""
+        if audio_array.ndim > 1:
+            audio_array = audio_array.mean(axis=1)
+
+        # Convert to tensor for resampling
+        audio_tensor = torch.FloatTensor(audio_array)
+
+        # Resample to 16kHz if needed
+        if sample_rate != 16000:
+            resampler = T.Resample(sample_rate, 16000)
+            audio_tensor = resampler(audio_tensor)
+
+        # Normalize
+        audio_tensor = audio_tensor / torch.max(torch.abs(audio_tensor))
+
+        # Use feature extractor with correct sampling rate
+        features = feature_extractor(
+            audio_tensor.numpy(),
+            sampling_rate=16000,  # Always use 16kHz
+            return_tensors="pt"
+        )
+
+        return {
+            "input_features": features.input_features,
+            "sampling_rate": 16000  # Return resampled rate
+        }
+
     # Update transcription handler
     def update_live_transcription(audio):
         """Real-time transcription updates."""
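A quick smoke test for the process_audio helper added in this hunk; the 440 Hz tone and the 44.1 kHz input rate are invented for illustration.

    import numpy as np

    sr = 44100
    t = np.linspace(0.0, 1.0, sr, dtype=np.float32)
    tone = (0.5 * np.sin(2 * np.pi * 440.0 * t)).astype(np.float32)

    out = process_audio(tone, sr)
    print(out["sampling_rate"])         # 16000 after resampling
    print(out["input_features"].shape)  # log-mel features, e.g. torch.Size([1, 80, 3000])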
@@ -430,14 +470,9 @@ with gr.Blocks(
 
         try:
             sample_rate, audio_array = audio
-
-            audio_array = audio_array.mean(axis=1)
-            audio_array = audio_array.astype(np.float32)
-            audio_array /= np.max(np.abs(audio_array))
+            input_features = process_audio(audio_array, sample_rate)
 
-            result = transcriber(
-                {"sampling_rate": sample_rate, "raw": audio_array}
-            )
+            result = transcriber(input_features)
 
             # Handle different result types
             if isinstance(result, dict):
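For context, the dict handling these hunks feed into typically looks like the sketch below; the "chunks" key appears when the pipeline is asked for timestamps (the "return_timestamps": True option this commit removes from generate_kwargs). This is a hedged reconstruction, not code from this commit.

    if isinstance(result, dict):
        text = result.get("text", "")
        # "chunks" is present when the pipeline returns timestamps
        for chunk in result.get("chunks", []):
            print(chunk["timestamp"], chunk["text"])
    else:
        text = str(result)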