PatienceIzere committed on
Commit
a6e99c9
·
verified ·
1 Parent(s): 535de7a

Update hf_transcriber.py

Browse files
Files changed (1) hide show
  1. hf_transcriber.py +25 -20
hf_transcriber.py CHANGED
@@ -59,24 +59,26 @@ class HFTranscriber:
59
  except Exception as e:
60
  raise Exception(f"Failed to load model {self.model_name}: {str(e)}")
61
 
62
- def transcribe_audio(self, audio_path: str) -> Tuple[List[int], int]:
63
  """
64
- Transcribe audio file to notes using the loaded Hugging Face model.
65
 
66
  Args:
67
- audio_path (str): Path to the audio file
 
68
 
69
  Returns:
70
- tuple: (notes, sample_rate) where notes is a list of MIDI note numbers
71
  """
72
  try:
73
- # Load and preprocess audio
74
- waveform, sample_rate = self._load_audio(audio_path)
75
-
 
76
  if self.is_speecht5:
77
  # Process the audio input for SpeechT5
78
  inputs = self.processor(
79
- audio=waveform,
80
  sampling_rate=sample_rate,
81
  return_tensors="pt"
82
  ).to(self.device)
@@ -91,10 +93,12 @@ class HFTranscriber:
91
 
92
  # Decode the generated ids to text
93
  transcription = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
 
94
  else:
95
- # Process the audio input for wav2vec2
96
  inputs = self.processor(
97
- waveform,
98
  sampling_rate=sample_rate,
99
  return_tensors="pt",
100
  padding=True
@@ -105,18 +109,19 @@ class HFTranscriber:
105
  logits = self.model(inputs).logits
106
 
107
  # Get predicted token ids
108
- predicted_ids = torch.argmax(logits, dim=-1)
 
 
 
 
 
 
 
 
 
109
 
110
- # Decode the predicted ids to text
111
- transcription = self.processor.batch_decode(predicted_ids)[0]
112
-
113
- # Convert text to MIDI notes (simplified example)
114
- notes = self._text_to_midi_notes(transcription)
115
-
116
- return notes, sample_rate
117
-
118
  except Exception as e:
119
- raise Exception(f"Transcription failed: {str(e)}")
120
 
121
  def _text_to_midi_notes(self, text: str) -> List[int]:
122
  """Convert transcribed text to MIDI notes (simplified example)."""
 
59
  except Exception as e:
60
  raise Exception(f"Failed to load model {self.model_name}: {str(e)}")
61
 
62
+ def transcribe_audio(self, audio_array: np.ndarray, sample_rate: int) -> Dict[str, Any]:
63
  """
64
+ Transcribe audio data to text using the loaded Hugging Face model.
65
 
66
  Args:
67
+ audio_array (np.ndarray): Audio data as a numpy array
68
+ sample_rate (int): Sample rate of the audio data
69
 
70
  Returns:
71
+ dict: Dictionary containing 'text' and optionally 'word_timestamps'
72
  """
73
  try:
74
+ # Convert to mono if needed
75
+ if len(audio_array.shape) > 1:
76
+ audio_array = librosa.to_mono(audio_array)
77
+
78
  if self.is_speecht5:
79
  # Process the audio input for SpeechT5
80
  inputs = self.processor(
81
+ audio=audio_array,
82
  sampling_rate=sample_rate,
83
  return_tensors="pt"
84
  ).to(self.device)
 
93
 
94
  # Decode the generated ids to text
95
  transcription = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
96
+ return {'text': transcription}
97
+
98
  else:
99
+ # Process the audio input for wav2vec2/whisper
100
  inputs = self.processor(
101
+ audio_array,
102
  sampling_rate=sample_rate,
103
  return_tensors="pt",
104
  padding=True
 
109
  logits = self.model(inputs).logits
110
 
111
  # Get predicted token ids
112
+ pred_ids = torch.argmax(logits, dim=-1)
113
+
114
+ # Convert to text
115
+ transcription = self.processor.batch_decode(pred_ids)[0]
116
+
117
+ # Return the transcription text
118
+ return {
119
+ 'text': transcription,
120
+ 'word_timestamps': [] # Word-level timestamps not available in this basic implementation
121
+ }
122
 
 
 
 
 
 
 
 
 
123
  except Exception as e:
124
+ raise Exception(f"Error during transcription: {str(e)}")
125
 
126
  def _text_to_midi_notes(self, text: str) -> List[int]:
127
  """Convert transcribed text to MIDI notes (simplified example)."""