RP-Azul committed on
Commit 4a6a287 · verified · 1 Parent(s): 7547fe3

Update app.py

Files changed (1)
  1. app.py +14 -8
app.py CHANGED
@@ -9,23 +9,29 @@ import librosa
 pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")
 
 def audio_to_text(audio):
-    # Convert mp3 to wav and resample to 16 kHz
-    audio_data, sample_rate = librosa.load(audio, sr=16000)  # Load and resample to 16 kHz
-
+    # Check if audio is a tuple of (sample_rate, numpy_array)
+    if isinstance(audio, tuple):
+        sample_rate, audio_data = audio  # Unpack sample rate and data
+    else:
+        # If it's a file path, load and resample to 16 kHz directly
+        audio_data, sample_rate = librosa.load(audio, sr=16000)
+
     # Convert to mono if the audio has more than one channel
     if len(audio_data.shape) > 1:
         audio_data = np.mean(audio_data, axis=1)
 
-    # Transcribe the audio input with timestamp support for long-form audio
+    # Convert audio data to a numpy array of float32 type
     audio_array = np.array(audio_data).astype(np.float32)
+
+    # Transcribe the audio input with return_timestamps=True
     transcription = pipe1(audio_array, return_timestamps=True)
 
-    # Extract text with timestamps if available
+    # Check the output structure of the transcription
     if "segments" in transcription:
-        transcription_text = "\n".join(
-            [f"[{segment['start']:.2f}s - {segment['end']:.2f}s]: {segment['text']}" for segment in transcription["segments"]]
-        )
+        # Extract text from each segment if the "segments" key exists
+        transcription_text = " ".join([segment["text"] for segment in transcription["segments"]])
     elif "text" in transcription:
+        # Use the full transcription if it's directly in the "text" field
        transcription_text = transcription["text"]
     else:
         transcription_text = "No transcription available."
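The new isinstance(audio, tuple) branch matches the (sample_rate, numpy_array) format that a Gradio gr.Audio input passes when configured with type="numpy"; whether this Space actually wires the function that way is an assumption, not something shown in the diff. Below is a minimal usage sketch under that assumption. The demo name, the Gradio components, the synthetic silent clip, and the assumption that audio_to_text goes on to return transcription_text (the return statement falls outside this hunk) are all hypothetical, not part of the commit.

import numpy as np
import gradio as gr
from app import audio_to_text  # app.py as updated in this commit

# Direct call with the tuple form the new branch handles:
# one second of silence at 16 kHz (hypothetical test input).
# Assumes audio_to_text returns transcription_text (not shown in the hunk).
print(audio_to_text((16000, np.zeros(16000, dtype=np.float32))))

# With type="numpy", Gradio passes the callback a (sample_rate, numpy_array)
# tuple; with type="filepath" it would pass a path, hitting the librosa branch.
demo = gr.Interface(
    fn=audio_to_text,
    inputs=gr.Audio(type="numpy"),
    outputs=gr.Textbox(label="Transcription"),
)

if __name__ == "__main__":
    demo.launch()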