Update app.py
app.py CHANGED
@@ -9,24 +9,38 @@ import librosa
 pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")
 
 def audio_to_text(audio):
-    # Check if audio is a
+    # Check if audio is a tuple of (sample_rate, numpy_array)
     if isinstance(audio, tuple):
         sample_rate, audio_data = audio # Unpack sample rate and data
     else:
-
+        # If it's a file path, load and resample to 16kHz directly
+        audio_data, sample_rate = librosa.load(audio, sr=16000)
 
     # Convert to mono if the audio has more than one channel
     if len(audio_data.shape) > 1:
         audio_data = np.mean(audio_data, axis=1)
+
+    # Convert audio data to numpy array of float32 type
     audio_array = np.array(audio_data).astype(np.float32)
 
-    # Transcribe the audio input
+    # Transcribe the audio input with return_timestamps=True
     transcription = pipe1(audio_array, return_timestamps=True)
-    transcription_text = " ".join([segment["text"] for segment in transcription["segments"]])
 
+    # Check the output structure of transcription
+    if "segments" in transcription:
+        # Extract text from each segment if "segments" key exists
+        transcription_text = " ".join([segment["text"] for segment in transcription["segments"]])
+    elif "text" in transcription:
+        # Use the full transcription if it's directly in the "text" field
+        transcription_text = transcription["text"]
+    else:
+        transcription_text = "No transcription available."
+
+    # Print and return the transcription text
     print("Transcription:", transcription_text)
     return transcription_text
 
 
+
 demo = gr.Interface(fn=audio_to_text, inputs=gr.Audio(), outputs="text")
 demo.launch(share=True)
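Why the added branching matters: with return_timestamps=True, recent transformers ASR pipelines typically return a dict carrying a "text" field plus a "chunks" list rather than a "segments" key, so in practice the elif "text" branch above is the one that fires. A minimal probe of the output shape, as a sketch rather than part of this commit (sample.wav is a placeholder path, and passing a path requires ffmpeg):

# Sketch only: inspect what the pipeline returns. "sample.wav" is a
# placeholder path, not a file from this repo.
from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base")
result = pipe("sample.wav", return_timestamps=True)

print(result.keys())   # typically dict_keys(['text', 'chunks'])
print(result["text"])  # the full transcription string
for chunk in result.get("chunks", []):
    # each chunk carries a (start, end) timestamp tuple and its text
    print(chunk["timestamp"], chunk["text"])

Joining chunk["text"] over result["chunks"] would be the timestamped equivalent of the old "segments" join. Note also that librosa.load returns mono audio by default, so the mono-conversion check mainly guards the tuple branch, where gr.Audio may hand over multi-channel data.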