Update app.py
app.py CHANGED
@@ -9,24 +9,38 @@ import librosa
 pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")
 
 def audio_to_text(audio):
-    # Check if audio is a
+    # Check if audio is a tuple of (sample_rate, numpy_array)
     if isinstance(audio, tuple):
         sample_rate, audio_data = audio # Unpack sample rate and data
     else:
-
+        # If it's a file path, load and resample to 16kHz directly
+        audio_data, sample_rate = librosa.load(audio, sr=16000)
 
     # Convert to mono if the audio has more than one channel
     if len(audio_data.shape) > 1:
         audio_data = np.mean(audio_data, axis=1)
+
+    # Convert audio data to numpy array of float32 type
     audio_array = np.array(audio_data).astype(np.float32)
 
-    # Transcribe the audio input
+    # Transcribe the audio input with return_timestamps=True
     transcription = pipe1(audio_array, return_timestamps=True)
-    transcription_text = " ".join([segment["text"] for segment in transcription["segments"]])
 
+    # Check the output structure of transcription
+    if "segments" in transcription:
+        # Extract text from each segment if "segments" key exists
+        transcription_text = " ".join([segment["text"] for segment in transcription["segments"]])
+    elif "text" in transcription:
+        # Use the full transcription if it's directly in the "text" field
+        transcription_text = transcription["text"]
+    else:
+        transcription_text = "No transcription available."
+
+    # Print and return the transcription text
     print("Transcription:", transcription_text)
     return transcription_text
 
 
+
 demo = gr.Interface(fn=audio_to_text, inputs=gr.Audio(), outputs="text")
 demo.launch(share=True)
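Why the added branching matters: with return_timestamps=True, recent transformers ASR pipelines typically return a dict carrying a "text" field plus a "chunks" list rather than a "segments" key, so in practice the elif "text" branch above is the one that fires. A minimal probe of the output shape, as a sketch rather than part of this commit (sample.wav is a placeholder path, and passing a path requires ffmpeg):

# Sketch only: inspect what the pipeline returns. "sample.wav" is a
# placeholder path, not a file from this repo.
from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base")
result = pipe("sample.wav", return_timestamps=True)

print(result.keys())   # typically dict_keys(['text', 'chunks'])
print(result["text"])  # the full transcription string
for chunk in result.get("chunks", []):
    # each chunk carries a (start, end) timestamp tuple and its text
    print(chunk["timestamp"], chunk["text"])

Joining chunk["text"] over result["chunks"] would be the timestamped equivalent of the old "segments" join. Note also that librosa.load returns mono audio by default, so the mono-conversion check mainly guards the tuple branch, where gr.Audio may hand over multi-channel data.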