Update app.py
Browse files
app.py
CHANGED
|
@@ -7,31 +7,24 @@ import soundfile as sf
|
|
| 7 |
import librosa
|
| 8 |
|
| 9 |
pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")
|
| 10 |
-
#pipe2 = pipeline("summarization", model="facebook/bart-large-cnn")
|
| 11 |
-
#pipe3 = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
|
| 12 |
-
#pipe3.to("cuda" if torch.cuda.is_available() else "cpu")
|
| 13 |
|
| 14 |
def audio_to_text(audio):
|
| 15 |
# Check if audio is a file path or a tuple of (sample_rate, numpy_array)
|
| 16 |
if isinstance(audio, tuple):
|
| 17 |
sample_rate, audio_data = audio # Unpack sample rate and data
|
| 18 |
else:
|
| 19 |
-
# If it's a file path, load the audio
|
| 20 |
audio_data, sample_rate = librosa.load(audio, sr=16000) # Load and resample to 16kHz directly
|
| 21 |
|
| 22 |
# Convert to mono if the audio has more than one channel
|
| 23 |
if len(audio_data.shape) > 1:
|
| 24 |
-
audio_data = np.mean(audio_data, axis=1)
|
|
|
|
| 25 |
|
| 26 |
-
# Resample the audio to 16 kHz if it's not already at 16 kHz
|
| 27 |
-
if sample_rate != 16000:
|
| 28 |
-
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
|
| 29 |
|
| 30 |
-
# Convert to numpy array with float32 data type
|
| 31 |
audio_array = np.array(audio_data).astype(np.float32)
|
| 32 |
|
| 33 |
# Transcribe the audio input
|
| 34 |
-
transcription = pipe1(audio_array)
|
| 35 |
transcription_text = transcription['text']
|
| 36 |
|
| 37 |
# Print and return the transcription text
|
|
|
|
| 7 |
import librosa
|
| 8 |
|
| 9 |
# Speech-to-text pipeline using the Whisper base checkpoint.
# Downstream code feeds it mono float32 numpy audio; Whisper models
# expect 16 kHz input (the file branch loads with sr=16000).
pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
def audio_to_text(audio):
|
| 12 |
# Check if audio is a file path or a tuple of (sample_rate, numpy_array)
|
| 13 |
if isinstance(audio, tuple):
|
| 14 |
sample_rate, audio_data = audio # Unpack sample rate and data
|
| 15 |
else:
|
|
|
|
| 16 |
audio_data, sample_rate = librosa.load(audio, sr=16000) # Load and resample to 16kHz directly
|
| 17 |
|
| 18 |
# Convert to mono if the audio has more than one channel.
# NOTE(review): assumes channel-last layout (samples, channels) as Gradio
# delivers it — axis=1 averages across channels; confirm against caller.
if len(audio_data.shape) > 1:
    audio_data = np.mean(audio_data, axis=1)

# Resample to 16 kHz if needed. This guard was dropped in this change, but
# it is still required: librosa.load(..., sr=16000) only covers the
# file-path branch; tuple inputs (e.g. a Gradio microphone recording)
# arrive at the device's native rate (often 44.1/48 kHz), and Whisper
# expects 16 kHz audio. No-op when the rate is already 16 kHz, so the
# file-path branch is unaffected.
if sample_rate != 16000:
    audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)

# Whisper's feature extractor expects float32 samples.
audio_array = np.array(audio_data).astype(np.float32)
|
| 25 |
|
| 26 |
# Transcribe the audio input. Pass the bare numpy array: for a single
# input the ASR pipeline returns a single dict {'text': ...}. Wrapping
# the array in a list (pipe1([audio_array])) makes the pipeline return a
# list of dicts, and transcription['text'] then raises
# "TypeError: list indices must be integers" — so the scalar call is used.
transcription = pipe1(audio_array)
transcription_text = transcription['text']
|
| 29 |
|
| 30 |
# Print and return the transcription text
|