Update app.py
Browse files
app.py
CHANGED
|
@@ -7,31 +7,24 @@ import soundfile as sf
|
|
| 7 |
import librosa
|
| 8 |
|
| 9 |
pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")
|
| 10 |
-
#pipe2 = pipeline("summarization", model="facebook/bart-large-cnn")
|
| 11 |
-
#pipe3 = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
|
| 12 |
-
#pipe3.to("cuda" if torch.cuda.is_available() else "cpu")
|
| 13 |
|
| 14 |
def audio_to_text(audio):
|
| 15 |
# Check if audio is a file path or a tuple of (sample_rate, numpy_array)
|
| 16 |
if isinstance(audio, tuple):
|
| 17 |
sample_rate, audio_data = audio # Unpack sample rate and data
|
| 18 |
else:
|
| 19 |
-
# If it's a file path, load the audio
|
| 20 |
audio_data, sample_rate = librosa.load(audio, sr=16000) # Load and resample to 16kHz directly
|
| 21 |
|
| 22 |
# Convert to mono if the audio has more than one channel
|
| 23 |
if len(audio_data.shape) > 1:
|
| 24 |
-
audio_data = np.mean(audio_data, axis=1)
|
|
|
|
| 25 |
|
| 26 |
-
# Resample the audio to 16 kHz if it's not already at 16 kHz
|
| 27 |
-
if sample_rate != 16000:
|
| 28 |
-
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
|
| 29 |
|
| 30 |
-
# Convert to numpy array with float32 data type
|
| 31 |
audio_array = np.array(audio_data).astype(np.float32)
|
| 32 |
|
| 33 |
# Transcribe the audio input
|
| 34 |
-
transcription = pipe1(audio_array)
|
| 35 |
transcription_text = transcription['text']
|
| 36 |
|
| 37 |
# Print and return the transcription text
|
|
|
|
| 7 |
import librosa
|
| 8 |
|
| 9 |
# Speech-to-text pipeline using the Whisper base checkpoint.
# Downstream code feeds it mono float32 numpy audio; Whisper models
# expect 16 kHz input (the file branch loads with sr=16000).
pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
def audio_to_text(audio):
|
| 12 |
# Check if audio is a file path or a tuple of (sample_rate, numpy_array)
|
| 13 |
if isinstance(audio, tuple):
|
| 14 |
sample_rate, audio_data = audio # Unpack sample rate and data
|
| 15 |
else:
|
|
|
|
| 16 |
audio_data, sample_rate = librosa.load(audio, sr=16000) # Load and resample to 16kHz directly
|
| 17 |
|
| 18 |
# Convert to mono if the audio has more than one channel.
# NOTE(review): assumes channel-last layout (samples, channels) as Gradio
# delivers it — axis=1 averages across channels; confirm against caller.
if len(audio_data.shape) > 1:
    audio_data = np.mean(audio_data, axis=1)

# Resample to 16 kHz if needed. This guard was dropped in this change, but
# it is still required: librosa.load(..., sr=16000) only covers the
# file-path branch; tuple inputs (e.g. a Gradio microphone recording)
# arrive at the device's native rate (often 44.1/48 kHz), and Whisper
# expects 16 kHz audio. No-op when the rate is already 16 kHz, so the
# file-path branch is unaffected.
if sample_rate != 16000:
    audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)

# Whisper's feature extractor expects float32 samples.
audio_array = np.array(audio_data).astype(np.float32)
|
| 25 |
|
| 26 |
# Transcribe the audio input. Pass the bare numpy array: for a single
# input the ASR pipeline returns a single dict {'text': ...}. Wrapping
# the array in a list (pipe1([audio_array])) makes the pipeline return a
# list of dicts, and transcription['text'] then raises
# "TypeError: list indices must be integers" — so the scalar call is used.
transcription = pipe1(audio_array)
transcription_text = transcription['text']
|
| 29 |
|
| 30 |
# Print and return the transcription text
|