RP-Azul committed on
Commit
1e150f3
·
verified ·
1 Parent(s): bdad405

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -10
app.py CHANGED
@@ -7,31 +7,24 @@ import soundfile as sf
7
  import librosa
8
 
9
  pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")
10
- #pipe2 = pipeline("summarization", model="facebook/bart-large-cnn")
11
- #pipe3 = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
12
- #pipe3.to("cuda" if torch.cuda.is_available() else "cpu")
13
 
14
  def audio_to_text(audio):
15
  # Check if audio is a file path or a tuple of (sample_rate, numpy_array)
16
  if isinstance(audio, tuple):
17
  sample_rate, audio_data = audio # Unpack sample rate and data
18
  else:
19
- # If it's a file path, load the audio
20
  audio_data, sample_rate = librosa.load(audio, sr=16000) # Load and resample to 16kHz directly
21
 
22
  # Convert to mono if the audio has more than one channel
23
  if len(audio_data.shape) > 1:
24
- audio_data = np.mean(audio_data, axis=1) # Averaging channels to convert to mono
 
25
 
26
- # Resample the audio to 16 kHz if it's not already at 16 kHz
27
- if sample_rate != 16000:
28
- audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
29
 
30
- # Convert to numpy array with float32 data type
31
  audio_array = np.array(audio_data).astype(np.float32)
32
 
33
  # Transcribe the audio input
34
- transcription = pipe1(audio_array)
35
  transcription_text = transcription['text']
36
 
37
  # Print and return the transcription text
 
7
  import librosa
8
 
9
  pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")
 
 
 
10
 
11
  def audio_to_text(audio):
12
  # Check if audio is a file path or a tuple of (sample_rate, numpy_array)
13
  if isinstance(audio, tuple):
14
  sample_rate, audio_data = audio # Unpack sample rate and data
15
  else:
 
16
  audio_data, sample_rate = librosa.load(audio, sr=16000) # Load and resample to 16kHz directly
17
 
18
  # Convert to mono if the audio has more than one channel
19
  if len(audio_data.shape) > 1:
20
+ audio_data = np.mean(audio_data, axis=1)
21
+
22
 
 
 
 
23
 
 
24
  audio_array = np.array(audio_data).astype(np.float32)
25
 
26
  # Transcribe the audio input
27
+ transcription = pipe1([audio_array])
28
  transcription_text = transcription['text']
29
 
30
  # Print and return the transcription text