RP-Azul committed on
Commit 4a6a287 · verified · 1 Parent(s): 7547fe3

Update app.py

Files changed (1)
  1. app.py +14 -8
app.py CHANGED
@@ -9,23 +9,29 @@ import librosa
 pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")
 
 def audio_to_text(audio):
-    # Convert mp3 to wav and resample to 16 kHz
-    audio_data, sample_rate = librosa.load(audio, sr=16000)  # Load and resample to 16 kHz
-
+    # Check if audio is a tuple of (sample_rate, numpy_array)
+    if isinstance(audio, tuple):
+        sample_rate, audio_data = audio  # Unpack sample rate and data
+    else:
+        # If it's a file path, load and resample to 16 kHz directly
+        audio_data, sample_rate = librosa.load(audio, sr=16000)
+
     # Convert to mono if the audio has more than one channel
     if len(audio_data.shape) > 1:
         audio_data = np.mean(audio_data, axis=1)
 
-    # Transcribe the audio input with timestamp support for long-form audio
+    # Convert audio data to a numpy array of float32 type
     audio_array = np.array(audio_data).astype(np.float32)
+
+    # Transcribe the audio input with return_timestamps=True
     transcription = pipe1(audio_array, return_timestamps=True)
 
-    # Extract text with timestamps if available
+    # Check the output structure of the transcription
     if "segments" in transcription:
-        transcription_text = "\n".join(
-            [f"[{segment['start']:.2f}s - {segment['end']:.2f}s]: {segment['text']}" for segment in transcription["segments"]]
-        )
+        # Extract text from each segment if the "segments" key exists
+        transcription_text = " ".join([segment["text"] for segment in transcription["segments"]])
     elif "text" in transcription:
+        # Use the full transcription if it's directly in the "text" field
        transcription_text = transcription["text"]
     else:
         transcription_text = "No transcription available."
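The new isinstance(audio, tuple) branch matches the (sample_rate, numpy_array) format that a Gradio gr.Audio input passes when configured with type="numpy"; whether this Space actually wires the function that way is an assumption, not something shown in the diff. Below is a minimal usage sketch under that assumption. The demo name, the Gradio components, the synthetic silent clip, and the assumption that audio_to_text goes on to return transcription_text (the return statement falls outside this hunk) are all hypothetical, not part of the commit.

import numpy as np
import gradio as gr
from app import audio_to_text  # app.py as updated in this commit

# Direct call with the tuple form the new branch handles:
# one second of silence at 16 kHz (hypothetical test input).
# Assumes audio_to_text returns transcription_text (not shown in the hunk).
print(audio_to_text((16000, np.zeros(16000, dtype=np.float32))))

# With type="numpy", Gradio passes the callback a (sample_rate, numpy_array)
# tuple; with type="filepath" it would pass a path, hitting the librosa branch.
demo = gr.Interface(
    fn=audio_to_text,
    inputs=gr.Audio(type="numpy"),
    outputs=gr.Textbox(label="Transcription"),
)

if __name__ == "__main__":
    demo.launch()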