RP-Azul committed
Commit 5bed895 · verified · 1 parent: c8b8068

Update app.py

Files changed (1)
  1. app.py +18 -4
app.py CHANGED
@@ -9,24 +9,38 @@ import librosa
 pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")
 
 def audio_to_text(audio):
-    # Check if audio is a file path or a tuple of (sample_rate, numpy_array)
+    # Check if audio is a tuple of (sample_rate, numpy_array)
     if isinstance(audio, tuple):
         sample_rate, audio_data = audio  # Unpack sample rate and data
     else:
-        audio_data, sample_rate = librosa.load(audio, sr=16000)  # Load and resample to 16kHz directly
+        # If it's a file path, load and resample to 16kHz directly
+        audio_data, sample_rate = librosa.load(audio, sr=16000)
 
     # Convert to mono if the audio has more than one channel
     if len(audio_data.shape) > 1:
         audio_data = np.mean(audio_data, axis=1)
+
+    # Convert audio data to numpy array of float32 type
     audio_array = np.array(audio_data).astype(np.float32)
 
-    # Transcribe the audio input
+    # Transcribe the audio input with return_timestamps=True
     transcription = pipe1(audio_array, return_timestamps=True)
-    transcription_text = " ".join([segment["text"] for segment in transcription["segments"]])
 
+    # Check the output structure of transcription
+    if "segments" in transcription:
+        # Extract text from each segment if "segments" key exists
+        transcription_text = " ".join([segment["text"] for segment in transcription["segments"]])
+    elif "text" in transcription:
+        # Use the full transcription if it's directly in the "text" field
+        transcription_text = transcription["text"]
+    else:
+        transcription_text = "No transcription available."
+
+    # Print and return the transcription text
     print("Transcription:", transcription_text)
     return transcription_text
 
 
+
 demo = gr.Interface(fn=audio_to_text, inputs=gr.Audio(), outputs="text")
 demo.launch(share=True)
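
A note on the new fallback logic: with return_timestamps=True, the transformers ASR pipeline typically returns a dict with "text" and "chunks" keys rather than "segments", so in practice the elif "text" branch is the one that fires. A minimal sketch of reading the timestamped output directly, assuming the "chunks" key used by recent transformers releases (verify against the installed version):

# Sketch only: key names assume recent transformers releases.
result = pipe1(audio_array, return_timestamps=True)
# Typical structure:
# {"text": "full transcript",
#  "chunks": [{"timestamp": (0.0, 4.2), "text": "first segment"}, ...]}
if "chunks" in result:
    # Join per-chunk texts, mirroring what the "segments" branch intends
    transcription_text = " ".join(chunk["text"] for chunk in result["chunks"])
else:
    transcription_text = result.get("text", "No transcription available.")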
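
One caveat the change leaves open: when gr.Audio delivers a (sample_rate, numpy_array) tuple, the audio is never resampled, while Whisper models expect 16 kHz input and Gradio typically records at 44.1 or 48 kHz as int16 PCM. A hedged sketch of handling that branch, assuming librosa's keyword-only resample API (librosa >= 0.10):

# Hedged sketch: normalization and resampling for the tuple branch.
if isinstance(audio, tuple):
    sample_rate, audio_data = audio  # Unpack sample rate and data
    audio_data = np.asarray(audio_data)
    # Gradio typically delivers int16 PCM; normalize to float32 in [-1, 1]
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
    # Down-mix to mono before resampling, matching the function's later step
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=1)
    if sample_rate != 16000:
        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
        sample_rate = 16000

Alternatively, recent transformers versions accept a dict input of the form {"raw": audio_array, "sampling_rate": sample_rate} and resample internally (this may require torchaudio), which sidesteps the manual step.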