RP-Azul committed on
Commit
c39770b
·
verified ·
1 Parent(s): d15474b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -4
app.py CHANGED
@@ -11,9 +11,13 @@ pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")
11
  #pipe3 = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
12
  #pipe3.to("cuda" if torch.cuda.is_available() else "cpu")
13
 
14
- def audio_to_image(audio):
15
- # Load the audio file
16
- audio_data, sample_rate = sf.read(audio)
 
 
 
 
17
 
18
  # Convert to mono if the audio has more than one channel
19
  if len(audio_data.shape) > 1:
@@ -34,5 +38,6 @@ def audio_to_image(audio):
34
  print("Transcription:", transcription_text)
35
  return transcription_text
36
 
37
- demo = gr.Interface(fn=audio_to_image, inputs=gr.Audio(), outputs="text")
 
38
  demo.launch(share=True)
 
11
  #pipe3 = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
12
  #pipe3.to("cuda" if torch.cuda.is_available() else "cpu")
13
 
14
+ def audio_to_text(audio):
15
+ # Check if audio is a file path or a tuple of (sample_rate, numpy_array)
16
+ if isinstance(audio, tuple):
17
+ sample_rate, audio_data = audio # Unpack sample rate and data
18
+ else:
19
+ # If it's a file path, load the audio
20
+ audio_data, sample_rate = librosa.load(audio, sr=16000) # Load and resample to 16kHz directly
21
 
22
  # Convert to mono if the audio has more than one channel
23
  if len(audio_data.shape) > 1:
 
38
  print("Transcription:", transcription_text)
39
  return transcription_text
40
 
41
+
42
+ demo = gr.Interface(fn=audio_to_text, inputs=gr.Audio(source="microphone", type="numpy"), outputs="text")
43
  demo.launch(share=True)