RP-Azul committed on
Commit
359777d
·
verified ·
1 Parent(s): 7954582

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -24
app.py CHANGED
@@ -12,31 +12,27 @@ pipe1 = pipeline("automatic-speech-recognition", model="openai/whisper-base")
12
  #pipe3.to("cuda" if torch.cuda.is_available() else "cpu")
13
 
14
  def audio_to_image(audio):
15
- # Load the audio file and convert it to a numpy array
16
- audio_data, _ = sf.read(audio) # Load audio file
17
- audio_array = np.array(audio_data) # Convert to numpy array
18
 
19
- # Transcribe the audio input
20
- transcription = pipe1(audio_array)
 
21
 
22
- #transcription = pipe1(audio)
23
-
24
- transcription_text = transcription['text']
25
-
26
- #summary = pipe2(transcription_text, max_length=50, min_length=10, do_sample=False)
27
- #summary_text = summary[0]['summary_text']
28
-
29
- #prompt = summary_text
30
- #image = pipe3(prompt).images[0]
31
-
32
- #return image
33
- #print("Transcription:", transcription_text)
34
- #print("Summary:", summary_text)
35
- #return transcription_text, summary_text
36
- return transcription_text
37
-
38
 
39
- #demo = gr.Interface(fn=audio_to_image, inputs=gr.Audio(), outputs="image")
40
- #demo = gr.Interface(fn=audio_to_image, inputs=gr.Audio(), outputs=[gr.Textbox(label="Transcription"), gr.Textbox(label="Summary")])
41
- demo = gr.Interface(fn=audio_to_image, inputs=gr.Audio(), outputs="text")
 
 
 
 
 
 
42
  demo.launch(share=True)
 
12
  #pipe3.to("cuda" if torch.cuda.is_available() else "cpu")
13
 
14
  def audio_to_image(audio):
15
+ # Load the audio file
16
+ audio_data, sample_rate = sf.read(audio)
 
17
 
18
+ # Convert to mono if the audio has more than one channel
19
+ if len(audio_data.shape) > 1:
20
+ audio_data = np.mean(audio_data, axis=1) # Averaging channels to convert to mono
21
 
22
+ # Resample the audio to 16 kHz if it's not already at 16 kHz
23
+ if sample_rate != 16000:
24
+ audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
25
+
26
+ # Convert to numpy array with float32 data type
27
+ audio_array = np.array(audio_data).astype(np.float32)
 
 
 
 
 
 
 
 
 
 
28
 
29
+ # Transcribe the audio input
30
+ transcription = pipe1(audio_array, sampling_rate=16000)
31
+ transcription_text = transcription['text']
32
+
33
+ # Print and return the transcription text
34
+ print("Transcription:", transcription_text)
35
+ return transcription_text
36
+
37
+ demo = gr.Interface(fn=audio_to_text, inputs=gr.Audio(), outputs="text")
38
  demo.launch(share=True)