Devarsh24 committed on
Commit
4c685d2
·
verified ·
1 Parent(s): 9632884

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -23
app.py CHANGED
@@ -1,11 +1,11 @@
1
- # to create nueral network
2
  import torch
3
 
4
  # for interface
5
  import gradio as gr
6
 
7
  # to open images
8
- from PIL import Image
9
 
10
  # used for audio
11
  import scipy.io.wavfile as wavfile
@@ -13,41 +13,66 @@ import scipy.io.wavfile as wavfile
13
  # Use a pipeline as a high-level helper
14
  from transformers import pipeline
15
 
 
 
16
 
17
- device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
18
-
19
- narrator = pipeline("text-to-speech", model="facebook/mms-tts-eng")
20
-
21
- # Load the pretrained weights
22
- caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
23
 
 
 
 
 
 
 
24
 
25
  # Define the function to generate audio from text
26
  def generate_audio(text):
27
  # Generate the narrated text
28
  narrated_text = narrator(text)
29
 
 
 
 
 
 
 
30
  # Save the audio to WAV file
31
- wavfile.write("output.wav", rate=narrated_text["sampling_rate"],
32
- data=narrated_text["audio"][0])
33
 
34
  # Return the path to the saved output WAV file
35
- return "output.wav" # return audio
36
 
37
- def caption_my_image(pil_image):
 
 
 
 
 
 
 
 
38
 
39
- semantics = caption_image(images=pil_image)[0]['generated_text']
40
  audio = generate_audio(semantics)
41
- return semantics,audio # returns both text and audio output
 
42
 
 
43
 
44
- gr.close_all()
 
 
 
 
 
 
 
 
 
45
 
46
- demo = gr.Interface(fn=caption_my_image,
47
- inputs=[gr.Image(label="Select Image",type="pil")],
48
- outputs=[
49
- gr.Textbox(label="Image Caption"),
50
- gr.Audio(label="Image Caption Audio")],
51
- title="IMAGE CAPTIONING WITH AUDIO OUTPUT",
52
- description="THIS APPLICATION WILL BE USED TO CAPTION IMAGES WITH THE HELP OF AI")
53
- demo.launch()
 
1
+ # to create neural network
2
  import torch
3
 
4
  # for interface
5
  import gradio as gr
6
 
7
  # to open images
8
+ from PIL import Image
9
 
10
  # used for audio
11
  import scipy.io.wavfile as wavfile
 
13
  # Use a pipeline as a high-level helper
14
  from transformers import pipeline
15
 
16
# Pick the pipeline device using the transformers convention:
# CUDA device index 0 when a GPU is present, otherwise -1 for CPU.
device = 0 if torch.cuda.is_available() else -1

# English text-to-speech model used to narrate captions.
narrator = pipeline("text-to-speech", model="facebook/mms-tts-eng", device=device)

# Pretrained BLIP model that produces a text caption for an image.
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base", device=device)
32
 
33
def generate_audio(text):
    """Narrate *text* with the MMS text-to-speech pipeline and save it to disk.

    Parameters
    ----------
    text : str
        The text to synthesize.

    Returns
    -------
    str
        Path to the written WAV file ("output.wav").
    """
    # Pipeline output is a dict with "audio" and "sampling_rate" keys.
    narrated = narrator(text)

    audio = narrated["audio"]
    # Some pipeline versions return a list of arrays; unwrap the first.
    if isinstance(audio, list):
        audio = audio[0]
    # MMS-TTS returns a (1, n_samples) float array. scipy's wavfile.write
    # treats 2-D data as (Nsamples, Nchannels), so a (1, n) input would be
    # written as ONE n-channel sample. Flatten to 1-D mono before writing.
    if getattr(audio, "ndim", 1) > 1:
        audio = audio[0]

    output_path = "output.wav"
    wavfile.write(output_path, rate=narrated["sampling_rate"], data=audio)

    # Return the path so Gradio's Audio component can play the file.
    return output_path
50
 
51
def caption_my_image(pil_image: Image.Image):
    """Caption *pil_image* and narrate the caption aloud.

    Returns a ``(caption_text, audio_file_path)`` pair matching the two
    Gradio output components (Textbox, Audio).
    """
    # The captioning pipeline takes the image positionally (no `images=`).
    outputs = caption_image(pil_image)

    # Usually a list of {"generated_text": ...} dicts; tolerate a bare dict.
    first = outputs[0] if isinstance(outputs, list) else outputs
    caption = first["generated_text"]

    narration_path = generate_audio(caption)
    return caption, narration_path
63
+
64
 
65
# Wire up the Gradio UI: one image input, caption text plus narrated audio
# as outputs. (gr.close_all() is intentionally not called here.)
demo = gr.Interface(
    fn=caption_my_image,
    inputs=[gr.Image(label="Select Image", type="pil")],
    outputs=[
        gr.Textbox(label="Image Caption"),
        gr.Audio(label="Image Caption Audio"),
    ],
    title="IMAGE CAPTIONING WITH AUDIO OUTPUT",
    description="THIS APPLICATION WILL BE USED TO CAPTION IMAGES WITH THE HELP OF AI",
)

demo.launch()