Update app.py
app.py CHANGED
@@ -1,11 +1,11 @@
-# to create
+# to create neural network
 import torch
 
 # for interface
 import gradio as gr
 
 # to open images
-from PIL
+from PIL import Image
 
 # used for audio
 import scipy.io.wavfile as wavfile
@@ -13,41 +13,66 @@ import scipy.io.wavfile as wavfile
 # Use a pipeline as a high-level helper
 from transformers import pipeline
 
+# device: 0 for GPU, -1 for CPU
+device = 0 if torch.cuda.is_available() else -1
 
+# Text-to-speech model (English)
+narrator = pipeline(
+    "text-to-speech",
+    model="facebook/mms-tts-eng",
+    device=device
+)
 
+# Load the pretrained image captioning model
+caption_image = pipeline(
+    "image-to-text",
+    model="Salesforce/blip-image-captioning-base",
+    device=device
+)
 
 # Define the function to generate audio from text
 def generate_audio(text):
     # Generate the narrated text
     narrated_text = narrator(text)
 
+    # narrator output format: dict with "audio" and "sampling_rate"
+    audio = narrated_text["audio"]
+    # sometimes it's a list of arrays, handle that:
+    if isinstance(audio, list):
+        audio = audio[0]
+
     # Save the audio to WAV file
+    output_path = "output.wav"
+    wavfile.write(output_path, rate=narrated_text["sampling_rate"], data=audio)
 
     # Return the path to the saved output WAV file
-    return
+    return output_path  # return audio file path
 
-def caption_my_image(pil_image):
+def caption_my_image(pil_image: Image.Image):
+    # Call pipeline with positional input (no `images=` keyword)
+    result = caption_image(pil_image)
 
-    semantics = caption_image(images=pil_image)[0]['generated_text']
+    # result is usually a list of dicts
+    if isinstance(result, list):
+        semantics = result[0]["generated_text"]
+    else:
+        semantics = result["generated_text"]
+
     audio = generate_audio(semantics)
-    return semantics,audio # returns both text and audio output
 
-gr.
-
-demo
-    inputs=[gr.Image(label="Select Image",type="pil")],
-    outputs=[
-    gr.Textbox(label="Image Caption"),
-    gr.Audio(label="Image Caption Audio")],
-    title="IMAGE CAPTIONING WITH AUDIO OUTPUT",
-    description="THIS APPLICATION WILL BE USED TO CAPTION IMAGES WITH THE HELP OF AI")
-demo.launch()
+    return semantics, audio  # returns both text and audio output
+
+# gr.close_all()  # <- NOT NEEDED, remove to avoid issues
+
+demo = gr.Interface(
+    fn=caption_my_image,
+    inputs=[gr.Image(label="Select Image", type="pil")],
+    outputs=[
+        gr.Textbox(label="Image Caption"),
+        gr.Audio(label="Image Caption Audio")
+    ],
+    title="IMAGE CAPTIONING WITH AUDIO OUTPUT",
+    description="THIS APPLICATION WILL BE USED TO CAPTION IMAGES WITH THE HELP OF AI"
+)
+
+demo.launch()
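
Note on the new generate_audio: for VITS-style checkpoints such as facebook/mms-tts-eng, the text-to-speech pipeline typically returns "audio" as a 2-D array of shape (1, num_samples), while scipy.io.wavfile.write expects 1-D mono data (or (num_samples, num_channels) for multichannel), so the written WAV can come out malformed. A minimal, more defensive save step, assuming that output format (save_narration is an illustrative helper name, not part of the commit):

import numpy as np
import scipy.io.wavfile as wavfile

def save_narration(narrated_text, output_path="output.wav"):
    # narrated_text: dict from the TTS pipeline, {"audio": ..., "sampling_rate": int}
    audio = narrated_text["audio"]
    if isinstance(audio, list):   # some pipeline versions return a list of arrays
        audio = audio[0]
    audio = np.asarray(audio)
    if audio.ndim > 1:            # assumed (1, num_samples) layout; flatten to mono
        audio = audio.squeeze()
    wavfile.write(output_path, rate=narrated_text["sampling_rate"],
                  data=audio.astype(np.float32))
    return output_path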
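
As a design note, gr.Audio also accepts a (sample_rate, numpy_array) tuple directly, so the round-trip through a WAV file on disk is optional. A sketch of that variant (the helper name is illustrative):

import numpy as np

def generate_audio_in_memory(text):
    out = narrator(text)
    audio = np.asarray(out["audio"]).squeeze()
    # gr.Audio accepts (sample_rate, data) tuples as well as file paths
    return (out["sampling_rate"], audio)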
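
For a quick smoke test outside the Gradio UI, the caption-and-narrate path can be exercised directly; "example.jpg" below is a placeholder for any local image:

from PIL import Image

if __name__ == "__main__":
    img = Image.open("example.jpg")  # any local test image
    caption, wav_path = caption_my_image(img)
    print(caption)   # generated caption text
    print(wav_path)  # path to the saved narration, "output.wav"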