# torch: used here only to detect whether a CUDA GPU is available
import torch
# gradio: builds the web interface
import gradio as gr
# PIL.Image: type hint for the image input
from PIL import Image
# numpy: to flatten the raw audio array before saving
import numpy as np
# scipy wavfile: writes the generated speech to a WAV file
import scipy.io.wavfile as wavfile
# transformers pipeline: high-level helper for pretrained models
from transformers import pipeline

# device index for the pipelines: 0 selects the first GPU, -1 the CPU
device = 0 if torch.cuda.is_available() else -1

# Text-to-speech model (English)
narrator = pipeline(
    "text-to-speech",
    model="facebook/mms-tts-eng",
    device=device,
)
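
# Output format of the narrator (hedged: exact shapes can vary across
# transformers versions). For facebook/mms-tts-eng it typically looks like:
#   out = narrator("Hello world")
#   out["audio"]          # float32 numpy array, often shape (1, num_samples)
#   out["sampling_rate"]  # 16000 for this model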

# Load the pretrained image captioning model
caption_image = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-base",
    device=device,
)
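
# The captioning pipeline normally returns a list with one dict per image,
# e.g. (illustrative output; the actual text depends on the image):
#   caption_image(img)  # -> [{"generated_text": "a dog sitting on the grass"}]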

# Generate a narrated WAV file from text
def generate_audio(text):
    # Run the text-to-speech pipeline; it returns a dict with
    # "audio" (a float32 numpy array) and "sampling_rate"
    narrated_text = narrator(text)
    audio = narrated_text["audio"]
    # The audio may come back as a list of arrays, or as a 2-D array of
    # shape (1, num_samples); flatten it to 1-D so scipy writes a mono WAV
    if isinstance(audio, list):
        audio = audio[0]
    audio = np.squeeze(np.asarray(audio))
    # Save the audio to a WAV file
    output_path = "output.wav"
    wavfile.write(output_path, rate=narrated_text["sampling_rate"], data=audio)
    # Return the path so Gradio's Audio component can play it
    return output_path
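
# Example usage (a sketch; the text is arbitrary):
#   generate_audio("a dog sitting on the grass")  # -> writes and returns "output.wav"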

# Caption an image, then narrate the caption
def caption_my_image(pil_image: Image.Image):
    # The pipeline accepts a PIL image directly
    result = caption_image(pil_image)
    # The result is usually a list of dicts keyed by "generated_text"
    if isinstance(result, list):
        semantics = result[0]["generated_text"]
    else:
        semantics = result["generated_text"]
    audio = generate_audio(semantics)
    # Return both the caption text and the audio file path
    return semantics, audio
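
# A minimal local smoke test, assuming an image file exists at this
# hypothetical path (not part of the app):
#   text, wav = caption_my_image(Image.open("example.jpg"))
#   print(text, wav)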

# Build the Gradio interface: image in, caption text and audio out
demo = gr.Interface(
    fn=caption_my_image,
    inputs=[gr.Image(label="Select Image", type="pil")],
    outputs=[
        gr.Textbox(label="Image Caption"),
        gr.Audio(label="Image Caption Audio"),
    ],
    title="Image Captioning with Audio Output",
    description="Upload an image: the app generates a caption with AI and reads it aloud.",
)

demo.launch()
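
# demo.launch() serves the app locally by default. On a remote machine or in
# a notebook you could pass share=True for a temporary public link -- an
# optional Gradio flag, not required here:
#   demo.launch(share=True)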