Spaces:
Sleeping
Sleeping
# for interface
import gradio as gr
# audio array handling (squeezing the TTS batch axis)
import numpy as np
# to create neural network
import torch
# to open images
from PIL import Image
# used for audio
import scipy.io.wavfile as wavfile
# Use a pipeline as a high-level helper
from transformers import pipeline
# Run on the first GPU (device 0) when CUDA is available; fall back to CPU (-1).
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# English text-to-speech pipeline used to narrate generated captions.
narrator = pipeline("text-to-speech", model="facebook/mms-tts-eng", device=device)

# Pretrained BLIP captioning pipeline: turns an image into a short description.
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base", device=device)
def generate_audio(text, output_path="output.wav"):
    """Synthesize *text* to speech and write it to a WAV file.

    Args:
        text: The text to narrate.
        output_path: Destination WAV path. Defaults to "output.wav" so
            existing callers keep their behavior.

    Returns:
        The path of the written WAV file (usable by gr.Audio).
    """
    # narrator output format: dict with "audio" and "sampling_rate".
    narrated_text = narrator(text)

    audio = narrated_text["audio"]
    # Some pipeline versions wrap the waveform in a list; unwrap it.
    if isinstance(audio, list):
        audio = audio[0]

    # mms-tts emits shape (1, num_samples). scipy's wavfile.write would treat
    # that as a single-frame, N-channel file, so drop the batch axis first.
    audio = np.asarray(audio)
    if audio.ndim > 1 and audio.shape[0] == 1:
        audio = audio[0]

    wavfile.write(output_path, rate=narrated_text["sampling_rate"], data=audio)
    return output_path  # path for downstream consumers (e.g. gr.Audio)
def caption_my_image(pil_image: Image.Image):
    """Caption an image and narrate the caption.

    Returns a (caption_text, audio_file_path) pair matching the two
    Gradio outputs (Textbox, Audio).
    """
    # The pipeline takes the image positionally (no `images=` keyword).
    outputs = caption_image(pil_image)

    # Usually a list of {"generated_text": ...} dicts; tolerate a bare dict.
    first = outputs[0] if isinstance(outputs, list) else outputs
    semantics = first["generated_text"]

    return semantics, generate_audio(semantics)
# Assemble the Gradio UI: one image input, caption text plus narrated audio out.
image_input = gr.Image(label="Select Image", type="pil")
caption_output = gr.Textbox(label="Image Caption")
audio_output = gr.Audio(label="Image Caption Audio")

demo = gr.Interface(
    fn=caption_my_image,
    inputs=[image_input],
    outputs=[caption_output, audio_output],
    title="IMAGE CAPTIONING WITH AUDIO OUTPUT",
    description="THIS APPLICATION WILL BE USED TO CAPTION IMAGES WITH THE HELP OF AI",
)

demo.launch()