from transformers import pipeline
from transformers.utils import logging
from helper import ignore_warnings, load_image_from_url, render_results_in_image, summarize_predictions_natural_language
import gradio as gr

# Silence verbose transformers logging and helper warnings.
logging.set_verbosity_error()
ignore_warnings()

# Object-detection pipeline (DETR with a ResNet-50 backbone) and
# text-to-speech pipeline (VITS trained on LJ Speech).
od_pipe = pipeline("object-detection", model="facebook/detr-resnet-50")
tts_pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
def detect_objects(pil_image):
    # Detect objects, then draw the predicted boxes onto the image.
    pipeline_output = od_pipe(pil_image)
    processed_image = render_results_in_image(pil_image, pipeline_output)
    # Summarize the detections in natural language and narrate the summary.
    text = summarize_predictions_natural_language(pipeline_output)
    narration = tts_pipe(text)
    sr = narration["sampling_rate"]
    audio = narration["audio"][0]  # the pipeline returns a batch; take the first waveform
    return processed_image, text, (sr, audio)
demo = gr.Interface(
    fn=detect_objects,
    title="Object Detection in an Image with Narration - test & demo app by Srinivas V.",
    description="Upload any image, preferably one with many clearly distinguishable objects, and submit. Play the audio to hear a narration of the detected objects.",
    inputs=gr.Image(label="Input image", type="pil"),
    outputs=[
        gr.Image(label="Output image with predicted instances", type="pil"),
        gr.Textbox(label="Description of detected objects", lines=3),
        gr.Audio(label="Play audio to hear a narration of the detected objects in the image"),
    ],
)

# debug=True surfaces errors in the console; share=True creates a temporary public link.
demo.launch(debug=True, share=True)
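
For a quick sanity check without launching the Gradio UI, detect_objects can be called directly. The snippet below is a minimal sketch: sample.jpg is a placeholder path, and it assumes render_results_in_image returns a PIL image (as the gr.Image(type="pil") output component implies); scipy is used here only to write the narration to a WAV file.

from PIL import Image
import scipy.io.wavfile

image = Image.open("sample.jpg").convert("RGB")  # placeholder input image
annotated, description, (sr, audio) = detect_objects(image)

print(description)                                  # natural-language summary of detections
annotated.save("annotated.jpg")                     # image with predicted bounding boxes
scipy.io.wavfile.write("narration.wav", sr, audio)  # save the narration audio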