Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use a pipeline as a high-level helper
|
| 2 |
+
from transformers import pipeline
|
| 3 |
+
import gradio as gr
|
| 4 |
+
from PIL import Image, ImageDraw
|
| 5 |
+
import scipy.io.wavfile as wavfile
|
| 6 |
+
|
| 7 |
+
# from phonemizer.backend.espeak.wrapper import EspeakWrapper
|
| 8 |
+
# _ESPEAK_LIBRARY = '/opt/homebrew/Cellar/espeak/1.48.04_1/lib/libespeak.1.1.48.dylib' #use the Path to the library.
|
| 9 |
+
# EspeakWrapper.set_library(_ESPEAK_LIBRARY)
|
| 10 |
+
|
| 11 |
+
# Object-detection pipeline (DETR, ResNet-50 backbone). Loaded once at import
# time; downloads weights on first run.
# BUG FIX: this was previously bound to `object_detector_model_path`, but
# detect_object() below calls `object_detector`, which raised a NameError at
# runtime. Bind the pipeline under the name the rest of the file actually uses.
object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")

# Offline alternative: load the same model from a local snapshot instead.
# object_detector_model_path = "../Models/models--facebook--detr-resnet-50/snapshots/1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b"
# object_detector = pipeline("object-detection", model=object_detector_model_path)

# Text-to-speech pipeline used to narrate the detection summary.
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")

# Offline alternative for the TTS model:
# tts_model_path = "../Models/models--kakao-enterprise--vits-ljs/snapshots/3bcb8321394f671bd948ebf0d086d694dda95464"
# narrator = pipeline("text-to-speech", model=tts_model_path)
| 23 |
+
# Turn a text summary into a narrated WAV file on disk.
def generate_audio(text):
    """Synthesize *text* with the module-level TTS pipeline.

    The audio is written to "finetuned_output.wav" in the working directory
    (overwritten on every call), and the file path is returned so Gradio's
    Audio component can serve it.
    """
    speech = narrator(text)

    # Persist the waveform; the pipeline returns a dict with the sample rate
    # and a batched audio array, of which we save the first (only) item.
    output_path = "finetuned_output.wav"
    wavfile.write(output_path, rate=speech["sampling_rate"],
                  data=speech["audio"][0])

    return output_path
|
| 34 |
+
|
| 35 |
+
def read_objects(detection_objects):
    """Summarize detector output as an English sentence.

    Args:
        detection_objects: list of detection dicts as produced by the
            object-detection pipeline; only the 'label' key is used here
            ('score' and 'box' are ignored).

    Returns:
        A sentence such as "This picture contains 1 person, 2 dogs and 1 cat."
        Labels appear in first-detection order, counts greater than one get a
        naive "s" plural, and "and" precedes only the final item. An empty
        list yields "This picture contains nothing."
    """
    # Count occurrences per label; a plain dict preserves first-seen order.
    object_counts = {}
    for detection in detection_objects:
        label = detection['label']
        object_counts[label] = object_counts.get(label, 0) + 1

    # Robustness fix: the original produced the dangling sentence
    # "This picture contains." when no objects were detected.
    if not object_counts:
        return "This picture contains nothing."

    # Build one "count label(s)" chunk per distinct label.
    parts = [
        f"{count} {label}{'s' if count > 1 else ''}"
        for label, count in object_counts.items()
    ]

    # Comma-separate all but the last chunk; "and" joins only the final pair.
    if len(parts) == 1:
        listing = parts[0]
    else:
        listing = ", ".join(parts[:-1]) + " and " + parts[-1]

    return f"This picture contains {listing}."
|
| 66 |
+
|
| 67 |
+
def draw_bounding_boxes(image, object_detections):
    """Annotate *image* in place with one red box per detection.

    Args:
        image (PIL.Image): The image to draw on (mutated in place).
        object_detections (list): Detection dicts, each carrying:
            - 'score': confidence score of the detection
            - 'label': label of the detected object
            - 'box': dict with 'xmin', 'ymin', 'xmax', 'ymax' pixel
              coordinates of the bounding box

    Returns:
        PIL.Image: The same image object, with boxes and captions drawn.
    """
    pen = ImageDraw.Draw(image)
    red = (255, 0, 0)

    for det in object_detections:
        coords = det['box']

        # Red rectangle around the detected object.
        corners = (coords['xmin'], coords['ymin'], coords['xmax'], coords['ymax'])
        pen.rectangle(corners, outline=red, width=2)

        # "label (score)" caption placed just above the box's top-left corner.
        caption = f"{det['label']} ({det['score']:.2f})"
        pen.text((coords['xmin'], coords['ymin'] - 20), caption, fill=red)

    return image
|
| 97 |
+
|
| 98 |
+
def detect_object(image):
    """Full app pipeline: detect objects in *image*, box them, narrate them.

    Returns a (annotated image, path to narration WAV) pair, matching the
    two Gradio outputs.
    """
    detections = object_detector(image)
    annotated_image = draw_bounding_boxes(image, detections)
    narration_path = generate_audio(read_objects(detections))
    return annotated_image, narration_path
|
| 105 |
+
|
| 106 |
+
# Shut down any Gradio servers left running in this process from earlier runs.
gr.close_all()

# Wire the detector into a simple web UI: one uploaded image in, the annotated
# image plus a generated audio narration out.
demo = gr.Interface(fn=detect_object,
                    inputs=[gr.Image(label="Select Image",type="pil")],
                    outputs=[gr.Image(label="Processed Image", type="pil"), gr.Audio(label="Generated Audio")],
                    title="@IT AI Enthusiast (https://www.youtube.com/@itaienthusiast/) - Project 7: Object Detector with Audio",
                    description="THIS APPLICATION WILL BE USED TO HIGHLIGHT OBJECTS AND GIVES AUDIO DESCRIPTION FOR THE PROVIDED INPUT IMAGE.")
# Launch the web server (blocks until interrupted).
demo.launch()
|