mayankchugh-learning commited on
Commit
1bdd473
·
verified ·
1 Parent(s): 7f8ad6b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -0
app.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Use a pipeline as a high-level helper
from transformers import pipeline
import gradio as gr
from PIL import Image, ImageDraw
import scipy.io.wavfile as wavfile

# Object-detection pipeline (DETR, ResNet-50 backbone); the model is
# downloaded from the Hugging Face Hub on first run.
# BUGFIX: the pipeline must be bound to `object_detector` — detect_object()
# below calls that name, but the original bound the pipeline to
# `object_detector_model_path`, which raised NameError at inference time.
object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")

# Text-to-speech pipeline used to narrate the detection summary.
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
22
+
# Turn the narration text into a playable audio file.
def generate_audio(text):
    """Speak *text* with the TTS pipeline and write it to a WAV file.

    Args:
        text (str): The sentence to be narrated.

    Returns:
        str: Path to the saved WAV file ("finetuned_output.wav").
    """
    # The pipeline returns a dict with "audio" (waveform array) and
    # "sampling_rate" keys.
    speech = narrator(text)

    output_path = "finetuned_output.wav"
    wavfile.write(output_path, rate=speech["sampling_rate"],
                  data=speech["audio"][0])

    return output_path
34
+
35
# read_objects builds a natural-language summary of the detections, e.g.
# "This picture contains 1 person and 1 dog." — labels are counted and
# joined with commas, with "and" only before the final item.
37
+
38
+
39
def read_objects(detection_objects):
    """Summarize detections as a sentence such as
    "This picture contains 1 person and 1 dog."

    Args:
        detection_objects (list[dict]): Detections with at least a 'label'
            key (the format produced by the object-detection pipeline).

    Returns:
        str: Natural-language summary. Repeated labels are counted; items
        are joined with commas and "and" appears only before the last item.
        Counts above one get a trailing "s" (naive pluralization, as in
        the original).
    """
    # Robustness fix: an empty detection list previously produced the
    # broken sentence "This picture contains."
    if not detection_objects:
        return "This picture contains no objects."

    # Count occurrences per label; dict preserves first-seen order
    # (Python 3.7+), matching the original output ordering.
    object_counts = {}
    for detection in detection_objects:
        label = detection['label']
        object_counts[label] = object_counts.get(label, 0) + 1

    # Render each "<count> <label>[s]" fragment.
    parts = []
    for label, count in object_counts.items():
        suffix = "s" if count > 1 else ""
        parts.append(f"{count} {label}{suffix}")

    # Comma-join all but the last fragment; "and" only before the final one.
    if len(parts) == 1:
        body = parts[0]
    else:
        body = ", ".join(parts[:-1]) + " and " + parts[-1]

    return f"This picture contains {body}."
66
+
67
def draw_bounding_boxes(image, object_detections):
    """Annotate *image* with a red box and caption for each detection.

    Args:
        image (PIL.Image): Input image; it is drawn on in place.
        object_detections (list): Detection dicts, each with keys:
            - 'score': confidence score of the detection
            - 'label': label of the detected object
            - 'box': dict with 'xmin', 'ymin', 'xmax', 'ymax' coordinates

    Returns:
        PIL.Image: The same image object with the annotations applied.
    """
    canvas = ImageDraw.Draw(image)

    for item in object_detections:
        coords = item['box']

        # Red rectangle around the detected object.
        rect = (coords['xmin'], coords['ymin'], coords['xmax'], coords['ymax'])
        canvas.rectangle(rect, outline=(255, 0, 0), width=2)

        # Label and confidence, drawn just above the box's top-left corner.
        caption = f"{item['label']} ({item['score']:.2f})"
        canvas.text((coords['xmin'], coords['ymin'] - 20), caption,
                    fill=(255, 0, 0))

    return image
97
+
98
def detect_object(image):
    """Detect objects in *image*; return an annotated image plus narration.

    Args:
        image (PIL.Image): Input image from the Gradio image widget.

    Returns:
        tuple: (annotated PIL.Image, path to the narration WAV file).
    """
    # NOTE(review): requires a module-level pipeline named `object_detector`
    # — verify the setup code binds the pipeline to that exact name.
    detections = object_detector(image)
    annotated_image = draw_bounding_boxes(image, detections)
    summary_text = read_objects(detections)
    audio_path = generate_audio(summary_text)
    return annotated_image, audio_path
105
+
106
# Shut down any Gradio servers left over from previous runs, then launch
# the demo UI: one image input, an annotated-image output, and an audio
# narration output.
gr.close_all()

demo = gr.Interface(
    fn=detect_object,
    inputs=[gr.Image(label="Select Image", type="pil")],
    outputs=[
        gr.Image(label="Processed Image", type="pil"),
        gr.Audio(label="Generated Audio"),
    ],
    title="@IT AI Enthusiast (https://www.youtube.com/@itaienthusiast/) - Project 7: Object Detector with Audio",
    description="THIS APPLICATION WILL BE USED TO HIGHLIGHT OBJECTS AND GIVES AUDIO DESCRIPTION FOR THE PROVIDED INPUT IMAGE.",
)
demo.launch()