simar007 committed on
Commit
65c7a48
·
verified ·
1 Parent(s): 1e228f9

Upload 2 files

Browse files
Files changed (2) hide show
  1. app (1).py +268 -0
  2. requirements (1).txt +4 -0
app (1).py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import gradio as gr
3
+ from PIL import Image, ImageDraw, ImageFont
4
+ from ultralytics import YOLO
5
+ from huggingface_hub import hf_hub_download
6
+ import cv2
7
+ import tempfile
8
+ import numpy as np
9
+
10
def download_model(model_filename):
    """Fetch a YOLO checkpoint from the Hugging Face Hub.

    Args:
        model_filename (str): File name of the checkpoint inside the
            'atalaydenknalbant/Yolov13' repository (e.g. 'yolov13n.pt').

    Returns:
        str: Local filesystem path of the downloaded (cached) checkpoint.
    """
    repo = "atalaydenknalbant/Yolov13"
    return hf_hub_download(repo_id=repo, filename=model_filename)
25
+
26
def _placeholder_image(message):
    """Return a 640x480 white PIL image with *message* centered in black.

    Used as a friendly output when the user triggers inference without
    supplying an input image/video.
    """
    width, height = 640, 480
    canvas = Image.new("RGB", (width, height), color="white")
    draw = ImageDraw.Draw(canvas)
    font = ImageFont.load_default(size=40)
    bbox = draw.textbbox((0, 0), message, font=font)
    text_x = (width - (bbox[2] - bbox[0])) / 2
    text_y = (height - (bbox[3] - bbox[1])) / 2
    draw.text((text_x, text_y), message, fill="black", font=font)
    return canvas


def _annotate(model, source, conf_threshold, iou_threshold, max_detection):
    """Run *model* on a single image/frame and return the annotated frame.

    Returns a BGR numpy array (as produced by ultralytics' ``Results.plot``)
    for the first result, or None when the model yields no results.
    """
    results = model.predict(
        source=source,
        conf=conf_threshold,
        iou=iou_threshold,
        imgsz=640,
        max_det=max_detection,
        show_labels=True,
        show_conf=True,
    )
    for r in results:
        return r.plot()  # BGR numpy array
    return None


@spaces.GPU
def yolo_inference(input_type, image, video, model_id, conf_threshold, iou_threshold, max_detection):
    """
    Performs object detection inference using a YOLOv13 model on either an image or a video.

    Downloads the specified YOLO model, then applies it to the provided
    input. For images, returns an annotated image. For videos, processes
    each frame and writes an annotated video incrementally (frames are NOT
    buffered in memory). Missing inputs produce blank outputs with a message.

    Args:
        input_type (str): Either "Image" or "Video".
        image (PIL.Image.Image or None): Input image when `input_type` is "Image".
        video (str or None): Path to the input video when `input_type` is "Video".
        model_id (str): YOLO model file name (e.g. 'yolov13n.pt').
        conf_threshold (float): Confidence threshold; lower-confidence
            detections are discarded.
        iou_threshold (float): IoU threshold used for NMS.
        max_detection (int): Maximum detections per image or frame.

    Returns:
        tuple:
            - PIL.Image.Image or None: Annotated image for "Image" input.
            - str or None: Path to the annotated video for "Video" input.
    """
    model_path = download_model(model_id)

    if input_type == "Image":
        if image is None:
            return _placeholder_image("No image provided"), None

        model = YOLO(model_path)
        annotated = _annotate(model, image, conf_threshold, iou_threshold, max_detection)
        if annotated is None:
            return None, None
        # r.plot() is BGR; flip channel order for PIL (RGB).
        return Image.fromarray(annotated[..., ::-1]), None

    elif input_type == "Video":
        if video is None:
            placeholder = _placeholder_image("No video provided")
            temp_video_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
            out = cv2.VideoWriter(temp_video_file, fourcc, 1, placeholder.size)
            out.write(cv2.cvtColor(np.array(placeholder), cv2.COLOR_RGB2BGR))
            out.release()
            return None, temp_video_file

        model = YOLO(model_path)
        cap = cv2.VideoCapture(video)
        fps = cap.get(cv2.CAP_PROP_FPS)
        if fps <= 0:
            fps = 25  # fall back when the container reports no FPS

        # Writer is created lazily from the first annotated frame's shape,
        # then frames are streamed out so memory stays bounded.
        out = None
        temp_video_file = None
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            pil_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            annotated = _annotate(model, pil_frame, conf_threshold, iou_threshold, max_detection)
            if annotated is None:
                continue
            if out is None:
                height_out, width_out, _ = annotated.shape
                temp_video_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
                fourcc = cv2.VideoWriter_fourcc(*"mp4v")
                out = cv2.VideoWriter(temp_video_file, fourcc, fps, (width_out, height_out))
            # annotated is already BGR — write directly, no color round-trip.
            out.write(annotated)
        cap.release()

        if out is None:
            return None, None
        out.release()
        return None, temp_video_file

    return None, None
146
+
147
def update_visibility(input_type):
    """Show the Gradio components matching *input_type*, hide the rest.

    Keeps the interface tidy: only the image widgets are shown for "Image"
    input, only the video widgets for "Video".

    Args:
        input_type (str): Either "Image" or "Video".

    Returns:
        tuple: `gr.update` visibility objects for, in order:
            image input, video input, image output, video output.
    """
    is_image = input_type == "Image"
    return (
        gr.update(visible=is_image),
        gr.update(visible=not is_image),
        gr.update(visible=is_image),
        gr.update(visible=not is_image),
    )
165
+
166
def yolo_inference_for_examples(image, model_id, conf_threshold, iou_threshold, max_detection):
    """Image-only front end for `yolo_inference`, used by the examples grid.

    The `gr.Examples` component only provides image inputs, so this wrapper
    pins `input_type` to "Image" and discards the (always-None) video output.

    Args:
        image (PIL.Image.Image): Example input image.
        model_id (str): YOLO model file name to run.
        conf_threshold (float): Confidence threshold.
        iou_threshold (float): IoU threshold.
        max_detection (int): Maximum number of detections.

    Returns:
        PIL.Image.Image or None: The annotated image, or None if nothing
        was processed.
    """
    annotated, _unused_video = yolo_inference(
        input_type="Image",
        image=image,
        video=None,
        model_id=model_id,
        conf_threshold=conf_threshold,
        iou_threshold=iou_threshold,
        max_detection=max_detection,
    )
    return annotated
193
+
194
# Application-wide Gradio theme.
theme = gr.themes.Ocean(primary_hue="blue", secondary_hue="pink")

# Top-level UI: input controls on the left, annotated output on the right.
with gr.Blocks(theme=theme) as app:
    gr.Markdown("# Yolov13: Object Detection")
    gr.Markdown("Upload an image or video for inference using the latest YOLOv13 models.")
    gr.Markdown("📝 **Note:** Better-trained models will be deployed as they become available.")
    # Collapsible paper reference and BibTeX citation.
    with gr.Accordion("Paper and Citation", open=False):
        gr.Markdown("""
        This application is based on the research from the paper: **YOLOv13: Real-Time Object Detection with Hypergraph-Enhanced Adaptive Visual Perception**.

        - **Authors:** Mengqi Lei, Siqi Li, Yihong Wu, et al.
        - **Preprint Link:** [https://arxiv.org/abs/2506.17733](https://arxiv.org/abs/2506.17733)

        **BibTeX:**
        ```
        @article{yolov13,
        title={YOLOv13: Real-Time Object Detection with Hypergraph-Enhanced Adaptive Visual Perception},
        author={Lei, Mengqi and Li, Siqi and Wu, Yihong and et al.},
        journal={arXiv preprint arXiv:2506.17733},
        year={2025}
        }
        ```
        """)

    with gr.Row():
        with gr.Column():
            # Input widgets — only one of image/video is visible at a time
            # (toggled by update_visibility below).
            image = gr.Image(type="pil", label="Image", visible=True)
            video = gr.Video(label="Video", visible=False)
            input_type = gr.Radio(
                choices=["Image", "Video"],
                value="Image",
                label="Input Type",
            )
            model_id = gr.Dropdown(
                label="Model Name",
                choices=[
                    'yolov13n.pt', 'yolov13s.pt', 'yolov13l.pt', 'yolov13x.pt',
                ],
                value="yolov13n.pt",
            )
            # Detection hyperparameters passed straight to yolo_inference.
            conf_threshold = gr.Slider(minimum=0, maximum=1, value=0.35, label="Confidence Threshold")
            iou_threshold = gr.Slider(minimum=0, maximum=1, value=0.45, label="IoU Threshold")
            max_detection = gr.Slider(minimum=1, maximum=300, step=1, value=300, label="Max Detection")
            infer_button = gr.Button("Detect Objects", variant="primary")
        with gr.Column():
            # Output widgets mirror the input visibility toggling.
            output_image = gr.Image(type="pil", show_label=False, show_share_button=False, visible=True)
            output_video = gr.Video(show_label=False, show_share_button=False, visible=False)
            gr.DeepLinkButton(variant="primary")

    # Swap which input/output components are visible when the radio changes.
    input_type.change(
        fn=update_visibility,
        inputs=input_type,
        outputs=[image, video, output_image, output_video],
    )

    # Main inference trigger.
    infer_button.click(
        fn=yolo_inference,
        inputs=[input_type, image, video, model_id, conf_threshold, iou_threshold, max_detection],
        outputs=[output_image, output_video],
    )

    # Click-to-run image examples; the .jpg files are expected to ship
    # alongside the app — TODO confirm they exist in the Space repo.
    gr.Examples(
        examples=[
            ["zidane.jpg", "yolov13s.pt", 0.35, 0.45, 300],
            ["bus.jpg", "yolov13l.pt", 0.35, 0.45, 300],
            ["yolo_vision.jpg", "yolov13x.pt", 0.35, 0.45, 300],
        ],
        fn=yolo_inference_for_examples,
        inputs=[image, model_id, conf_threshold, iou_threshold, max_detection],
        outputs=[output_image],
        label="Examples (Images)",
    )

if __name__ == '__main__':
    # mcp_server=True additionally exposes the app's functions over MCP.
    app.launch(mcp_server=True)
requirements (1).txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ git+https://github.com/iMoonLab/yolov13
2
+ spaces
3
+ Pillow
4
+ huggingface_hub