shaheerawan3 committed on
Commit
56bb634
·
verified ·
1 Parent(s): 7196f04

Create app.py

Files changed (1)
  1. app.py +391 -0
app.py ADDED
@@ -0,0 +1,391 @@
import gradio as gr
import cv2
import numpy as np
from PIL import Image
import torch
from transformers import DetrImageProcessor, DetrForObjectDetection
from collections import defaultdict
import time
import psutil

# Load DETR model (optimized for CPU)
print("Loading DETR model...")
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
model.eval()

# COCO class labels (91 entries; 'N/A' marks indices unused by DETR)
COCO_CLASSES = [
    'N/A', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A',
    'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
    'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack',
    'umbrella', 'N/A', 'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
    'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass',
    'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
    'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A',
    'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
    'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A',
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

def get_available_memory():
    """Get available system memory in GB"""
    return psutil.virtual_memory().available / (1024 ** 3)

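# Example: 8_589_934_592 bytes of free RAM reports as 8.0 (the divisor is
# 1024**3, so strictly speaking the unit is GiB rather than GB).
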
def auto_adjust_confidence(image_size, num_objects_hint=None):
    """Dynamically adjust confidence based on image complexity"""
    pixels = image_size[0] * image_size[1]

    # Base confidence on image size
    if pixels < 500000:  # Small image
        base_confidence = 0.6
    elif pixels < 2000000:  # Medium image
        base_confidence = 0.65
    else:  # Large image
        base_confidence = 0.7

    return base_confidence

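# Worked example: a 1280x720 image is 921,600 pixels, which lands in the
# medium band (500K-2M), so the auto-chosen threshold is 0.65.
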
def auto_calculate_frame_interval(total_frames, video_duration, available_memory_gb):
    """Dynamically calculate optimal frame interval based on video properties and system resources"""

    # Memory-based adjustment
    if available_memory_gb < 2:
        memory_factor = 3
    elif available_memory_gb < 4:
        memory_factor = 2
    else:
        memory_factor = 1

    # Duration-based adjustment
    if video_duration < 10:
        duration_factor = 1
    elif video_duration < 30:
        duration_factor = 2
    elif video_duration < 60:
        duration_factor = 3
    else:
        duration_factor = 4

    # Calculate optimal number of frames to process
    target_frames = min(150, max(30, total_frames // (memory_factor * duration_factor)))

    # Calculate the sampling interval
    interval = max(1, total_frames // target_frames)

    return interval, target_frames

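# Worked example: a 60-second, 30 fps clip (1,800 frames) on a machine with
# >= 4 GB free gives memory_factor = 1 and duration_factor = 4, so
# target_frames = min(150, max(30, 1800 // 4)) = 150 and
# interval = max(1, 1800 // 150) = 12: every 12th frame is analysed.
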
def detect_objects(image, confidence_threshold=None):
    """Detect objects in a single image with dynamic confidence"""
    # Convert to a PIL image if needed
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    # Auto-adjust confidence if not provided
    if confidence_threshold is None:
        confidence_threshold = auto_adjust_confidence(image.size)

    # Prepare image
    inputs = processor(images=image, return_tensors="pt")

    # Inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Post-process; PIL reports (width, height) but DETR expects (height, width)
    target_sizes = torch.tensor([image.size[::-1]])
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=confidence_threshold
    )[0]

    return results, image, confidence_threshold

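# Usage sketch (hypothetical file name):
#   results, img, thr = detect_objects(Image.open("photo.jpg"))
# 'results' holds parallel tensors of scores, labels, and boxes above the threshold.
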
def draw_boxes(image, results):
    """Draw bounding boxes on image"""
    img_array = np.array(image)

    detections = []
    object_counts = defaultdict(int)

    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        box = [round(i, 2) for i in box.tolist()]
        label_name = COCO_CLASSES[label.item()]

        if label_name != 'N/A':
            # Draw rectangle
            x1, y1, x2, y2 = map(int, box)
            cv2.rectangle(img_array, (x1, y1), (x2, y2), (0, 255, 0), 2)

            # Add label
            label_text = f"{label_name}: {score:.2f}"
            cv2.putText(img_array, label_text, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            detections.append(f"{label_name} ({score:.2%})")
            object_counts[label_name] += 1

    return img_array, detections, object_counts

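# Design note: img_array stays in RGB order here; (0, 255, 0) is green in both
# RGB and BGR, so the OpenCV drawing calls render the expected colour either way.
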
def process_static_image(image):
    """Process static image mode with auto-detection"""
    if image is None:
        return None, "Please upload an image"

    start_time = time.time()

    # Detect objects with auto-adjusted confidence
    results, pil_image, used_confidence = detect_objects(image, confidence_threshold=None)

    # Draw boxes
    annotated_image, detections, object_counts = draw_boxes(pil_image, results)

    processing_time = time.time() - start_time

    # Create detailed summary
    if detections:
        summary = "### 🎯 Detection Results\n\n"
        summary += f"**Found {len(detections)} objects in {processing_time:.2f} seconds**\n\n"
        summary += f"*Auto-adjusted confidence threshold: {used_confidence:.2f}*\n\n"
        summary += "#### Detected Objects:\n"

        # Group by object type
        for obj_name, count in sorted(object_counts.items(), key=lambda x: x[1], reverse=True):
            summary += f"- **{obj_name}**: {count} instance(s)\n"

        summary += "\n#### All Detections:\n"
        for i, d in enumerate(detections, 1):
            summary += f"{i}. {d}\n"
    else:
        summary = "### ⚠️ No objects detected\n\n"
        summary += f"*Confidence threshold used: {used_confidence:.2f}*\n\n"
        summary += "Try uploading a different image with more visible objects."

    return annotated_image, summary

def process_video(video_path, progress=gr.Progress()):
    """Process video mode with full auto-adjustment"""
    if video_path is None:
        return None, "Please upload a video"

    progress(0, desc="Analyzing video...")

    cap = cv2.VideoCapture(video_path)

    # Get video properties (fall back to 30 fps if the container reports 0,
    # which would otherwise break the duration maths and the video writer)
    fps = int(cap.get(cv2.CAP_PROP_FPS)) or 30
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    duration = total_frames / fps

    # Get available memory
    available_memory = get_available_memory()

    # Auto-calculate optimal frame interval
    frame_interval, estimated_frames = auto_calculate_frame_interval(
        total_frames, duration, available_memory
    )

    progress(0.1, desc=f"Processing video (sampling every {frame_interval} frames)...")

    # Auto-adjust confidence based on frame resolution
    confidence_threshold = auto_adjust_confidence((width, height))

    # Output video writer
    output_path = "output_video.mp4"
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

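    # Note: 'mp4v' is the most portable fourcc for OpenCV's bundled writer, but
    # the resulting file may not play inline in every browser; if your OpenCV
    # build ships an H.264 encoder, 'avc1' is a possible substitute (an
    # assumption, not tested here).
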
    frame_count = 0
    processed_count = 0
    object_tracker = defaultdict(int)
    start_time = time.time()

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Update progress
        if frame_count % 30 == 0:
            progress_pct = (frame_count / total_frames) * 0.8 + 0.1
            progress(progress_pct, desc=f"Processing frame {frame_count}/{total_frames}")

        # Process every nth frame
        if frame_count % frame_interval == 0:
            # Convert BGR to RGB for the detector
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Detect objects
            results, _, _ = detect_objects(rgb_frame, confidence_threshold)

            # Draw boxes directly on the BGR frame
            for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
                box = [round(i, 2) for i in box.tolist()]
                label_name = COCO_CLASSES[label.item()]

                if label_name != 'N/A':
                    x1, y1, x2, y2 = map(int, box)
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

                    label_text = f"{label_name}: {score:.2f}"
                    cv2.putText(frame, label_text, (x1, y1 - 10),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

                    object_tracker[label_name] += 1

            processed_count += 1

        out.write(frame)
        frame_count += 1

    cap.release()
    out.release()

    processing_time = time.time() - start_time

    progress(1.0, desc="Complete!")

    # Create detailed summary
    summary = "### 🎬 Video Processing Complete\n\n"
    summary += f"**Processing Time**: {processing_time:.2f} seconds\n\n"
    summary += "#### Video Information:\n"
    summary += f"- Duration: {duration:.2f} seconds\n"
    summary += f"- Total frames: {total_frames}\n"
    summary += f"- FPS: {fps}\n"
    summary += f"- Resolution: {width}x{height}\n\n"

    summary += "#### Auto-Optimization Settings:\n"
    summary += f"- Confidence threshold: {confidence_threshold:.2f} *(auto-adjusted)*\n"
    summary += f"- Frame interval: Every {frame_interval} frame(s) *(auto-calculated)*\n"
    summary += f"- Frames processed: {processed_count}/{total_frames}\n"
    summary += f"- Available memory: {available_memory:.2f} GB\n\n"

    if object_tracker:
        summary += "### 📊 Detected Objects Across Video:\n\n"
        for obj, count in sorted(object_tracker.items(), key=lambda x: x[1], reverse=True):
            summary += f"- **{obj}**: {count} detection(s)\n"
    else:
        summary += "⚠️ No objects detected in the video.\n"
        summary += "This might be due to low lighting, fast motion, or an absence of recognizable objects."

    return output_path, summary

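# Note: object_tracker counts raw per-frame detections, so an object that stays
# in view is counted once for every sampled frame it appears in; the summary
# reports "detection(s)" rather than distinct objects for this reason.
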
# Create Gradio interface
with gr.Blocks(title="AI Object Recognition System", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🤖 AI Object Recognition System
    ### Intelligent Auto-Adjusting Detection & Tracking

    This system **automatically optimizes** detection parameters based on:
    - Image/video size and complexity
    - Available system resources
    - Video duration and frame rate

    **No manual tuning required!**
    """)

    with gr.Tabs():
        # Static Image Tab
        with gr.Tab("📸 Static Mode - Image Detection"):
            gr.Markdown("""
            ### Automatic Image Analysis
            Upload any image and the system will:
            - Auto-adjust confidence thresholds
            - Detect all visible objects
            - Provide detailed statistics
            """)

            with gr.Row():
                with gr.Column():
                    static_input = gr.Image(type="numpy", label="Upload Image")
                    static_btn = gr.Button("🔍 Auto-Detect Objects", variant="primary", size="lg")
                    gr.Markdown("*The system will automatically optimize detection settings*")

                with gr.Column():
                    static_output = gr.Image(label="Detected Objects")
                    static_summary = gr.Markdown(label="Detection Results")

            static_btn.click(
                fn=process_static_image,
                inputs=[static_input],
                outputs=[static_output, static_summary]
            )

            # Sample images can be wired up via gr.Examples once example
            # files are bundled alongside the app.

        # Dynamic Video Tab
        with gr.Tab("🎥 Dynamic Mode - Video Detection"):
            gr.Markdown("""
            ### Automatic Video Analysis
            Upload a video and the system will:
            - Auto-calculate optimal frame sampling
            - Adjust confidence based on video quality
            - Optimize for available CPU resources
            - Track objects across frames

            **Supports videos of any length!** The system automatically scales processing.
            """)

            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(label="Upload Video")
                    video_btn = gr.Button("🎬 Auto-Process Video", variant="primary", size="lg")
                    gr.Markdown("""
                    *The system will automatically:*
                    - Analyze video properties
                    - Calculate optimal frame sampling
                    - Adjust detection thresholds
                    - Monitor system resources
                    """)

                with gr.Column():
                    video_output = gr.Video(label="Processed Video with Detections")
                    video_summary = gr.Markdown(label="Processing Results")

            video_btn.click(
                fn=process_video,
                inputs=[video_input],
                outputs=[video_output, video_summary]
            )

    gr.Markdown("""
    ---
    ## 🧠 How Auto-Adjustment Works

    ### Image Mode:
    - **Small images** (< 500K pixels): Lower confidence threshold (0.60) for more detections
    - **Medium images** (500K-2M pixels): Balanced threshold (0.65)
    - **Large images** (> 2M pixels): Higher threshold (0.70) to reduce false positives

    ### Video Mode:
    - **Short videos** (< 10s): Process more frames for detail
    - **Long videos** (> 60s): Smart sampling to maintain performance
    - **Memory-aware**: Adjusts based on available RAM
    - **Quality-adaptive**: Balances speed and accuracy automatically

    ### 📊 Technical Details:
    - **Model**: DETR ResNet-50 (Detection Transformer)
    - **Dataset**: COCO (80 object categories)
    - **Optimization**: CPU-friendly with intelligent resource management
    - **Supported Objects**: People, vehicles, animals, furniture, electronics, food, and more

    ### 💡 Tips:
    - The system works best with clear, well-lit images/videos
    - All adjustments happen automatically - just upload and click!
    - Processing time varies based on video length and system resources
    """)

if __name__ == "__main__":
    demo.launch()
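
# To run locally (assuming these dependencies; timm is needed for the DETR
# ResNet backbone in transformers):
#   pip install gradio transformers torch timm opencv-python-headless pillow psutil
#   python app.py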