shaheerawan3 committed on
Commit
b2cfca7
·
verified ·
1 Parent(s): 42daf90

Update app.py

Files changed (1)
app.py +352 -248
app.py CHANGED
@@ -1,287 +1,391 @@
  import gradio as gr
- from diffusers import StableDiffusionInstructPix2PixPipeline
- from transformers import YolosImageProcessor, YolosForObjectDetection, BlipProcessor, BlipForConditionalGeneration
- from PIL import Image, ImageDraw, ImageFont
  import torch
- import json
-
- # Global models
- pipe = None
- detector = None
- detector_processor = None
- captioner = None
- caption_processor = None
-
- # Dynamic color generator
- def generate_color(text):
-     """Generate consistent color from text using hash"""
-     hash_val = hash(text) % 360
-     return f"hsl({hash_val}, 70%, 55%)"
-
- # Dynamic category storage
- DETECTED_CATEGORIES = {}
-
- def load_models():
-     """Load all models"""
-     global pipe, detector, detector_processor, captioner, caption_processor
-
-     if pipe is None:
-         print("Loading image editor...")
-         pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
-             "timbrooks/instruct-pix2pix",
-             torch_dtype=torch.float16,
-             safety_checker=None
-         )
-         pipe.to("cuda" if torch.cuda.is_available() else "cpu")
-
-     if detector is None:
-         print("Loading object detector...")
-         detector_processor = YolosImageProcessor.from_pretrained('hustvl/yolos-tiny')
-         detector = YolosForObjectDetection.from_pretrained('hustvl/yolos-tiny')
-         detector.to("cuda" if torch.cuda.is_available() else "cpu")
-
-     if captioner is None:
-         print("Loading image captioner...")
-         caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-         captioner = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
-         captioner.to("cuda" if torch.cuda.is_available() else "cpu")
-
-     print("All models loaded!")
-
- def detect_objects(image):
-     """Detect objects in image with detailed info"""
-     load_models()
-
-     try:
-         # Detect objects
-         inputs = detector_processor(images=image, return_tensors="pt")
-         if torch.cuda.is_available():
-             inputs = {k: v.to("cuda") for k, v in inputs.items()}
-
-         outputs = detector(**inputs)
-         target_sizes = torch.tensor([image.size[::-1]])
-         results = detector_processor.post_process_object_detection(outputs, threshold=0.3, target_sizes=target_sizes)[0]
-
-         # Draw on image
-         draw = ImageDraw.Draw(image)
-         try:
-             font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 16)
-         except:
-             font = ImageFont.load_default()
-
-         detections = []
-
-         for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
-             box = [round(i, 2) for i in box.tolist()]
-             label_name = detector.config.id2label[label.item()]
-             confidence = round(score.item(), 3)
-
-             # Auto-generate category and color
-             category = label_name  # Use the label itself as category
-             color = generate_color(label_name)
-
-             # Store in dynamic dict
-             if category not in DETECTED_CATEGORIES:
-                 DETECTED_CATEGORIES[category] = color
-
-             # Draw box
-             draw.rectangle(box, outline=color, width=3)
-
-             # Draw label background
-             text = f"{label_name} {confidence:.0%}"
-             bbox = draw.textbbox((box[0], box[1]-20), text, font=font)
-             draw.rectangle([bbox[0]-2, bbox[1]-2, bbox[2]+2, bbox[3]+2], fill=color)
-             draw.text((box[0], box[1]-20), text, fill='white', font=font)
-
-             # Get specific info about this object
-             obj_image = image.crop(box)
-             obj_info = get_detailed_info(obj_image, label_name)
-
-             detections.append({
-                 'label': label_name,
-                 'category': category,
-                 'confidence': f"{confidence:.1%}",
-                 'bbox': box,
-                 'color': color,
-                 'details': obj_info
-             })
-
-         # Create HTML output with clickable objects
-         html_output = create_detection_html(detections)
-
-         return image, html_output, json.dumps(detections, indent=2)
-
-     except Exception as e:
-         print(f"Detection error: {e}")
-         import traceback
-         traceback.print_exc()
-         return image, f"<p>Error: {str(e)}</p>", "{}"
-
- def get_detailed_info(obj_image, label):
-     """Get detailed description of the detected object"""
-     try:
-         # Generate caption for the object
-         inputs = caption_processor(obj_image, return_tensors="pt")
-         if torch.cuda.is_available():
-             inputs = {k: v.to("cuda") for k, v in inputs.items()}
-
-         out = captioner.generate(**inputs, max_length=50)
-         caption = caption_processor.decode(out[0], skip_special_tokens=True)
-
-         # Create search URL
-         search_query = f"{label} {caption}".replace(' ', '+')
-         search_url = f"https://www.google.com/search?q={search_query}"
-
-         return {
-             'description': caption,
-             'search_url': search_url
-         }
-     except:
-         search_url = f"https://www.google.com/search?q={label.replace(' ', '+')}"
-         return {
-             'description': f"A {label}",
-             'search_url': search_url
-         }
-
- def create_detection_html(detections):
-     """Create interactive HTML with clickable detections"""
-     if not detections:
-         return "<p>No objects detected</p>"
-
-     html = """
-     <style>
-     .detection-container {font-family: Arial; padding: 10px;}
-     .detection-item {margin: 15px 0; padding: 15px; border-radius: 8px; border-left: 5px solid; cursor: pointer; transition: transform 0.2s;}
-     .detection-item:hover {transform: translateX(5px); box-shadow: 0 2px 8px rgba(0,0,0,0.1);}
-     .object-label {font-size: 18px; font-weight: bold; margin-bottom: 5px;}
-     .object-details {font-size: 14px; color: #555; margin: 5px 0;}
-     .object-category {display: inline-block; padding: 3px 10px; border-radius: 12px; font-size: 12px; color: white; margin-right: 10px;}
-     .search-link {color: #1a73e8; text-decoration: none; font-size: 13px;}
-     .search-link:hover {text-decoration: underline;}
-     </style>
-     <div class="detection-container">
-     """
-
-     # Group by category
-     by_category = {}
-     for det in detections:
-         cat = det['category']
-         if cat not in by_category:
-             by_category[cat] = []
-         by_category[cat].append(det)
-
-     for category, items in by_category.items():
-         color = generate_color(category)
-         html += f"<h3 style='color: {color}; text-transform: capitalize;'>{category}s ({len(items)})</h3>"
-
-         for det in items:
-             html += f"""
-             <div class="detection-item" style="border-left-color: {det['color']}; background: {det['color']}15;"
-                  onclick="window.open('{det['details']['search_url']}', '_blank')">
-                 <div class="object-label" style="color: {det['color']};">{det['label']}</div>
-                 <div class="object-details">
-                     <span class="object-category" style="background: {det['color']};">{det['category']}</span>
-                     <span>Confidence: {det['confidence']}</span>
-                 </div>
-                 <div class="object-details">{det['details']['description']}</div>
-                 <a href="{det['details']['search_url']}" target="_blank" class="search-link" onclick="event.stopPropagation();">
-                     🔍 Learn more about this {det['label']}
-                 </a>
-             </div>
-             """
-
-     html += "</div>"
-     return html
-
- def edit_image(input_image, edit_prompt, num_steps, guidance_scale, image_guidance_scale):
-     """Edit image"""
-     if input_image is None or not edit_prompt.strip():
-         return None, "❌ Provide image and prompt!"
-
-     try:
-         load_models()
-
-         # Resize
-         max_size = 512
-         if max(input_image.size) > max_size:
-             ratio = max_size / max(input_image.size)
-             new_size = tuple(int(dim * ratio) for dim in input_image.size)
-             input_image = input_image.resize(new_size, Image.Resampling.LANCZOS)
-
-         width = (input_image.width // 8) * 8
-         height = (input_image.height // 8) * 8
-         input_image = input_image.resize((width, height))
-
-         result = pipe(
-             edit_prompt,
-             image=input_image,
-             num_inference_steps=num_steps,
-             guidance_scale=guidance_scale,
-             image_guidance_scale=image_guidance_scale,
-         ).images[0]
-
-         return result, "✅ Done!"
-
-     except Exception as e:
-         return None, f"❌ Error: {str(e)}"
-
- # Build interface
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
-     gr.Markdown("# 🎨 AI Image Editor & Object Detector")
-
      with gr.Tabs():
-         with gr.Tab("🔍 Detect Objects"):
-             gr.Markdown("Upload an image to detect and identify objects with detailed information")
              with gr.Row():
                  with gr.Column():
-                     detect_input = gr.Image(label="Upload Image", type="pil")
-                     detect_btn = gr.Button("🔍 Detect Objects", variant="primary", size="lg")
  
                  with gr.Column():
-                     detect_output = gr.Image(label="Detected Objects")
  
-             detection_info = gr.HTML(label="Object Details (Click to learn more)")
-             detection_json = gr.JSON(label="Detection Data", visible=False)
-
-             detect_btn.click(
-                 fn=detect_objects,
-                 inputs=[detect_input],
-                 outputs=[detect_output, detection_info, detection_json]
              )
  
-         with gr.Tab("✏️ Edit Image"):
-             gr.Markdown("Edit images with text instructions")
              with gr.Row():
                  with gr.Column():
-                     edit_input = gr.Image(label="Upload Image", type="pil")
-                     edit_prompt = gr.Textbox(
-                         label="Instructions",
-                         placeholder="make it a painting, add snow, turn day into night...",
-                         lines=2
-                     )
-                     with gr.Accordion("Settings", open=False):
-                         num_steps = gr.Slider(10, 50, value=20, step=5, label="Steps")
-                         guidance_scale = gr.Slider(1, 10, value=7.5, step=0.5, label="Text Guidance")
-                         image_guidance_scale = gr.Slider(1, 2, value=1.5, step=0.1, label="Image Guidance")
-                     edit_btn = gr.Button("✨ Edit", variant="primary")
  
                  with gr.Column():
-                     edit_output = gr.Image(label="Result")
-                     edit_status = gr.Textbox(label="Status", interactive=False)
  
-             edit_btn.click(
-                 fn=edit_image,
-                 inputs=[edit_input, edit_prompt, num_steps, guidance_scale, image_guidance_scale],
-                 outputs=[edit_output, edit_status]
              )
  
      gr.Markdown("""
-     ### 🎯 Features:
-     - **Object Detection**: Identifies objects with bounding boxes and confidence scores
-     - **Categories**: Color-coded by type (vehicles, animals, people, etc.)
-     - **Detailed Info**: AI-generated descriptions for each object
-     - **Clickable Links**: Click any object to learn more about it
-     - **Image Editing**: Transform images with simple text instructions
      """)
  
- demo.launch()
  import gradio as gr
+ import cv2
+ import numpy as np
+ from PIL import Image
  import torch
+ from transformers import DetrImageProcessor, DetrForObjectDetection
+ from collections import defaultdict
+ import time
+ import psutil
+ import os
  
+ # Load DETR model (optimized for CPU)
+ print("Loading DETR model...")
+ processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
+ model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
+ model.eval()
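+ # eval() disables dropout so inference is deterministic; from_pretrained downloads
+ # the weights once and serves them from the local Hugging Face cache afterwards.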
 
+ # COCO class labels
+ COCO_CLASSES = [
+     'N/A', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
+     'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A',
+     'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
+     'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack',
+     'umbrella', 'N/A', 'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
+     'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
+     'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass',
+     'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
+     'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
+     'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A',
+     'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
+     'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A',
+     'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
+ ]
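+ # The 'N/A' entries fill gaps in COCO's original 91-id scheme; the same mapping
+ # ships with the checkpoint as model.config.id2label.
  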
+ def get_available_memory():
+     """Get available system memory in GB"""
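+     # psutil's "available" is memory a process can claim without pushing the system into swap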
+     return psutil.virtual_memory().available / (1024 ** 3)
  
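+ # Heuristic: bigger frames yield more candidate boxes, so the threshold scales up
+ # with pixel count to curb false positives; num_objects_hint is accepted but unused.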
+ def auto_adjust_confidence(image_size, num_objects_hint=None):
+     """Dynamically adjust confidence based on image complexity"""
+     pixels = image_size[0] * image_size[1]
+
+     # Base confidence on image size
+     if pixels < 500000:  # Small image
+         base_confidence = 0.6
+     elif pixels < 2000000:  # Medium image
+         base_confidence = 0.65
+     else:  # Large image
+         base_confidence = 0.7
+
+     return base_confidence
  
+ def auto_calculate_frame_interval(total_frames, video_duration, available_memory_gb):
+     """Dynamically calculate optimal frame interval based on video properties and system resources"""
+
+     # Base calculations
+     fps = total_frames / video_duration if video_duration > 0 else 30
+
+     # Memory-based adjustment
+     if available_memory_gb < 2:
+         memory_factor = 3
+     elif available_memory_gb < 4:
+         memory_factor = 2
+     else:
+         memory_factor = 1
+
+     # Duration-based adjustment
+     if video_duration < 10:
+         duration_factor = 1
+     elif video_duration < 30:
+         duration_factor = 2
+     elif video_duration < 60:
+         duration_factor = 3
+     else:
+         duration_factor = 4
+
+     # Calculate optimal frames to process
+     target_frames = min(150, max(30, total_frames // (memory_factor * duration_factor)))
+
+     # Calculate interval
+     interval = max(1, total_frames // target_frames)
+
+     return interval, target_frames
  
85
+ def detect_objects(image, confidence_threshold=None):
86
+ """Detect objects in a single image with dynamic confidence"""
87
+ # Convert to RGB if needed
88
+ if isinstance(image, np.ndarray):
89
+ image = Image.fromarray(image)
90
+
91
+ # Auto-adjust confidence if not provided
92
+ if confidence_threshold is None:
93
+ confidence_threshold = auto_adjust_confidence(image.size)
94
+
95
+ # Prepare image
96
+ inputs = processor(images=image, return_tensors="pt")
97
+
98
+ # Inference
99
+ with torch.no_grad():
100
+ outputs = model(**inputs)
101
+
102
+ # Post-process
103
+ target_sizes = torch.tensor([image.size[::-1]])
104
+ results = processor.post_process_object_detection(
105
+ outputs, target_sizes=target_sizes, threshold=confidence_threshold
106
+ )[0]
107
+
108
+ return results, image, confidence_threshold
109
+
110
+ def draw_boxes(image, results):
111
+ """Draw bounding boxes on image"""
112
+ img_array = np.array(image)
113
+
114
+ detections = []
115
+ object_counts = defaultdict(int)
116
+
117
+ for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
118
+ box = [round(i, 2) for i in box.tolist()]
119
+ label_name = COCO_CLASSES[label.item()]
120
 
121
+ if label_name != 'N/A':
122
+ # Draw rectangle
123
+ x1, y1, x2, y2 = map(int, box)
124
+ cv2.rectangle(img_array, (x1, y1), (x2, y2), (0, 255, 0), 2)
 
 
 
 
 
 
 
 
 
 
 
125
 
126
+ # Add label
127
+ label_text = f"{label_name}: {score:.2f}"
128
+ cv2.putText(img_array, label_text, (x1, y1-10),
129
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
 
130
 
131
+ detections.append(f"{label_name} ({score:.2%})")
132
+ object_counts[label_name] += 1
133
+
134
+ return img_array, detections, object_counts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
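+ # np.array(image) copies the PIL RGB data, so OpenCV draws on an RGB array here;
+ # (0, 255, 0) is green in both RGB and BGR, and Gradio expects RGB output.
  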
+ def process_static_image(image):
+     """Process static image mode with auto-detection"""
+     if image is None:
+         return None, "Please upload an image"
+
+     start_time = time.time()
+
+     # Detect objects with auto-adjusted confidence
+     results, pil_image, used_confidence = detect_objects(image, confidence_threshold=None)
+
+     # Draw boxes
+     annotated_image, detections, object_counts = draw_boxes(pil_image, results)
+
+     processing_time = time.time() - start_time
+
+     # Create detailed summary
+     if detections:
+         summary = f"### 🎯 Detection Results\n\n"
+         summary += f"**Found {len(detections)} objects in {processing_time:.2f} seconds**\n\n"
+         summary += f"*Auto-adjusted confidence threshold: {used_confidence:.2f}*\n\n"
+         summary += "#### Detected Objects:\n"
+
+         # Group by object type
+         for obj_name, count in sorted(object_counts.items(), key=lambda x: x[1], reverse=True):
+             summary += f"- **{obj_name}**: {count} instance(s)\n"
+
+         summary += f"\n#### All Detections:\n"
+         for i, d in enumerate(detections, 1):
+             summary += f"{i}. {d}\n"
+     else:
+         summary = f"### ⚠️ No objects detected\n\n"
+         summary += f"*Confidence threshold used: {used_confidence:.2f}*\n\n"
+         summary += "Try uploading a different image with more visible objects."
+
+     return annotated_image, summary
  
+ def process_video(video_path, progress=gr.Progress()):
+     """Process video mode with full auto-adjustment"""
+     if video_path is None:
+         return None, "Please upload a video"
+
+     progress(0, desc="Analyzing video...")
+
+     cap = cv2.VideoCapture(video_path)
+
+     # Get video properties
+     fps = int(cap.get(cv2.CAP_PROP_FPS))
+     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+     width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+     height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+     duration = total_frames / fps if fps > 0 else 0
+
+     # Get available memory
+     available_memory = get_available_memory()
+
+     # Auto-calculate optimal frame interval
+     frame_interval, estimated_frames = auto_calculate_frame_interval(
+         total_frames, duration, available_memory
+     )
+
+     progress(0.1, desc=f"Processing video (sampling every {frame_interval} frames)...")
+
+     # Auto-adjust confidence based on video properties
+     frame_size = width * height
+     confidence_threshold = auto_adjust_confidence((width, height))
+
+     # Output video writer
+     output_path = "output_video.mp4"
204
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
205
+ out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
206
+
207
+ frame_count = 0
208
+ processed_count = 0
209
+ object_tracker = defaultdict(int)
210
+ start_time = time.time()
211
+
212
+ while cap.isOpened():
213
+ ret, frame = cap.read()
214
+ if not ret:
215
+ break
216
 
217
+ # Update progress
218
+ if frame_count % 30 == 0:
219
+ progress_pct = (frame_count / total_frames) * 0.8 + 0.1
220
+ progress(progress_pct, desc=f"Processing frame {frame_count}/{total_frames}")
 
 
 
221
 
222
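+         # Unsampled frames are written through unannotated below; this keeps the
+         # original fps and duration, at the cost of boxes blinking between samples.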
+         # Process every nth frame
+         if frame_count % frame_interval == 0:
+             # Convert BGR to RGB
+             rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+
+             # Detect objects
+             results, _, _ = detect_objects(rgb_frame, confidence_threshold)
+
+             # Draw boxes
+             for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+                 box = [round(i, 2) for i in box.tolist()]
+                 label_name = COCO_CLASSES[label.item()]
+
+                 if label_name != 'N/A':
+                     x1, y1, x2, y2 = map(int, box)
+                     cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
+
+                     label_text = f"{label_name}: {score:.2f}"
+                     cv2.putText(frame, label_text, (x1, y1-10),
+                                 cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
+
+                     object_tracker[label_name] += 1
+
+             processed_count += 1
+
+         out.write(frame)
+         frame_count += 1
+
+     cap.release()
+     out.release()
+
+     processing_time = time.time() - start_time
+
+     progress(1.0, desc="Complete!")
+
+     # Create detailed summary
+     summary = f"### 🎬 Video Processing Complete\n\n"
+     summary += f"**Processing Time**: {processing_time:.2f} seconds\n\n"
+     summary += "#### Video Information:\n"
+     summary += f"- Duration: {duration:.2f} seconds\n"
+     summary += f"- Total frames: {total_frames}\n"
+     summary += f"- FPS: {fps}\n"
+     summary += f"- Resolution: {width}x{height}\n\n"
+
+     summary += "#### Auto-Optimization Settings:\n"
+     summary += f"- Confidence threshold: {confidence_threshold:.2f} *(auto-adjusted)*\n"
+     summary += f"- Frame interval: Every {frame_interval} frame(s) *(auto-calculated)*\n"
+     summary += f"- Frames processed: {processed_count}/{total_frames}\n"
+     summary += f"- Available memory: {available_memory:.2f} GB\n\n"
+
+     if object_tracker:
+         summary += "### 📊 Detected Objects Across Video:\n\n"
+         for obj, count in sorted(object_tracker.items(), key=lambda x: x[1], reverse=True):
+             summary += f"- **{obj}**: {count} detection(s)\n"
+     else:
+         summary += "⚠️ No objects detected in the video.\n"
+         summary += "This might be due to low lighting, fast motion, or absence of recognizable objects."
+
+     return output_path, summary
  
+ # Create Gradio interface
+ with gr.Blocks(title="AI Object Recognition System", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("""
+     # 🤖 AI Object Recognition System
+     ### Intelligent Auto-Adjusting Detection & Tracking
+
+     This system **automatically optimizes** detection parameters based on:
+     - Image/video size and complexity
+     - Available system resources
+     - Video duration and frame rate
+
+     **No manual tuning required!**
+     """)
  
      with gr.Tabs():
+         # Static Image Tab
+         with gr.Tab("📸 Static Mode - Image Detection"):
+             gr.Markdown("""
+             ### Automatic Image Analysis
+             Upload any image and the system will:
+             - Auto-adjust confidence thresholds
+             - Detect all visible objects
+             - Provide detailed statistics
+             """)
+
              with gr.Row():
                  with gr.Column():
+                     static_input = gr.Image(type="numpy", label="Upload Image")
+                     static_btn = gr.Button("🔍 Auto-Detect Objects", variant="primary", size="lg")
+                     gr.Markdown("*The system will automatically optimize detection settings*")
  
                  with gr.Column():
+                     static_output = gr.Image(label="Detected Objects")
+                     static_summary = gr.Markdown(label="Detection Results")
  
+             static_btn.click(
+                 fn=process_static_image,
+                 inputs=[static_input],
+                 outputs=[static_output, static_summary]
+             )
  
+             gr.Examples(
+                 examples=[],
+                 inputs=static_input,
+                 label="Try these examples (upload your own images)"
              )
  
+         # Dynamic Video Tab
+         with gr.Tab("🎥 Dynamic Mode - Video Detection"):
+             gr.Markdown("""
+             ### Automatic Video Analysis
+             Upload a video and the system will:
+             - Auto-calculate optimal frame sampling
+             - Adjust confidence based on video quality
+             - Optimize for available CPU resources
+             - Track objects across frames
+
+             **Supports videos of any length!** The system automatically scales processing.
+             """)
+
              with gr.Row():
                  with gr.Column():
+                     video_input = gr.Video(label="Upload Video")
+                     video_btn = gr.Button("🎬 Auto-Process Video", variant="primary", size="lg")
+                     gr.Markdown("""
+                     *The system will automatically:*
+                     - Analyze video properties
+                     - Calculate optimal frame sampling
+                     - Adjust detection thresholds
+                     - Monitor system resources
+                     """)
  
                  with gr.Column():
+                     video_output = gr.Video(label="Processed Video with Detections")
+                     video_summary = gr.Markdown(label="Processing Results")
  
+             video_btn.click(
+                 fn=process_video,
+                 inputs=[video_input],
+                 outputs=[video_output, video_summary]
              )
  
      gr.Markdown("""
+     ---
+     ## 🧠 How Auto-Adjustment Works
+
+     ### Image Mode:
+     - **Small images** (< 500K pixels): Lower confidence threshold for more detections
+     - **Large images** (> 2M pixels): Higher threshold to reduce false positives
+
+     ### Video Mode:
+     - **Short videos** (< 10s): Process more frames for detail
+     - **Long videos** (> 60s): Smart sampling to maintain performance
+     - **Memory-aware**: Adjusts based on available RAM
+     - **Quality-adaptive**: Balances speed vs accuracy automatically
+
+     ### 📊 Technical Details:
+     - **Model**: DETR ResNet-50 (Detection Transformer)
+     - **Dataset**: COCO (80 object categories)
+     - **Optimization**: CPU-friendly with intelligent resource management
+     - **Supported Objects**: People, vehicles, animals, furniture, electronics, food, and more
+
+     ### 💡 Tips:
+     - The system works best with clear, well-lit images/videos
+     - All adjustments happen automatically - just upload and click!
+     - Processing time varies based on video length and system resources
      """)
  
+ if __name__ == "__main__":
+     demo.launch()
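+     # Optional: calling demo.queue() before launch() serializes long video jobs on shared hardware.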