shaheerawan3 committed
Commit c4a896d · verified · 1 Parent(s): 6973752

Update app.py

Files changed (1):
  1. app.py +248 -352
app.py CHANGED
@@ -1,391 +1,287 @@
  import gradio as gr
- import cv2
- import numpy as np
- from PIL import Image
  import torch
- from transformers import DetrImageProcessor, DetrForObjectDetection
- from collections import defaultdict
- import time
- import psutil
- import os

- # Load DETR model (optimized for CPU)
- print("Loading DETR model...")
- processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
- model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
- model.eval()

- # COCO class labels
- COCO_CLASSES = [
-     'N/A', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
-     'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A',
-     'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
-     'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack',
-     'umbrella', 'N/A', 'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
-     'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
-     'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass',
-     'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
-     'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
-     'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A',
-     'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
-     'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A',
-     'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
- ]

- def get_available_memory():
-     """Get available system memory in GB"""
-     return psutil.virtual_memory().available / (1024 ** 3)

- def auto_adjust_confidence(image_size, num_objects_hint=None):
-     """Dynamically adjust confidence based on image complexity"""
-     pixels = image_size[0] * image_size[1]
-
-     # Base confidence on image size
-     if pixels < 500000:  # Small image
-         base_confidence = 0.6
-     elif pixels < 2000000:  # Medium image
-         base_confidence = 0.65
-     else:  # Large image
-         base_confidence = 0.7
-
-     return base_confidence
-
- def auto_calculate_frame_interval(total_frames, video_duration, available_memory_gb):
-     """Dynamically calculate optimal frame interval based on video properties and system resources"""
-
-     # Base calculations
-     fps = total_frames / video_duration if video_duration > 0 else 30
-
-     # Memory-based adjustment
-     if available_memory_gb < 2:
-         memory_factor = 3
-     elif available_memory_gb < 4:
-         memory_factor = 2
-     else:
-         memory_factor = 1
-
-     # Duration-based adjustment
-     if video_duration < 10:
-         duration_factor = 1
-     elif video_duration < 30:
-         duration_factor = 2
-     elif video_duration < 60:
-         duration_factor = 3
-     else:
-         duration_factor = 4
-
-     # Calculate optimal frames to process
-     target_frames = min(150, max(30, total_frames // (memory_factor * duration_factor)))
-
-     # Calculate interval
-     interval = max(1, total_frames // target_frames)
-
-     return interval, target_frames

- def detect_objects(image, confidence_threshold=None):
-     """Detect objects in a single image with dynamic confidence"""
-     # Convert to RGB if needed
-     if isinstance(image, np.ndarray):
-         image = Image.fromarray(image)
-
-     # Auto-adjust confidence if not provided
-     if confidence_threshold is None:
-         confidence_threshold = auto_adjust_confidence(image.size)
-
-     # Prepare image
-     inputs = processor(images=image, return_tensors="pt")
-
-     # Inference
-     with torch.no_grad():
-         outputs = model(**inputs)
-
-     # Post-process
-     target_sizes = torch.tensor([image.size[::-1]])
-     results = processor.post_process_object_detection(
-         outputs, target_sizes=target_sizes, threshold=confidence_threshold
-     )[0]
-
-     return results, image, confidence_threshold
-
- def draw_boxes(image, results):
-     """Draw bounding boxes on image"""
-     img_array = np.array(image)
-
-     detections = []
-     object_counts = defaultdict(int)
-
-     for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
-         box = [round(i, 2) for i in box.tolist()]
-         label_name = COCO_CLASSES[label.item()]

-         if label_name != 'N/A':
-             # Draw rectangle
-             x1, y1, x2, y2 = map(int, box)
-             cv2.rectangle(img_array, (x1, y1), (x2, y2), (0, 255, 0), 2)

-             # Add label
-             label_text = f"{label_name}: {score:.2f}"
-             cv2.putText(img_array, label_text, (x1, y1-10),
-                         cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

-             detections.append(f"{label_name} ({score:.2%})")
-             object_counts[label_name] += 1
-
-     return img_array, detections, object_counts

- def process_static_image(image):
-     """Process static image mode with auto-detection"""
-     if image is None:
-         return None, "Please upload an image"
-
-     start_time = time.time()
-
-     # Detect objects with auto-adjusted confidence
-     results, pil_image, used_confidence = detect_objects(image, confidence_threshold=None)
-
-     # Draw boxes
-     annotated_image, detections, object_counts = draw_boxes(pil_image, results)
-
-     processing_time = time.time() - start_time
-
-     # Create detailed summary
-     if detections:
-         summary = f"### 🎯 Detection Results\n\n"
-         summary += f"**Found {len(detections)} objects in {processing_time:.2f} seconds**\n\n"
-         summary += f"*Auto-adjusted confidence threshold: {used_confidence:.2f}*\n\n"
-         summary += "#### Detected Objects:\n"

-         # Group by object type
-         for obj_name, count in sorted(object_counts.items(), key=lambda x: x[1], reverse=True):
-             summary += f"- **{obj_name}**: {count} instance(s)\n"

-         summary += f"\n#### All Detections:\n"
-         for i, d in enumerate(detections, 1):
-             summary += f"{i}. {d}\n"
-     else:
-         summary = f"### ⚠️ No objects detected\n\n"
-         summary += f"*Confidence threshold used: {used_confidence:.2f}*\n\n"
-         summary += "Try uploading a different image with more visible objects."
-
-     return annotated_image, summary

- def process_video(video_path, progress=gr.Progress()):
-     """Process video mode with full auto-adjustment"""
-     if video_path is None:
-         return None, "Please upload a video"
-
-     progress(0, desc="Analyzing video...")
-
-     cap = cv2.VideoCapture(video_path)
-
-     # Get video properties
-     fps = int(cap.get(cv2.CAP_PROP_FPS))
-     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-     width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-     height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-     duration = total_frames / fps if fps > 0 else 0
-
-     # Get available memory
-     available_memory = get_available_memory()
-
-     # Auto-calculate optimal frame interval
-     frame_interval, estimated_frames = auto_calculate_frame_interval(
-         total_frames, duration, available_memory
-     )
-
-     progress(0.1, desc=f"Processing video (sampling every {frame_interval} frames)...")
-
-     # Auto-adjust confidence based on video properties
-     frame_size = width * height
-     confidence_threshold = auto_adjust_confidence((width, height))
-
-     # Output video writer
-     output_path = "output_video.mp4"
-     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-     out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
-
-     frame_count = 0
-     processed_count = 0
-     object_tracker = defaultdict(int)
-     start_time = time.time()

-     while cap.isOpened():
-         ret, frame = cap.read()
-         if not ret:
-             break

-         # Update progress
-         if frame_count % 30 == 0:
-             progress_pct = (frame_count / total_frames) * 0.8 + 0.1
-             progress(progress_pct, desc=f"Processing frame {frame_count}/{total_frames}")

-         # Process every nth frame
-         if frame_count % frame_interval == 0:
-             # Convert BGR to RGB
-             rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-
-             # Detect objects
-             results, _, _ = detect_objects(rgb_frame, confidence_threshold)
-
-             # Draw boxes
-             for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
-                 box = [round(i, 2) for i in box.tolist()]
-                 label_name = COCO_CLASSES[label.item()]
-
-                 if label_name != 'N/A':
-                     x1, y1, x2, y2 = map(int, box)
-                     cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
-
-                     label_text = f"{label_name}: {score:.2f}"
-                     cv2.putText(frame, label_text, (x1, y1-10),
-                                 cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
-
-                     object_tracker[label_name] += 1
-
-             processed_count += 1

-         out.write(frame)
-         frame_count += 1
-
-     cap.release()
-     out.release()
-
-     processing_time = time.time() - start_time
-
-     progress(1.0, desc="Complete!")
-
-     # Create detailed summary
-     summary = f"### 🎬 Video Processing Complete\n\n"
-     summary += f"**Processing Time**: {processing_time:.2f} seconds\n\n"
-     summary += "#### Video Information:\n"
-     summary += f"- Duration: {duration:.2f} seconds\n"
-     summary += f"- Total frames: {total_frames}\n"
-     summary += f"- FPS: {fps}\n"
-     summary += f"- Resolution: {width}x{height}\n\n"
-
-     summary += "#### Auto-Optimization Settings:\n"
-     summary += f"- Confidence threshold: {confidence_threshold:.2f} *(auto-adjusted)*\n"
-     summary += f"- Frame interval: Every {frame_interval} frame(s) *(auto-calculated)*\n"
-     summary += f"- Frames processed: {processed_count}/{total_frames}\n"
-     summary += f"- Available memory: {available_memory:.2f} GB\n\n"
-
-     if object_tracker:
-         summary += "### 📊 Detected Objects Across Video:\n\n"
-         for obj, count in sorted(object_tracker.items(), key=lambda x: x[1], reverse=True):
-             summary += f"- **{obj}**: {count} detection(s)\n"
-     else:
-         summary += "⚠️ No objects detected in the video.\n"
-         summary += "This might be due to low lighting, fast motion, or absence of recognizable objects."
-
-     return output_path, summary

- # Create Gradio interface
- with gr.Blocks(title="AI Object Recognition System", theme=gr.themes.Soft()) as demo:
-     gr.Markdown("""
-     # 🤖 AI Object Recognition System
-     ### Intelligent Auto-Adjusting Detection & Tracking
-
-     This system **automatically optimizes** detection parameters based on:
-     - Image/video size and complexity
-     - Available system resources
-     - Video duration and frame rate
-
-     **No manual tuning required!**
-     """)

      with gr.Tabs():
-         # Static Image Tab
-         with gr.Tab("📸 Static Mode - Image Detection"):
-             gr.Markdown("""
-             ### Automatic Image Analysis
-             Upload any image and the system will:
-             - Auto-adjust confidence thresholds
-             - Detect all visible objects
-             - Provide detailed statistics
-             """)
-
              with gr.Row():
                  with gr.Column():
-                     static_input = gr.Image(type="numpy", label="Upload Image")
-                     static_btn = gr.Button("🔍 Auto-Detect Objects", variant="primary", size="lg")
-                     gr.Markdown("*The system will automatically optimize detection settings*")

                  with gr.Column():
-                     static_output = gr.Image(label="Detected Objects")
-                     static_summary = gr.Markdown(label="Detection Results")

-             static_btn.click(
-                 fn=process_static_image,
-                 inputs=[static_input],
-                 outputs=[static_output, static_summary]
-             )

-             gr.Examples(
-                 examples=[],
-                 inputs=static_input,
-                 label="Try these examples (upload your own images)"
              )

-         # Dynamic Video Tab
-         with gr.Tab("🎥 Dynamic Mode - Video Detection"):
-             gr.Markdown("""
-             ### Automatic Video Analysis
-             Upload a video and the system will:
-             - Auto-calculate optimal frame sampling
-             - Adjust confidence based on video quality
-             - Optimize for available CPU resources
-             - Track objects across frames
-
-             **Supports videos of any length!** The system automatically scales processing.
-             """)
-
              with gr.Row():
                  with gr.Column():
-                     video_input = gr.Video(label="Upload Video")
-                     video_btn = gr.Button("🎬 Auto-Process Video", variant="primary", size="lg")
-                     gr.Markdown("""
-                     *The system will automatically:*
-                     - Analyze video properties
-                     - Calculate optimal frame sampling
-                     - Adjust detection thresholds
-                     - Monitor system resources
-                     """)

                  with gr.Column():
-                     video_output = gr.Video(label="Processed Video with Detections")
-                     video_summary = gr.Markdown(label="Processing Results")

-             video_btn.click(
-                 fn=process_video,
-                 inputs=[video_input],
-                 outputs=[video_output, video_summary]
              )

      gr.Markdown("""
-     ---
-     ## 🧠 How Auto-Adjustment Works
-
-     ### Image Mode:
-     - **Small images** (< 500K pixels): Lower confidence threshold for more detections
-     - **Large images** (> 2M pixels): Higher threshold to reduce false positives
-
-     ### Video Mode:
-     - **Short videos** (< 10s): Process more frames for detail
-     - **Long videos** (> 60s): Smart sampling to maintain performance
-     - **Memory-aware**: Adjusts based on available RAM
-     - **Quality-adaptive**: Balances speed vs accuracy automatically
-
-     ### 📊 Technical Details:
-     - **Model**: DETR ResNet-50 (Detection Transformer)
-     - **Dataset**: COCO (80+ object categories)
-     - **Optimization**: CPU-friendly with intelligent resource management
-     - **Supported Objects**: People, vehicles, animals, furniture, electronics, food, and more
-
-     ### 💡 Tips:
-     - The system works best with clear, well-lit images/videos
-     - All adjustments happen automatically - just upload and click!
-     - Processing time varies based on video length and system resources
      """)

- if __name__ == "__main__":
-     demo.launch()
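For context on what was removed: the old version's sampling heuristic is easiest to see with concrete numbers. The sketch below walks `auto_calculate_frame_interval` through a mid-sized clip; the inputs are illustrative, not values from the commit.

```python
# Worked example of the removed auto_calculate_frame_interval heuristic.
# Assumed inputs: a 45 s clip at 30 fps with about 3 GB of RAM free.
total_frames = 1350          # 45 s * 30 fps
memory_factor = 2            # branch for 2 GB <= available < 4 GB
duration_factor = 3          # branch for 30 s <= duration < 60 s

target_frames = min(150, max(30, total_frames // (memory_factor * duration_factor)))
interval = max(1, total_frames // target_frames)
print(target_frames, interval)  # 150 9 -> DETR ran on every 9th frame, ~150 frames total
```

So detector passes were capped at roughly 150 per video regardless of length, which is what kept the old CPU-only pipeline tractable.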
 
  import gradio as gr
+ from diffusers import StableDiffusionInstructPix2PixPipeline
+ from transformers import YolosImageProcessor, YolosForObjectDetection, BlipProcessor, BlipForConditionalGeneration
+ from PIL import Image, ImageDraw, ImageFont
  import torch
+ import json

+ # Global models
+ pipe = None
+ detector = None
+ detector_processor = None
+ captioner = None
+ caption_processor = None

+ # Dynamic color generator
+ def generate_color(text):
+     """Generate consistent color from text using hash"""
+     hash_val = hash(text) % 360
+     return f"hsl({hash_val}, 70%, 55%)"

+ # Dynamic category storage
+ DETECTED_CATEGORIES = {}

+ def load_models():
+     """Load all models"""
+     global pipe, detector, detector_processor, captioner, caption_processor
+
+     if pipe is None:
+         print("Loading image editor...")
+         pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
+             "timbrooks/instruct-pix2pix",
+             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+             safety_checker=None
+         )
+         pipe.to("cuda" if torch.cuda.is_available() else "cpu")
+
+     if detector is None:
+         print("Loading object detector...")
+         detector_processor = YolosImageProcessor.from_pretrained('hustvl/yolos-tiny')
+         detector = YolosForObjectDetection.from_pretrained('hustvl/yolos-tiny')
+         detector.to("cuda" if torch.cuda.is_available() else "cpu")
+
+     if captioner is None:
+         print("Loading image captioner...")
+         caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+         captioner = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+         captioner.to("cuda" if torch.cuda.is_available() else "cpu")
+
+     print("All models loaded!")
 
+ def detect_objects(image):
+     """Detect objects in image with detailed info"""
+     load_models()
+
+     try:
+         # Detect objects
+         inputs = detector_processor(images=image, return_tensors="pt")
+         if torch.cuda.is_available():
+             inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+         outputs = detector(**inputs)
+         target_sizes = torch.tensor([image.size[::-1]])
+         results = detector_processor.post_process_object_detection(outputs, threshold=0.3, target_sizes=target_sizes)[0]

+         # Draw on image
+         draw = ImageDraw.Draw(image)
+         try:
+             font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 16)
+         except Exception:
+             font = ImageFont.load_default()
+
+         detections = []
+
+         for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+             box = [round(i, 2) for i in box.tolist()]
+             label_name = detector.config.id2label[label.item()]
+             confidence = round(score.item(), 3)

+             # Auto-generate category and color
+             category = label_name  # Use the label itself as category
+             color = generate_color(label_name)

+             # Store in dynamic dict
+             if category not in DETECTED_CATEGORIES:
+                 DETECTED_CATEGORIES[category] = color
+
+             # Draw box
+             draw.rectangle(box, outline=color, width=3)
+
+             # Draw label background
+             text = f"{label_name} {confidence:.0%}"
+             bbox = draw.textbbox((box[0], box[1]-20), text, font=font)
+             draw.rectangle([bbox[0]-2, bbox[1]-2, bbox[2]+2, bbox[3]+2], fill=color)
+             draw.text((box[0], box[1]-20), text, fill='white', font=font)
+
+             # Get specific info about this object
+             obj_image = image.crop(box)
+             obj_info = get_detailed_info(obj_image, label_name)
+
+             detections.append({
+                 'label': label_name,
+                 'category': category,
+                 'confidence': f"{confidence:.1%}",
+                 'bbox': box,
+                 'color': color,
+                 'details': obj_info
+             })
+
+         # Create HTML output with clickable objects
+         html_output = create_detection_html(detections)
+
+         return image, html_output, json.dumps(detections, indent=2)
+
+     except Exception as e:
+         print(f"Detection error: {e}")
+         import traceback
+         traceback.print_exc()
+         return image, f"<p>Error: {str(e)}</p>", "{}"
 
+ def get_detailed_info(obj_image, label):
+     """Get detailed description of the detected object"""
+     try:
+         # Generate caption for the object
+         inputs = caption_processor(obj_image, return_tensors="pt")
+         if torch.cuda.is_available():
+             inputs = {k: v.to("cuda") for k, v in inputs.items()}

+         out = captioner.generate(**inputs, max_length=50)
+         caption = caption_processor.decode(out[0], skip_special_tokens=True)

+         # Create search URL
+         search_query = f"{label} {caption}".replace(' ', '+')
+         search_url = f"https://www.google.com/search?q={search_query}"
+
+         return {
+             'description': caption,
+             'search_url': search_url
+         }
+     except Exception:
+         search_url = f"https://www.google.com/search?q={label.replace(' ', '+')}"
+         return {
+             'description': f"A {label}",
+             'search_url': search_url
+         }
 
+ def create_detection_html(detections):
+     """Create interactive HTML with clickable detections"""
+     if not detections:
+         return "<p>No objects detected</p>"
+
+     html = """
+     <style>
+     .detection-container {font-family: Arial; padding: 10px;}
+     .detection-item {margin: 15px 0; padding: 15px; border-radius: 8px; border-left: 5px solid; cursor: pointer; transition: transform 0.2s;}
+     .detection-item:hover {transform: translateX(5px); box-shadow: 0 2px 8px rgba(0,0,0,0.1);}
+     .object-label {font-size: 18px; font-weight: bold; margin-bottom: 5px;}
+     .object-details {font-size: 14px; color: #555; margin: 5px 0;}
+     .object-category {display: inline-block; padding: 3px 10px; border-radius: 12px; font-size: 12px; color: white; margin-right: 10px;}
+     .search-link {color: #1a73e8; text-decoration: none; font-size: 13px;}
+     .search-link:hover {text-decoration: underline;}
+     </style>
+     <div class="detection-container">
+     """
+
+     # Group by category
+     by_category = {}
+     for det in detections:
+         cat = det['category']
+         if cat not in by_category:
+             by_category[cat] = []
+         by_category[cat].append(det)
+
+     for category, items in by_category.items():
+         color = generate_color(category)
+         html += f"<h3 style='color: {color}; text-transform: capitalize;'>{category}s ({len(items)})</h3>"
+
+         for det in items:
+             # hsl() strings take no hex alpha suffix, so build an hsla() tint
+             translucent = det['color'].replace('hsl', 'hsla').replace(')', ', 0.08)')
+             html += f"""
+             <div class="detection-item" style="border-left-color: {det['color']}; background: {translucent};"
+                  onclick="window.open('{det['details']['search_url']}', '_blank')">
+                 <div class="object-label" style="color: {det['color']};">{det['label']}</div>
+                 <div class="object-details">
+                     <span class="object-category" style="background: {det['color']};">{det['category']}</span>
+                     <span>Confidence: {det['confidence']}</span>
+                 </div>
+                 <div class="object-details">{det['details']['description']}</div>
+                 <a href="{det['details']['search_url']}" target="_blank" class="search-link" onclick="event.stopPropagation();">
+                     🔍 Learn more about this {det['label']}
+                 </a>
+             </div>
+             """
+
+     html += "</div>"
+     return html
+
+ def edit_image(input_image, edit_prompt, num_steps, guidance_scale, image_guidance_scale):
+     """Edit image"""
+     if input_image is None or not edit_prompt.strip():
+         return None, "❌ Provide image and prompt!"

+     try:
+         load_models()

+         # Resize
+         max_size = 512
+         if max(input_image.size) > max_size:
+             ratio = max_size / max(input_image.size)
+             new_size = tuple(int(dim * ratio) for dim in input_image.size)
+             input_image = input_image.resize(new_size, Image.Resampling.LANCZOS)

+         width = (input_image.width // 8) * 8
+         height = (input_image.height // 8) * 8
+         input_image = input_image.resize((width, height))

+         result = pipe(
+             edit_prompt,
+             image=input_image,
+             num_inference_steps=num_steps,
+             guidance_scale=guidance_scale,
+             image_guidance_scale=image_guidance_scale,
+         ).images[0]
+
+         return result, "✅ Done!"
+
+     except Exception as e:
+         return None, f"❌ Error: {str(e)}"
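One step in `edit_image` deserves a note: Stable Diffusion pipelines such as InstructPix2Pix pass the image through a VAE that downsamples width and height by a factor of 8, so both dimensions must be divisible by 8. The floor-division rounding above snaps the input accordingly; a minimal illustration with made-up sizes:

```python
# Snap arbitrary dimensions down to the nearest multiple of 8,
# mirroring the width/height rounding in edit_image above.
w, h = 517, 389                      # hypothetical upload size
w8, h8 = (w // 8) * 8, (h // 8) * 8
print(w8, h8)                        # 512 384 -> a 64x48 latent grid
```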

+ # Build interface
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 🎨 AI Image Editor & Object Detector")

      with gr.Tabs():
+         with gr.Tab("🔍 Detect Objects"):
+             gr.Markdown("Upload an image to detect and identify objects with detailed information")
              with gr.Row():
                  with gr.Column():
+                     detect_input = gr.Image(label="Upload Image", type="pil")
+                     detect_btn = gr.Button("🔍 Detect Objects", variant="primary", size="lg")

                  with gr.Column():
+                     detect_output = gr.Image(label="Detected Objects")

+             detection_info = gr.HTML(label="Object Details (Click to learn more)")
+             detection_json = gr.JSON(label="Detection Data", visible=False)

+             detect_btn.click(
+                 fn=detect_objects,
+                 inputs=[detect_input],
+                 outputs=[detect_output, detection_info, detection_json]
              )

+         with gr.Tab("✏️ Edit Image"):
+             gr.Markdown("Edit images with text instructions")
              with gr.Row():
                  with gr.Column():
+                     edit_input = gr.Image(label="Upload Image", type="pil")
+                     edit_prompt = gr.Textbox(
+                         label="Instructions",
+                         placeholder="make it a painting, add snow, turn day into night...",
+                         lines=2
+                     )
+                     with gr.Accordion("Settings", open=False):
+                         num_steps = gr.Slider(10, 50, value=20, step=5, label="Steps")
+                         guidance_scale = gr.Slider(1, 10, value=7.5, step=0.5, label="Text Guidance")
+                         image_guidance_scale = gr.Slider(1, 2, value=1.5, step=0.1, label="Image Guidance")
+                     edit_btn = gr.Button("✨ Edit", variant="primary")

                  with gr.Column():
+                     edit_output = gr.Image(label="Result")
+                     edit_status = gr.Textbox(label="Status", interactive=False)

+             edit_btn.click(
+                 fn=edit_image,
+                 inputs=[edit_input, edit_prompt, num_steps, guidance_scale, image_guidance_scale],
+                 outputs=[edit_output, edit_status]
              )

      gr.Markdown("""
+     ### 🎯 Features:
+     - **Object Detection**: Identifies objects with bounding boxes and confidence scores
+     - **Categories**: Color-coded by type (vehicles, animals, people, etc.)
+     - **Detailed Info**: AI-generated descriptions for each object
+     - **Clickable Links**: Click any object to learn more about it
+     - **Image Editing**: Transform images with simple text instructions
      """)

+ demo.launch()
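A caveat on the new `generate_color`: Python's built-in `hash()` is salted per process for strings (controlled by PYTHONHASHSEED), so the 'consistent' colors hold within one run but can change after a restart. If colors should be stable across restarts, a digest-based variant is a drop-in replacement; this is a sketch, not part of the commit:

```python
import hashlib

def generate_color_stable(text: str) -> str:
    """Map text to an HSL hue deterministically across runs and machines."""
    hue = int(hashlib.md5(text.encode("utf-8")).hexdigest(), 16) % 360
    return f"hsl({hue}, 70%, 55%)"

print(generate_color_stable("dog"))  # identical output on every run
```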