|
|
import gradio as gr |
|
|
import cv2 |
|
|
import numpy as np |
|
|
from PIL import Image |
|
|
import torch |
|
|
from transformers import DetrImageProcessor, DetrForObjectDetection |
|
|
from collections import defaultdict |
|
|
import time |
|
|
import psutil |
|
|
import os |
|
|
|
|
|
|
|
|
# The DETR processor and model are loaded once, at import time, and shared by
# every detection call in this module.
_DETR_CHECKPOINT = "facebook/detr-resnet-50"

print("Loading DETR model...")
processor = DetrImageProcessor.from_pretrained(_DETR_CHECKPOINT)
# eval() returns the module itself, so the model can be loaded and put into
# evaluation mode in a single expression.
model = DetrForObjectDetection.from_pretrained(_DETR_CHECKPOINT).eval()
|
|
|
|
|
|
|
|
# Class-id -> display-name table: the list position is the integer label
# produced by the detector. "N/A" marks ids that have no category; the drawing
# code skips detections that map to it.
COCO_CLASSES = [
    # ids 0-9
    "N/A", "person", "bicycle", "car", "motorcycle",
    "airplane", "bus", "train", "truck", "boat",
    # ids 10-19
    "traffic light", "fire hydrant", "N/A", "stop sign", "parking meter",
    "bench", "bird", "cat", "dog", "horse",
    # ids 20-29
    "sheep", "cow", "elephant", "bear", "zebra",
    "giraffe", "N/A", "backpack", "umbrella", "N/A",
    # ids 30-39
    "N/A", "handbag", "tie", "suitcase", "frisbee",
    "skis", "snowboard", "sports ball", "kite", "baseball bat",
    # ids 40-49
    "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
    "N/A", "wine glass", "cup", "fork", "knife",
    # ids 50-59
    "spoon", "bowl", "banana", "apple", "sandwich",
    "orange", "broccoli", "carrot", "hot dog", "pizza",
    # ids 60-69
    "donut", "cake", "chair", "couch", "potted plant",
    "bed", "N/A", "dining table", "N/A", "N/A",
    # ids 70-79
    "toilet", "N/A", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven",
    # ids 80-89
    "toaster", "sink", "refrigerator", "N/A", "book",
    "clock", "vase", "scissors", "teddy bear", "hair drier",
    # id 90
    "toothbrush",
]
|
|
|
|
|
def get_available_memory():
    """Return the system memory currently available, in GiB (1024**3 bytes)."""
    bytes_per_gib = 1024 ** 3
    available_bytes = psutil.virtual_memory().available
    return available_bytes / bytes_per_gib
|
|
|
|
|
def auto_adjust_confidence(image_size, num_objects_hint=None):
    """Pick a detection confidence threshold from the image's pixel count.

    Args:
        image_size: Two-element sequence whose product is the pixel count
            (e.g. ``(width, height)``).
        num_objects_hint: Accepted for interface compatibility; not used.

    Returns:
        0.6 below 500,000 pixels, 0.65 below 2,000,000 pixels, otherwise 0.7.
    """
    pixel_count = image_size[0] * image_size[1]

    # (exclusive upper pixel bound, threshold), checked in ascending order.
    tiers = ((500000, 0.6), (2000000, 0.65))
    for upper_bound, threshold in tiers:
        if pixel_count < upper_bound:
            return threshold
    return 0.7
|
|
|
|
|
def auto_calculate_frame_interval(total_frames, video_duration, available_memory_gb):
    """Choose how often to run detection on a video.

    The frame budget shrinks as available memory drops and as the video gets
    longer, and is always kept between 30 and 150 frames.

    Args:
        total_frames: Number of frames in the video.
        video_duration: Video length in seconds.
        available_memory_gb: Available system memory in GB.

    Returns:
        ``(interval, target_frames)``: run detection on every ``interval``-th
        frame; ``target_frames`` is the frame budget the interval was derived
        from.
    """
    # Less free memory -> larger divisor -> fewer frames analysed.
    if available_memory_gb < 2:
        memory_factor = 3
    elif available_memory_gb < 4:
        memory_factor = 2
    else:
        memory_factor = 1

    # Longer videos are sampled more sparsely.
    if video_duration < 10:
        duration_factor = 1
    elif video_duration < 30:
        duration_factor = 2
    elif video_duration < 60:
        duration_factor = 3
    else:
        duration_factor = 4

    target_frames = min(150, max(30, total_frames // (memory_factor * duration_factor)))

    # Ceiling division: with floor division the number of sampled frames
    # (one every `interval` frames) could be almost twice the budget, e.g.
    # 299 frames with a budget of 150 gave interval 1 and 299 sampled frames.
    interval = max(1, -(-total_frames // target_frames))

    return interval, target_frames
|
|
|
|
|
def detect_objects(image, confidence_threshold=None):
    """Run the DETR detector on a single image.

    Args:
        image: A PIL image or a NumPy array (converted with
            ``Image.fromarray``).
        confidence_threshold: Minimum score for a detection to be kept. When
            None, it is derived from the image size via
            ``auto_adjust_confidence``.

    Returns:
        ``(results, image, confidence_threshold)``: the post-processed
        detections for this image, the PIL image that was analysed, and the
        threshold that was applied.
    """
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    # Normalise to three channels so RGBA, grayscale or palette input is
    # handled the same way as plain RGB input.
    if image.mode != "RGB":
        image = image.convert("RGB")

    if confidence_threshold is None:
        confidence_threshold = auto_adjust_confidence(image.size)

    inputs = processor(images=image, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # PIL reports (width, height); post-processing expects (height, width).
    target_sizes = torch.tensor([image.size[::-1]])
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=confidence_threshold
    )[0]

    return results, image, confidence_threshold
|
|
|
|
|
def draw_boxes(image, results):
    """Draw labelled bounding boxes for every recognised detection.

    Args:
        image: PIL image or array to annotate. It is copied via ``np.array``,
            so the caller's object is not modified.
        results: Mapping with ``"scores"``, ``"labels"`` and ``"boxes"``
            sequences of equal length.

    Returns:
        ``(img_array, detections, object_counts)``: the annotated array, one
        ``"name (score%)"`` string per drawn detection, and a
        ``defaultdict(int)`` with the number of drawn boxes per class name.
    """
    img_array = np.array(image)
    detections = []
    object_counts = defaultdict(int)

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    thickness = 2
    color = (0, 255, 0)

    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        label_name = COCO_CLASSES[label.item()]
        if label_name == 'N/A':
            # Placeholder class ids are not drawn or counted.
            continue

        x1, y1, x2, y2 = (int(round(coord, 2)) for coord in box.tolist())
        cv2.rectangle(img_array, (x1, y1), (x2, y2), color, thickness)

        confidence = float(score)
        label_text = f"{label_name}: {confidence:.2f}"

        # Default position is 10px above the box. When that would push the
        # text past the top edge of the image, draw it just inside the box
        # instead so the label stays readable.
        (_, text_height), _ = cv2.getTextSize(label_text, font, font_scale, thickness)
        text_y = y1 - 10
        if text_y - text_height < 0:
            text_y = max(y1, 0) + text_height + 5
        cv2.putText(img_array, label_text, (max(x1, 0), text_y),
                    font, font_scale, color, thickness)

        detections.append(f"{label_name} ({confidence:.2%})")
        object_counts[label_name] += 1

    return img_array, detections, object_counts
|
|
|
|
|
def process_static_image(image):
    """Detect objects in one uploaded image and build a Markdown report.

    Args:
        image: The uploaded image, or None when nothing was uploaded.

    Returns:
        ``(annotated_image, summary)``. ``annotated_image`` is None and
        ``summary`` is a prompt to upload when ``image`` is None.
    """
    if image is None:
        return None, "Please upload an image"

    start_time = time.time()

    # Passing None lets detect_objects choose the threshold from the image size.
    results, pil_image, used_confidence = detect_objects(image, confidence_threshold=None)
    annotated_image, detections, object_counts = draw_boxes(pil_image, results)

    processing_time = time.time() - start_time

    if not detections:
        summary = (
            "### ⚠️ No objects detected\n\n"
            f"*Confidence threshold used: {used_confidence:.2f}*\n\n"
            "Try uploading a different image with more visible objects."
        )
        return annotated_image, summary

    parts = [
        "### 🎯 Detection Results\n\n",
        f"**Found {len(detections)} objects in {processing_time:.2f} seconds**\n\n",
        f"*Auto-adjusted confidence threshold: {used_confidence:.2f}*\n\n",
        "#### Detected Objects:\n",
    ]

    # Most frequent classes first.
    ranked = sorted(object_counts.items(), key=lambda item: item[1], reverse=True)
    parts.extend(f"- **{obj_name}**: {count} instance(s)\n" for obj_name, count in ranked)

    parts.append("\n#### All Detections:\n")
    parts.extend(f"{index}. {entry}\n" for index, entry in enumerate(detections, 1))

    return annotated_image, "".join(parts)
|
|
|
|
|
def process_video(video_path, progress=gr.Progress()):
    """Annotate a video with detections and build a Markdown report.

    Detection runs on every ``frame_interval``-th frame (chosen by
    ``auto_calculate_frame_interval``); all frames are written to the output,
    but only the analysed ones carry boxes.

    Args:
        video_path: Path of the uploaded video, or None when nothing was
            uploaded.
        progress: Gradio progress callback.

    Returns:
        ``(output_path, summary)``. ``output_path`` is None when no video was
        supplied or the video could not be read or written.
    """
    if video_path is None:
        return None, "Please upload a video"

    # Local import: tempfile is not imported at module level.
    import tempfile

    progress(0, desc="Analyzing video...")

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        return None, "Could not open the uploaded video. Please try a different file."

    out = None
    try:
        # Keep the frame rate as a float: truncating e.g. 29.97 to 29 changes
        # the playback speed of the written video.
        reported_fps = cap.get(cv2.CAP_PROP_FPS)
        fps = float(reported_fps) if reported_fps > 0 else 0.0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        duration = total_frames / fps if fps > 0 else 0

        available_memory = get_available_memory()
        frame_interval, _ = auto_calculate_frame_interval(
            total_frames, duration, available_memory
        )

        progress(0.1, desc=f"Processing video (sampling every {frame_interval} frames)...")

        confidence_threshold = auto_adjust_confidence((width, height))

        # One output file per call, so concurrent requests do not overwrite
        # each other's result.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
            output_path = tmp.name

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        # A writer needs a positive frame rate; fall back to 30 when the
        # container does not report one.
        writer_fps = fps if fps > 0 else 30.0
        out = cv2.VideoWriter(output_path, fourcc, writer_fps, (width, height))
        if not out.isOpened():
            return None, "Could not create the output video file."

        frame_count = 0
        processed_count = 0
        object_tracker = defaultdict(int)
        start_time = time.time()

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % 30 == 0:
                # The reported frame count can be 0 or too small, so guard the
                # division and cap the fraction at 1.
                done = min(frame_count / total_frames, 1.0) if total_frames > 0 else 0.0
                progress(done * 0.8 + 0.1,
                         desc=f"Processing frame {frame_count}/{total_frames}")

            if frame_count % frame_interval == 0:
                # OpenCV decodes to BGR; the detector is given RGB.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results, _, _ = detect_objects(rgb_frame, confidence_threshold)

                # Boxes are drawn on the BGR frame that gets written; the box
                # colour (0, 255, 0) is the same in either channel order.
                frame, _, frame_counts = draw_boxes(frame, results)
                for label_name, count in frame_counts.items():
                    object_tracker[label_name] += count

                processed_count += 1

            out.write(frame)
            frame_count += 1
    finally:
        # Always release the capture and writer, even if detection fails.
        cap.release()
        if out is not None:
            out.release()

    processing_time = time.time() - start_time

    progress(1.0, desc="Complete!")

    parts = [
        "### 🎬 Video Processing Complete\n\n",
        f"**Processing Time**: {processing_time:.2f} seconds\n\n",
        "#### Video Information:\n",
        f"- Duration: {duration:.2f} seconds\n",
        f"- Total frames: {total_frames}\n",
        f"- FPS: {fps:.2f}\n",
        f"- Resolution: {width}x{height}\n\n",
        "#### Auto-Optimization Settings:\n",
        f"- Confidence threshold: {confidence_threshold:.2f} *(auto-adjusted)*\n",
        f"- Frame interval: Every {frame_interval} frame(s) *(auto-calculated)*\n",
        f"- Frames processed: {processed_count}/{total_frames}\n",
        f"- Available memory: {available_memory:.2f} GB\n\n",
    ]

    if object_tracker:
        # NOTE(review): the emoji in this heading was unrecoverable from the
        # garbled source text; the chart emoji is a best guess, please confirm.
        parts.append("### 📊 Detected Objects Across Video:\n\n")
        ranked = sorted(object_tracker.items(), key=lambda item: item[1], reverse=True)
        parts.extend(f"- **{obj}**: {count} detection(s)\n" for obj, count in ranked)
    else:
        parts.append("⚠️ No objects detected in the video.\n")
        parts.append(
            "This might be due to low lighting, fast motion, or absence of recognizable objects."
        )

    return output_path, "".join(parts)
|
|
|
|
|
|
|
|
# Gradio UI: an image tab and a video tab, each with an upload widget, a run
# button, an annotated-output widget and a Markdown report, followed by a
# shared explanatory footer.
with gr.Blocks(title="AI Object Recognition System", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🤖 AI Object Recognition System
    ### Intelligent Auto-Adjusting Detection & Tracking

    This system **automatically optimizes** detection parameters based on:
    - Image/video size and complexity
    - Available system resources
    - Video duration and frame rate

    **No manual tuning required!**
    """)

    with gr.Tabs():

        with gr.Tab("📸 Static Mode - Image Detection"):
            gr.Markdown("""
            ### Automatic Image Analysis
            Upload any image and the system will:
            - Auto-adjust confidence thresholds
            - Detect all visible objects
            - Provide detailed statistics
            """)

            with gr.Row():
                with gr.Column():
                    static_input = gr.Image(type="numpy", label="Upload Image")
                    # NOTE(review): the button emoji was unrecoverable from the
                    # garbled source text; the magnifier is a best guess.
                    static_btn = gr.Button("🔍 Auto-Detect Objects", variant="primary", size="lg")
                    gr.Markdown("*The system will automatically optimize detection settings*")

                with gr.Column():
                    static_output = gr.Image(label="Detected Objects")
                    static_summary = gr.Markdown(label="Detection Results")

            static_btn.click(
                fn=process_static_image,
                inputs=[static_input],
                outputs=[static_output, static_summary]
            )

            # No bundled example images; the list is intentionally left empty.
            gr.Examples(
                examples=[],
                inputs=static_input,
                label="Try these examples (upload your own images)"
            )

        with gr.Tab("🎥 Dynamic Mode - Video Detection"):
            gr.Markdown("""
            ### Automatic Video Analysis
            Upload a video and the system will:
            - Auto-calculate optimal frame sampling
            - Adjust confidence based on video quality
            - Optimize for available CPU resources
            - Track objects across frames

            **Supports videos of any length!** The system automatically scales processing.
            """)

            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(label="Upload Video")
                    video_btn = gr.Button("🎬 Auto-Process Video", variant="primary", size="lg")
                    gr.Markdown("""
                    *The system will automatically:*
                    - Analyze video properties
                    - Calculate optimal frame sampling
                    - Adjust detection thresholds
                    - Monitor system resources
                    """)

                with gr.Column():
                    video_output = gr.Video(label="Processed Video with Detections")
                    video_summary = gr.Markdown(label="Processing Results")

            video_btn.click(
                fn=process_video,
                inputs=[video_input],
                outputs=[video_output, video_summary]
            )

    # NOTE(review): the emoji on the "How Auto-Adjustment Works" and
    # "Technical Details" headings were unrecoverable from the garbled source
    # text; the wrench and chart are best guesses, please confirm.
    gr.Markdown("""
    ---
    ## 🔧 How Auto-Adjustment Works

    ### Image Mode:
    - **Small images** (< 500K pixels): Lower confidence threshold for more detections
    - **Large images** (> 2M pixels): Higher threshold to reduce false positives

    ### Video Mode:
    - **Short videos** (< 10s): Process more frames for detail
    - **Long videos** (> 60s): Smart sampling to maintain performance
    - **Memory-aware**: Adjusts based on available RAM
    - **Quality-adaptive**: Balances speed vs accuracy automatically

    ### 📊 Technical Details:
    - **Model**: DETR ResNet-50 (Detection Transformer)
    - **Dataset**: COCO (80+ object categories)
    - **Optimization**: CPU-friendly with intelligent resource management
    - **Supported Objects**: People, vehicles, animals, furniture, electronics, food, and more

    ### 💡 Tips:
    - The system works best with clear, well-lit images/videos
    - All adjustments happen automatically - just upload and click!
    - Processing time varies based on video length and system resources
    """)


# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()