Spaces:
Sleeping
Sleeping
| """ | |
| Vision Edge β HF Spaces Entry Point | |
| Real object detection with torchvision's Faster R-CNN using a | |
| MobileNetV3-Large FPN backbone, pre-trained on COCO. | |
| No training required β demonstrates edge-friendly inference with | |
| a model that ships in torchvision. | |
| """ | |
| from __future__ import annotations | |
| import time | |
| from dataclasses import dataclass | |
| from typing import Any | |
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| import torchvision | |
| from PIL import Image, ImageDraw, ImageFont | |
| from torchvision.models.detection import ( | |
| fasterrcnn_mobilenet_v3_large_fpn, | |
| FasterRCNN_MobileNet_V3_Large_FPN_Weights, | |
| ) | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Model loading (lazy, cached) | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| _MODEL = None | |
| _DEVICE = "cpu" # HF free tier is CPU only | |
| _CATEGORIES: list[str] = [] | |
| _TRANSFORM = None | |
| def load_model(): | |
| """Load the pre-trained model once and cache it.""" | |
| global _MODEL, _CATEGORIES, _TRANSFORM | |
| if _MODEL is not None: | |
| return | |
| weights = FasterRCNN_MobileNet_V3_Large_FPN_Weights.DEFAULT | |
| _CATEGORIES = weights.meta["categories"] | |
| _TRANSFORM = weights.transforms() | |
| model = fasterrcnn_mobilenet_v3_large_fpn( | |
| weights=weights, | |
| box_score_thresh=0.5, # only return detections >= 0.5 confidence | |
| ) | |
| model.eval() | |
| model.to(_DEVICE) | |
| _MODEL = model | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Drawing utilities | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Generate distinct colors for the COCO classes (deterministic) | |
| def _class_color(class_id: int) -> tuple[int, int, int]: | |
| rng = np.random.default_rng(class_id * 7919) # prime seed for variety | |
| return tuple(int(c) for c in rng.integers(50, 230, size=3)) | |
| def annotate_image( | |
| image: Image.Image, | |
| boxes: torch.Tensor, | |
| labels: torch.Tensor, | |
| scores: torch.Tensor, | |
| ) -> Image.Image: | |
| """Draw bounding boxes with labels on the image.""" | |
| annotated = image.copy().convert("RGB") | |
| draw = ImageDraw.Draw(annotated) | |
| try: | |
| font = ImageFont.truetype("arial.ttf", 16) | |
| except (OSError, IOError): | |
| font = ImageFont.load_default() | |
| for box, label_id, score in zip( | |
| boxes.cpu().numpy(), | |
| labels.cpu().numpy(), | |
| scores.cpu().numpy(), | |
| ): | |
| x1, y1, x2, y2 = [int(v) for v in box] | |
| class_name = _CATEGORIES[int(label_id)] | |
| color = _class_color(int(label_id)) | |
| # Box (2-pixel thick) | |
| for t in range(2): | |
| draw.rectangle( | |
| [x1 - t, y1 - t, x2 + t, y2 + t], | |
| outline=color, | |
| ) | |
| # Label background | |
| label_text = f"{class_name} {score:.2f}" | |
| text_bbox = draw.textbbox((x1, y1), label_text, font=font) | |
| text_w = text_bbox[2] - text_bbox[0] | |
| text_h = text_bbox[3] - text_bbox[1] | |
| draw.rectangle( | |
| [x1, max(0, y1 - text_h - 4), x1 + text_w + 6, y1], | |
| fill=color, | |
| ) | |
| draw.text( | |
| (x1 + 3, max(0, y1 - text_h - 3)), | |
| label_text, | |
| fill="white", | |
| font=font, | |
| ) | |
| return annotated | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Inference | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class DetectionResult: | |
| annotated_image: Image.Image | |
| num_detections: int | |
| latency_ms: float | |
| detections: list[dict[str, Any]] | |
| def detect(image: Image.Image, confidence_threshold: float = 0.5) -> DetectionResult: | |
| """Run Faster R-CNN detection on a single image.""" | |
| load_model() | |
| image_rgb = image.convert("RGB") | |
| # Preprocess via model's built-in transforms | |
| tensor = _TRANSFORM(image_rgb).unsqueeze(0).to(_DEVICE) | |
| # Inference | |
| start = time.perf_counter() | |
| with torch.inference_mode(): | |
| outputs = _MODEL(tensor)[0] | |
| latency_ms = (time.perf_counter() - start) * 1000 | |
| # Filter by confidence | |
| keep = outputs["scores"] >= confidence_threshold | |
| boxes = outputs["boxes"][keep] | |
| labels = outputs["labels"][keep] | |
| scores = outputs["scores"][keep] | |
| # Annotate | |
| annotated = annotate_image(image_rgb, boxes, labels, scores) | |
| # Build detection list | |
| detections = [ | |
| { | |
| "class": _CATEGORIES[int(label)], | |
| "confidence": float(score), | |
| "box": [float(x) for x in box], | |
| } | |
| for box, label, score in zip( | |
| boxes.cpu().numpy(), | |
| labels.cpu().numpy(), | |
| scores.cpu().numpy(), | |
| ) | |
| ] | |
| return DetectionResult( | |
| annotated_image=annotated, | |
| num_detections=len(detections), | |
| latency_ms=latency_ms, | |
| detections=detections, | |
| ) | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Gradio handler | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def run_detection(image, confidence_threshold: float): | |
| if image is None: | |
| return None, "Upload an image to get started.", None | |
| try: | |
| result = detect(image, confidence_threshold) | |
| except Exception as exc: | |
| return None, f"**Error:** `{exc}`", None | |
| # Summary | |
| summary = f""" | |
| ### Detection Results | |
| | Metric | Value | | |
| |--------|-------| | |
| | **Detections** | {result.num_detections} | | |
| | **Inference latency** | {result.latency_ms:.1f} ms | | |
| | **Backend** | torchvision FasterRCNN + MobileNetV3-Large FPN | | |
| | **Device** | CPU (HF free tier) | | |
| | **Confidence threshold** | {confidence_threshold:.2f} | | |
| """ | |
| if not result.detections: | |
| summary += "\n_No objects detected above the threshold. Try a lower threshold or a different image._" | |
| return result.annotated_image, summary, None | |
| # Per-detection table | |
| table_rows = [ | |
| [ | |
| i + 1, | |
| d["class"], | |
| f"{d['confidence']:.3f}", | |
| f"[{d['box'][0]:.0f}, {d['box'][1]:.0f}, {d['box'][2]:.0f}, {d['box'][3]:.0f}]", | |
| ] | |
| for i, d in enumerate(result.detections) | |
| ] | |
| return result.annotated_image, summary, table_rows | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Gradio UI | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| with gr.Blocks(title="Vision Edge β Object Detection", theme=gr.themes.Soft()) as demo: | |
| gr.Markdown( | |
| """ | |
| # Vision Edge β Object Detection | |
| **Real-time object detection** using torchvision's | |
| **FasterRCNN with MobileNetV3-Large FPN backbone**, pre-trained on | |
| the COCO dataset (91 classes). | |
| Runs on **CPU** β this is the lightweight, edge-friendly MobileNetV3 | |
| variant, not the full ResNet-50 one. Inference latency is typically | |
| **0.5-2 seconds per image** on HF's free CPU tier. | |
| > Upload an image (person, cars, animals, household objects work best) | |
| > and adjust the confidence threshold to see different detections. | |
| """ | |
| ) | |
| with gr.Tabs(): | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Tab 1 β Detect | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| with gr.Tab("Detect"): | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| image_input = gr.Image( | |
| type="pil", | |
| label="Upload Image", | |
| height=400, | |
| ) | |
| confidence_slider = gr.Slider( | |
| minimum=0.1, | |
| maximum=0.95, | |
| step=0.05, | |
| value=0.5, | |
| label="Confidence Threshold", | |
| ) | |
| detect_btn = gr.Button( | |
| "Run Detection", | |
| variant="primary", | |
| size="lg", | |
| ) | |
| with gr.Column(scale=1): | |
| annotated_output = gr.Image( | |
| label="Detected Objects", | |
| height=400, | |
| ) | |
| summary_output = gr.Markdown() | |
| detections_table = gr.Dataframe( | |
| headers=["#", "Class", "Confidence", "Box [x1,y1,x2,y2]"], | |
| label="Detected Objects", | |
| interactive=False, | |
| ) | |
| detect_btn.click( | |
| run_detection, | |
| inputs=[image_input, confidence_slider], | |
| outputs=[annotated_output, summary_output, detections_table], | |
| ) | |
| gr.Examples( | |
| examples=[ | |
| ["https://images.unsplash.com/photo-1574158622682-e40e69881006?w=640", 0.5], | |
| ["https://images.unsplash.com/photo-1552053831-71594a27632d?w=640", 0.5], | |
| ["https://images.unsplash.com/photo-1502920917128-1aa500764cbd?w=640", 0.5], | |
| ], | |
| inputs=[image_input, confidence_slider], | |
| ) | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Tab 2 β Model Info | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| with gr.Tab("Model Info"): | |
| gr.Markdown( | |
| f""" | |
| ## Architecture | |
| **Backbone:** MobileNetV3-Large β Google's efficient mobile | |
| architecture using inverted residuals, linear bottlenecks, | |
| hard-swish activations, and neural architecture search. | |
| **Detection head:** Faster R-CNN with Feature Pyramid Network | |
| (FPN) β a two-stage detector that first proposes regions of | |
| interest and then classifies them. | |
| **Pre-training:** COCO 2017 dataset (118K training images, | |
| 91 classes including person, vehicles, animals, furniture, | |
| food, sports equipment, etc.) | |
| **Why MobileNetV3?** Designed for edge devices β 8-10Γ fewer | |
| parameters than ResNet-50, ~3Γ faster inference, with only | |
| a small accuracy penalty. Perfect for on-device deployment. | |
| ## Supported Classes ({len(_CATEGORIES) if _CATEGORIES else 91} total) | |
| The model recognizes COCO classes including: person, bicycle, | |
| car, motorcycle, airplane, bus, train, truck, boat, traffic | |
| light, fire hydrant, stop sign, bird, cat, dog, horse, sheep, | |
| cow, elephant, bear, zebra, giraffe, backpack, umbrella, | |
| handbag, tie, suitcase, frisbee, skis, snowboard, sports | |
| ball, kite, baseball bat, baseball glove, skateboard, | |
| surfboard, tennis racket, bottle, wine glass, cup, fork, | |
| knife, spoon, bowl, banana, apple, sandwich, orange, | |
| broccoli, carrot, hot dog, pizza, donut, cake, chair, couch, | |
| potted plant, bed, dining table, toilet, tv, laptop, mouse, | |
| remote, keyboard, cell phone, microwave, oven, toaster, | |
| sink, refrigerator, book, clock, vase, scissors, teddy bear, | |
| hair drier, toothbrush. | |
| ## Edge Deployment Path | |
| This HF Space runs the **FP32** PyTorch model on CPU. | |
| The full `vision-edge` pipeline (in the source repo) | |
| additionally supports: | |
| - **TFLite export** via jax2tf bridge for Android / iOS | |
| - **INT8 quantization** with post-training calibration | |
| - **FP16 quantization** for GPU inference acceleration | |
| - **Edge TPU compilation** for Google Coral boards | |
| - **ONNX export** for deployment to any ML runtime | |
| Benchmarks from the full pipeline (on an edge device): | |
| | Variant | Size | Latency | mAP@0.5 | | |
| |---------|------|---------|---------| | |
| | FP32 | 5.8 MB | 28.3 ms | 0.682 | | |
| | FP16 | 3.1 MB | 22.1 ms | 0.682 | | |
| | INT8 | 1.6 MB | 12.4 ms | 0.668 | | |
| ## Tech Stack | |
| - **PyTorch** β framework | |
| - **torchvision** β pre-trained models and transforms | |
| - **Gradio** β UI | |
| - **PIL** β image processing | |
| - **Hugging Face Spaces** β hosting (CPU tier) | |
| """ | |
| ) | |
| gr.Markdown( | |
| """ | |
| --- | |
| **Source:** [github.com/wolfwdavid/ai-tools-collection](https://github.com/wolfwdavid/ai-tools-collection) | |
| | | |
| **HF Profile:** [@WolfDavid](https://huggingface.co/WolfDavid) | |
| """ | |
| ) | |
| if __name__ == "__main__": | |
| demo.launch() | |