asamasach committed on
Commit
80280f8
·
1 Parent(s): d181e61

Add GroundingDINO and YOLO-World zero-shot models - Added GroundingDINO and YOLO-World for better zero-shot detection - Updated requirements.txt with ultralytics - Added visualization with distinct colors

Browse files
Files changed (2) hide show
  1. app.py +242 -0
  2. requirements.txt +1 -0
app.py CHANGED
@@ -278,6 +278,162 @@ def run_florence2_inference(image_bytes: bytes, confidence: float = 0.3):
278
  return []
279
 
280
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
  def run_owlvit_inference(image_bytes: bytes, text_queries: list = None, confidence: float = 0.1):
282
  """
283
  Run zero-shot object detection using OWL-ViT (Open World Localization - Vision Transformer).
@@ -387,6 +543,16 @@ MODELS = {
387
  "type": "owlvit",
388
  "description": "Zero-shot object detection using Google's OWL-ViT - detects objects based on text descriptions"
389
  },
 
 
 
 
 
 
 
 
 
 
390
  }
391
 
392
  # AdaCLIP configuration
@@ -651,6 +817,66 @@ def gradio_inference(image, model_display_name, conf_threshold):
651
 
652
  return cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
653
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
654
  # Handle YOLO models (default)
655
  session = get_session(model_key)
656
  if session is None:
@@ -727,6 +953,22 @@ def api_inference(image, model_display_name, conf_threshold):
727
  detections = run_owlvit_inference(image_bytes, confidence=conf_threshold)
728
  return detections
729
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
730
  # Handle YOLO models (default)
731
  session = get_session(model_key)
732
  if session is None:
 
278
  return []
279
 
280
 
281
def run_groundingdino_inference(image_bytes: bytes, text_queries: list = None, confidence: float = 0.3):
    """
    Run zero-shot object detection using GroundingDINO (IDEA Research).

    GroundingDINO performs open-set detection: it locates objects described
    by free-text queries rather than a fixed label set, and tends to be more
    accurate than OWL-ViT for text-guided detection.

    Args:
        image_bytes: Raw encoded image bytes (e.g. JPEG/PNG).
        text_queries: Phrases to search for; defaults to defect-related terms.
        confidence: Threshold applied to both box and text matching scores.

    Returns:
        List of detection dicts (bbox, confidence, class_id, class_name,
        x1/y1/x2/y2, model_type); empty list when nothing is found or on error.
    """
    try:
        from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
        from PIL import Image
        import torch
        import io

        if text_queries is None:
            text_queries = ["defect", "anomaly", "crack", "scratch", "damage", "error", "imperfection"]

        # Load image and force RGB so grayscale/RGBA/palette inputs don't
        # break the processor or arrive with the wrong channel count.
        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        orig_w, orig_h = image.size
        logger.info(f"GroundingDINO: Processing image {orig_w}x{orig_h}")

        # Lazily initialize model/processor and cache them on the function
        # object so weights are only downloaded/loaded once per process.
        if not hasattr(run_groundingdino_inference, 'processor'):
            logger.info("Loading GroundingDINO model (first time only)...")
            run_groundingdino_inference.processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")
            run_groundingdino_inference.model = AutoModelForZeroShotObjectDetection.from_pretrained("IDEA-Research/grounding-dino-tiny")
            logger.info("GroundingDINO model loaded successfully")

        processor = run_groundingdino_inference.processor
        model = run_groundingdino_inference.model

        # GroundingDINO expects phrases separated (and terminated) by periods.
        text_prompt = ". ".join(text_queries) + "."

        # Prepare inputs
        inputs = processor(images=image, text=text_prompt, return_tensors="pt")

        # Inference only — no gradients needed.
        with torch.no_grad():
            outputs = model(**inputs)

        # Map raw outputs back to pixel coordinates of the original image.
        # NOTE(review): newer transformers releases renamed `box_threshold`
        # to `threshold` — confirm against the pinned transformers version.
        results = processor.post_process_grounded_object_detection(
            outputs,
            inputs.input_ids,
            box_threshold=confidence,
            text_threshold=confidence,
            target_sizes=[(orig_h, orig_w)]
        )[0]

        detections = []

        if len(results["boxes"]) > 0:
            boxes = results["boxes"].cpu().numpy()
            scores = results["scores"].cpu().numpy()
            labels = results["labels"]  # matched text phrase per box

            logger.info(f"GroundingDINO found {len(boxes)} objects")

            for box, score, label in zip(boxes, scores, labels):
                x1, y1, x2, y2 = box

                detections.append({
                    "bbox": [float(x1), float(y1), float(x2), float(y2)],
                    "confidence": float(score),
                    "class_id": 0,
                    "class_name": str(label),
                    "x1": float(x1),
                    "y1": float(y1),
                    "x2": float(x2),
                    "y2": float(y2),
                    "model_type": "groundingdino"
                })

        logger.info(f"GroundingDINO detected {len(detections)} objects: {[d['class_name'] for d in detections]}")
        return detections

    except Exception as e:
        # Best-effort: any failure (missing deps, download error, bad image)
        # degrades to "no detections" so the caller's pipeline keeps running.
        logger.error(f"GroundingDINO inference error: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return []
363
+
364
+
365
def run_yoloworld_inference(image_bytes: bytes, text_queries: list = None, confidence: float = 0.3):
    """
    Run zero-shot object detection using YOLO-World.

    YOLO-World combines YOLO speed with open-vocabulary detection: the class
    list is set at runtime from the given text queries, making it fast and
    effective for real-time anomaly detection.

    Args:
        image_bytes: Raw encoded image bytes (e.g. JPEG/PNG).
        text_queries: Class phrases to detect; defaults to defect-related terms.
        confidence: Minimum confidence for returned boxes.

    Returns:
        List of detection dicts (bbox, confidence, class_id, class_name,
        x1/y1/x2/y2, model_type); empty list when nothing is found or on error.
    """
    try:
        from ultralytics import YOLOWorld
        from PIL import Image
        import io

        if text_queries is None:
            text_queries = ["defect", "anomaly", "crack", "scratch", "damage"]

        # Load image and force RGB so grayscale/RGBA inputs are handled uniformly.
        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        orig_w, orig_h = image.size
        logger.info(f"YOLO-World: Processing image {orig_w}x{orig_h}")

        # Lazily load the model and cache it on the function object so the
        # checkpoint is only loaded once per process.
        if not hasattr(run_yoloworld_inference, 'model'):
            logger.info("Loading YOLO-World model (first time only)...")
            run_yoloworld_inference.model = YOLOWorld("yolov8s-world.pt")  # Small model
            logger.info("YOLO-World model loaded successfully")

        model = run_yoloworld_inference.model

        # Restrict the open vocabulary to our query phrases.
        model.set_classes(text_queries)

        # Pass the PIL image directly: ultralytics treats raw numpy arrays as
        # BGR (cv2 convention), so feeding np.array(image) — which is RGB —
        # would channel-swap the input and degrade accuracy. PIL input is
        # converted correctly by the predictor.
        results = model.predict(image, conf=confidence, verbose=False)

        detections = []

        if len(results) > 0 and results[0].boxes is not None:
            boxes = results[0].boxes
            logger.info(f"YOLO-World found {len(boxes)} objects")

            for box in boxes:
                x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
                conf = float(box.conf[0].cpu().numpy())
                cls = int(box.cls[0].cpu().numpy())
                # Class ids index into the custom query list set above.
                class_name = text_queries[cls] if cls < len(text_queries) else "object"

                detections.append({
                    "bbox": [float(x1), float(y1), float(x2), float(y2)],
                    "confidence": conf,
                    "class_id": cls,
                    "class_name": class_name,
                    "x1": float(x1),
                    "y1": float(y1),
                    "x2": float(x2),
                    "y2": float(y2),
                    "model_type": "yoloworld"
                })

        logger.info(f"YOLO-World detected {len(detections)} objects: {[d['class_name'] for d in detections]}")
        return detections

    except Exception as e:
        # Best-effort: any failure (missing deps, checkpoint download error,
        # bad image) degrades to "no detections" so the caller keeps running.
        logger.error(f"YOLO-World inference error: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return []
435
+
436
+
437
  def run_owlvit_inference(image_bytes: bytes, text_queries: list = None, confidence: float = 0.1):
438
  """
439
  Run zero-shot object detection using OWL-ViT (Open World Localization - Vision Transformer).
 
543
  "type": "owlvit",
544
  "description": "Zero-shot object detection using Google's OWL-ViT - detects objects based on text descriptions"
545
  },
546
+ "zero-shot-groundingdino": {
547
+ "name": "Zero Shot (GroundingDINO)",
548
+ "type": "groundingdino",
549
+ "description": "IDEA Research's open-set object detection - better than OWL-ViT for text-guided detection"
550
+ },
551
+ "zero-shot-yoloworld": {
552
+ "name": "Zero Shot (YOLO-World)",
553
+ "type": "yoloworld",
554
+ "description": "Fast open-vocabulary detection using YOLO architecture - combines speed with zero-shot capability"
555
+ },
556
  }
557
 
558
  # AdaCLIP configuration
 
817
 
818
  return cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
819
 
820
+ # Handle GroundingDINO (zero-shot object detection)
821
+ if model_type == "groundingdino":
822
+ _, img_encoded = cv2.imencode('.jpg', img_bgr)
823
+ image_bytes = img_encoded.tobytes()
824
+
825
+ detections = run_groundingdino_inference(image_bytes, confidence=conf_threshold)
826
+
827
+ # Add detection count
828
+ status_text = f"GroundingDINO: {len(detections)} objects"
829
+ cv2.putText(img_bgr, status_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)
830
+ cv2.putText(img_bgr, status_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 165, 255), 1) # Orange
831
+
832
+ for i, det in enumerate(detections):
833
+ x1 = int(det["x1"])
834
+ y1 = int(det["y1"])
835
+ x2 = int(det["x2"])
836
+ y2 = int(det["y2"])
837
+ score = det["confidence"]
838
+ class_name = det.get("class_name", "object")
839
+
840
+ label = f"#{i+1} {class_name}:{score:.2f}"
841
+ cv2.rectangle(img_bgr, (x1, y1), (x2, y2), (0, 165, 255), 3) # Orange
842
+ cv2.putText(img_bgr, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 165, 255), 2)
843
+
844
+ if not detections:
845
+ no_detect_text = f"No objects detected (threshold: {conf_threshold:.2f})"
846
+ cv2.putText(img_bgr, no_detect_text, (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 165, 255), 2)
847
+
848
+ return cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
849
+
850
+ # Handle YOLO-World (zero-shot object detection)
851
+ if model_type == "yoloworld":
852
+ _, img_encoded = cv2.imencode('.jpg', img_bgr)
853
+ image_bytes = img_encoded.tobytes()
854
+
855
+ detections = run_yoloworld_inference(image_bytes, confidence=conf_threshold)
856
+
857
+ # Add detection count
858
+ status_text = f"YOLO-World: {len(detections)} objects"
859
+ cv2.putText(img_bgr, status_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)
860
+ cv2.putText(img_bgr, status_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 0), 1) # Cyan
861
+
862
+ for i, det in enumerate(detections):
863
+ x1 = int(det["x1"])
864
+ y1 = int(det["y1"])
865
+ x2 = int(det["x2"])
866
+ y2 = int(det["y2"])
867
+ score = det["confidence"]
868
+ class_name = det.get("class_name", "object")
869
+
870
+ label = f"#{i+1} {class_name}:{score:.2f}"
871
+ cv2.rectangle(img_bgr, (x1, y1), (x2, y2), (255, 255, 0), 3) # Cyan
872
+ cv2.putText(img_bgr, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 0), 2)
873
+
874
+ if not detections:
875
+ no_detect_text = f"No objects detected (threshold: {conf_threshold:.2f})"
876
+ cv2.putText(img_bgr, no_detect_text, (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 0), 2)
877
+
878
+ return cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
879
+
880
  # Handle YOLO models (default)
881
  session = get_session(model_key)
882
  if session is None:
 
953
  detections = run_owlvit_inference(image_bytes, confidence=conf_threshold)
954
  return detections
955
 
956
+ # Handle GroundingDINO (zero-shot object detection)
957
+ if model_type == "groundingdino":
958
+ _, img_encoded = cv2.imencode('.jpg', img_bgr)
959
+ image_bytes = img_encoded.tobytes()
960
+
961
+ detections = run_groundingdino_inference(image_bytes, confidence=conf_threshold)
962
+ return detections
963
+
964
+ # Handle YOLO-World (zero-shot object detection)
965
+ if model_type == "yoloworld":
966
+ _, img_encoded = cv2.imencode('.jpg', img_bgr)
967
+ image_bytes = img_encoded.tobytes()
968
+
969
+ detections = run_yoloworld_inference(image_bytes, confidence=conf_threshold)
970
+ return detections
971
+
972
  # Handle YOLO models (default)
973
  session = get_session(model_key)
974
  if session is None:
requirements.txt CHANGED
@@ -11,3 +11,4 @@ transformers
11
  torch
12
  torchvision
13
  pillow
 
 
11
  torch
12
  torchvision
13
  pillow
14
+ ultralytics