Final_Assignment_Template

Sleeping

App Files Files Community

huytofu92 committed on May 19, 2025

Commit

e9a4266

1 Parent(s): 8615d8c

Fix object detection tool

Browse files

Files changed (1) hide show

vlm_tools.py +82 -42

vlm_tools.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import cv2
 import numpy as np
 import pytesseract
 import requests
@@ -11,7 +12,6 @@ from langchain_core.tools import tool as langchain_tool
 from smolagents.tools import Tool, tool
 def pre_processing(image: str, input_size=(416, 416))->np.ndarray:
     """
     Pre-process an image for YOLO model
     Args:
@@ -20,16 +20,35 @@ def pre_processing(image: str, input_size=(416, 416))->np.ndarray:
     Returns:
         The pre-processed image as a numpy array
     """
-    image_data = base64.b64decode(image)
-    np_image = np.frombuffer(image_data, np.uint8)
-    img = cv2.imdecode(np_image, cv2.IMREAD_COLOR)
-    # Resize and normalize the image
-    img = cv2.resize(img, input_size)
-    img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, to CHW
-    img = np.expand_dims(img, axis=0)
-    img = img.astype(np.float32) / 255.0  # Normalize to [0, 1]
-    return img
 def post_processing(onnx_output, classes, original_shape, conf_threshold=0.5, nms_threshold=0.4)->list:
     """
@@ -62,7 +81,7 @@ def post_processing(onnx_output, classes, original_shape, conf_threshold=0.5, nm
             class_ids.append(class_id)
     # Apply non-max suppression
-    indices = cv2.dnn.NMSBoxes(boxes, confidences, conf_threshold, nms_threshold)
     detected_objects = []
     for i in indices:
         i = i[0]
@@ -246,38 +265,59 @@ class ObjectDetectionTool(Tool):
     output_type = "any"
     def setup(self):
-        # Load ONNX model
-        self.onnx_path = onnx_path
-        self.onnx_model = onnxruntime.InferenceSession(self.onnx_path)
-        # Load class labels - using a predefined list since we can't use open()
-        # These are the standard COCO dataset classes that YOLOv3 uses
-        self.classes = [
-            'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
-            'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat',
-            'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
-            'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
-            'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
-            'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
-            'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
-            'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
-            'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book',
-            'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
-        ]
     def forward(self, images: any)->any:
-        detected_objects = []
-        for image in images:
-            img = pre_processing(image)
-            # Preprocess the image
-            blob = cv2.dnn.blobFromImage(img, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
-            onnx_input = {self.onnx_model.get_inputs()[0].name: blob}
-            onnx_output = self.onnx_model.run(None, onnx_input)
-            detected_objects.append(post_processing(onnx_output, self.classes, img.shape))
-        return detected_objects
 class OCRTool(Tool):
     description = """

 import cv2
+from cv2 import dnn
 import numpy as np
 import pytesseract
 import requests
 from smolagents.tools import Tool, tool
 def pre_processing(image: str, input_size=(416, 416))->np.ndarray:
     """
     Pre-process an image for YOLO model
     Args:
     Returns:
         The pre-processed image as a numpy array
     """
+    try:
+        # Decode base64 image
+        image_data = base64.b64decode(image)
+        np_image = np.frombuffer(image_data, np.uint8)
+        img = cv2.imdecode(np_image, cv2.IMREAD_COLOR)
+        if img is None:
+            raise ValueError("Failed to decode image")
+        # Store original shape for post-processing
+        original_shape = img.shape[:2]  # (height, width)
+        # Ensure input_size is valid
+        if not isinstance(input_size, tuple) or len(input_size) != 2:
+            input_size = (416, 416)
+        # Resize and normalize the image
+        img = cv2.resize(img, input_size, interpolation=cv2.INTER_LINEAR)
+        if img is None:
+            raise ValueError("Failed to resize image")
+        # Convert BGR to RGB and normalize
+        img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, to CHW
+        img = np.expand_dims(img, axis=0)
+        img = img.astype(np.float32) / 255.0  # Normalize to [0, 1]
+        return img, original_shape
+    except Exception as e:
+        raise ValueError(f"Error in pre_processing: {str(e)}")
 def post_processing(onnx_output, classes, original_shape, conf_threshold=0.5, nms_threshold=0.4)->list:
     """
             class_ids.append(class_id)
     # Apply non-max suppression
+    indices = dnn.NMSBoxes(boxes, confidences, conf_threshold, nms_threshold)
     detected_objects = []
     for i in indices:
         i = i[0]
     output_type = "any"
     def setup(self):
+        try:
+            # Load ONNX model
+            self.onnx_path = onnx_path
+            self.onnx_model = onnxruntime.InferenceSession(self.onnx_path)
+            # Load class labels
+            self.classes = [
+                'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
+                'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat',
+                'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
+                'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
+                'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
+                'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
+                'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
+                'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
+                'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book',
+                'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
+            ]
+        except Exception as e:
+            raise RuntimeError(f"Error in setup: {str(e)}")
     def forward(self, images: any)->any:
+        try:
+            if not isinstance(images, list):
+                images = [images]  # Convert single image to list
+            detected_objects = []
+            for image in images:
+                try:
+                    # Preprocess the image
+                    img, original_shape = pre_processing(image)
+                    # Create blob and run inference
+                    blob = dnn.blobFromImage(img[0], 0.00392, (416, 416), (0, 0, 0), True, crop=False)
+                    onnx_input = {self.onnx_model.get_inputs()[0].name: blob}
+                    onnx_output = self.onnx_model.run(None, onnx_input)
+                    # Handle shape mismatch by transposing if needed
+                    if onnx_output[0].shape[1] == 255:  # If in NCHW format
+                        onnx_output = [onnx_output[0].transpose(0, 2, 3, 1)]  # Convert to NHWC
+                    # Post-process the output
+                    objects = post_processing(onnx_output, self.classes, original_shape)
+                    detected_objects.append(objects)
+                except Exception as e:
+                    print(f"Error processing image: {str(e)}")
+                    detected_objects.append([])  # Add empty list for failed image
+            return detected_objects
+        except Exception as e:
+            raise RuntimeError(f"Error in forward pass: {str(e)}")
 class OCRTool(Tool):
     description = """