Final_Assignment_Template

Sleeping

App Files Files Community

huytofu92 committed on May 16, 2025

Commit

a2cf089

1 Parent(s): 24b4228

Add pandas tool and modify vlm tools

Browse files

Files changed (3) hide show

mini_agents.py +4 -4
tools.py +11 -0
vlm_tools.py +41 -32

mini_agents.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from smolagents import CodeAgent, InferenceClientModel
-from tools import sort_list, operate_two_numbers, convert_number
 from tools import to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby
-from vlm_tools import download_image, image_processing, object_detection, ocr_scan
 from audio_tools import audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization
 from community_tools import community_tools
 import os
@@ -40,7 +40,7 @@ vlm_model = InferenceClientModel(
 vlm_agent = CodeAgent(
     model=vlm_model,
-    tools=[download_image, image_processing, object_detection, ocr_scan],
     max_steps=4,
     name="vlm_agent",
     description="This agent is responsible for downloading images, processing images, detecting objects in them and extracting text from them."
@@ -68,7 +68,7 @@ pandas_model = InferenceClientModel(
 pandas_agent = CodeAgent(
     model=pandas_model,
-    tools=[to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby],
     max_steps=4,
     name="pandas_agent",
     description="This agent is responsible for converting data to a dataframe, performing pandas operations on such dataframe and converting the dataframe back to a json or a csv file."

 from smolagents import CodeAgent, InferenceClientModel
+from tools import sort_list, operate_two_numbers, convert_number, load_dataframe_from_csv
 from tools import to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby
+from vlm_tools import download_image, image_processing, object_detection_tool, ocr_scan
 from audio_tools import audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization
 from community_tools import community_tools
 import os
 vlm_agent = CodeAgent(
     model=vlm_model,
+    tools=[download_image, image_processing, object_detection_tool, ocr_scan],
     max_steps=4,
     name="vlm_agent",
     description="This agent is responsible for downloading images, processing images, detecting objects in them and extracting text from them."
 pandas_agent = CodeAgent(
     model=pandas_model,
+    tools=[load_dataframe_from_csv, to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby],
     max_steps=4,
     name="pandas_agent",
     description="This agent is responsible for converting data to a dataframe, performing pandas operations on such dataframe and converting the dataframe back to a json or a csv file."

tools.py CHANGED Viewed

@@ -134,6 +134,17 @@ def convert_number(orig_num: any, operation: Literal["to_base", "type_cast"], ne
     else:
         raise ValueError("operation must be one of the following: to_base, type_cast")
 @tool
 def to_dataframe(data: List[dict], columns: List[str])->pd.DataFrame:
     """

     else:
         raise ValueError("operation must be one of the following: to_base, type_cast")
+@tool
+def load_dataframe_from_csv(file_path: str)->pd.DataFrame:
+    """
+    Load a pandas DataFrame from a CSV file
+    Args:
+        file_path: The path to the CSV file to load.
+    Returns:
+        The pandas DataFrame
+    """
+    return pd.read_csv(file_path)
 @tool
 def to_dataframe(data: List[dict], columns: List[str])->pd.DataFrame:
     """

vlm_tools.py CHANGED Viewed

@@ -8,6 +8,7 @@ from io import BytesIO
 from PIL import Image
 from langchain_core.tools import tool as langchain_tool
 from smolagents.tools import Tool, tool
 def pre_processing(image: str, input_size=(416, 416))->np.ndarray:
@@ -109,49 +110,57 @@ def image_processing(image: str, brightness: float = 1.0, contrast: float = 1.0)
 onnx_path = "vlm_assets/yolov3-8.onnx"
 names_path = "vlm_assets/obj.names"
-@tool
-def object_detection(image: str, onnx_path: str = onnx_path, names_path: str = names_path)->list:
-    """
-    Detect objects in an image
-    Args:
-        image: The image in base64 format to detect objects in
-        onnx_path: The path to the onnx file
-        names_path: The path to the names file
-    Returns:
-        The detected objects
-    """
-    img = pre_processing(image)
-    # Load ONNX model
-    onnx_model = onnxruntime.InferenceSession(onnx_path)
-    # Load class labels
-    with open(names_path, 'r') as f:
-        classes = [line.strip() for line in f.readlines()]
-    # Preprocess the image
-    blob = cv2.dnn.blobFromImage(img, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
-    onnx_input = {onnx_model.get_inputs()[0].name: blob}
-    onnx_output = onnx_model.run(None, onnx_input)
-    detected_objects = post_processing(onnx_output, classes, img.shape)
-    return detected_objects
 @tool
-def ocr_scan(image: str)->str:
     """
     Scan an image for text
     Args:
-        image: The image in base64 format to scan for text
     Returns:
-        The text in the image
     """
-    image_data = base64.b64decode(image)
-    img = Image.open(BytesIO(image_data))
-    scanned_text = pytesseract.image_to_string(img)
     return scanned_text

 from PIL import Image
 from langchain_core.tools import tool as langchain_tool
 from smolagents.tools import Tool, tool
+from typing import List
 def pre_processing(image: str, input_size=(416, 416))->np.ndarray:
 onnx_path = "vlm_assets/yolov3-8.onnx"
 names_path = "vlm_assets/obj.names"
+class ObjectDetectionTool(Tool):
+    description = "Detect objects in a list of frames (images)"
+    name = "object_detection"
+    inputs = {
+        "frames": {"type": List[str], "description": "The list of frames (images) to detect objects in"},
+        "onnx_path": {"type": "string", "description": "The path to the onnx file"},
+        "names_path": {"type": "string", "description": "The path to the names file"}
+    }
+    output_type = List[List[str]]
+    def setup(self):
+        # Load ONNX model
+        self.onnx_path = onnx_path
+        self.names_path = names_path
+        self.onnx_model = onnxruntime.InferenceSession(self.onnx_path)
+    def forward(self, frames: List[str])->List[List[str]]:
+        # Load class labels
+        with open(self.names_path, 'r') as f:
+            classes = [line.strip() for line in f.readlines()]
+        detected_objects = []
+        for frame in frames:
+            img = pre_processing(frame)
+            # Preprocess the image
+            blob = cv2.dnn.blobFromImage(img, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
+            onnx_input = {self.onnx_model.get_inputs()[0].name: blob}
+            onnx_output = self.onnx_model.run(None, onnx_input)
+            detected_objects.append(post_processing(onnx_output, classes, img.shape))
+        return detected_objects
 @tool
+def ocr_scan(frames: List[str])->List[List[str]]:
     """
     Scan an image for text
     Args:
+        frames: The list of frames (images) to scan for text
     Returns:
+        The list of text in the images
     """
+    scanned_text = []
+    for frame in frames:
+        image_data = base64.b64decode(frame)
+        img = Image.open(BytesIO(image_data))
+        scanned_text.append(pytesseract.image_to_string(img))
     return scanned_text
+object_detection_tool = ObjectDetectionTool()