Spaces:
Sleeping
Sleeping
Add pandas tool and modify vlm tools
Browse files- mini_agents.py +4 -4
- tools.py +11 -0
- vlm_tools.py +41 -32
mini_agents.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
from smolagents import CodeAgent, InferenceClientModel
|
| 2 |
-
from tools import sort_list, operate_two_numbers, convert_number
|
| 3 |
from tools import to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby
|
| 4 |
-
from vlm_tools import download_image, image_processing,
|
| 5 |
from audio_tools import audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization
|
| 6 |
from community_tools import community_tools
|
| 7 |
import os
|
|
@@ -40,7 +40,7 @@ vlm_model = InferenceClientModel(
|
|
| 40 |
|
| 41 |
vlm_agent = CodeAgent(
|
| 42 |
model=vlm_model,
|
| 43 |
-
tools=[download_image, image_processing,
|
| 44 |
max_steps=4,
|
| 45 |
name="vlm_agent",
|
| 46 |
description="This agent is responsible for downloading images, processing images, detecting objects in them and extracting text from them."
|
|
@@ -68,7 +68,7 @@ pandas_model = InferenceClientModel(
|
|
| 68 |
|
| 69 |
pandas_agent = CodeAgent(
|
| 70 |
model=pandas_model,
|
| 71 |
-
tools=[to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby],
|
| 72 |
max_steps=4,
|
| 73 |
name="pandas_agent",
|
| 74 |
description="This agent is responsible for converting data to a dataframe, performing pandas operations on such dataframe and converting the dataframe back to a json or a csv file."
|
|
|
|
| 1 |
from smolagents import CodeAgent, InferenceClientModel
|
| 2 |
+
from tools import sort_list, operate_two_numbers, convert_number, load_dataframe_from_csv
|
| 3 |
from tools import to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby
|
| 4 |
+
from vlm_tools import download_image, image_processing, object_detection_tool, ocr_scan
|
| 5 |
from audio_tools import audio_to_base64, noise_reduction, audio_segmentation, speaker_diarization
|
| 6 |
from community_tools import community_tools
|
| 7 |
import os
|
|
|
|
| 40 |
|
| 41 |
vlm_agent = CodeAgent(
|
| 42 |
model=vlm_model,
|
| 43 |
+
tools=[download_image, image_processing, object_detection_tool, ocr_scan],
|
| 44 |
max_steps=4,
|
| 45 |
name="vlm_agent",
|
| 46 |
description="This agent is responsible for downloading images, processing images, detecting objects in them and extracting text from them."
|
|
|
|
| 68 |
|
| 69 |
pandas_agent = CodeAgent(
|
| 70 |
model=pandas_model,
|
| 71 |
+
tools=[load_dataframe_from_csv, to_dataframe, to_json, get_dataframe_data, get_dataframe_column, get_dataframe_row, get_dataframe_groupby],
|
| 72 |
max_steps=4,
|
| 73 |
name="pandas_agent",
|
| 74 |
description="This agent is responsible for converting data to a dataframe, performing pandas operations on such dataframe and converting the dataframe back to a json or a csv file."
|
tools.py
CHANGED
|
@@ -134,6 +134,17 @@ def convert_number(orig_num: any, operation: Literal["to_base", "type_cast"], ne
|
|
| 134 |
else:
|
| 135 |
raise ValueError("operation must be one of the following: to_base, type_cast")
|
| 136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
@tool
|
| 138 |
def to_dataframe(data: List[dict], columns: List[str])->pd.DataFrame:
|
| 139 |
"""
|
|
|
|
| 134 |
else:
|
| 135 |
raise ValueError("operation must be one of the following: to_base, type_cast")
|
| 136 |
|
| 137 |
+
@tool
def load_dataframe_from_csv(file_path: str) -> pd.DataFrame:
    """
    Load a pandas DataFrame from a CSV file
    Args:
        file_path: The path to the CSV file to load.
    Returns:
        The pandas DataFrame
    """
    # No parsing options are passed, so pandas' read_csv defaults apply.
    dataframe = pd.read_csv(file_path)
    return dataframe
|
| 147 |
+
|
| 148 |
@tool
|
| 149 |
def to_dataframe(data: List[dict], columns: List[str])->pd.DataFrame:
|
| 150 |
"""
|
vlm_tools.py
CHANGED
|
@@ -8,6 +8,7 @@ from io import BytesIO
|
|
| 8 |
from PIL import Image
|
| 9 |
from langchain_core.tools import tool as langchain_tool
|
| 10 |
from smolagents.tools import Tool, tool
|
|
|
|
| 11 |
|
| 12 |
def pre_processing(image: str, input_size=(416, 416))->np.ndarray:
|
| 13 |
|
|
@@ -109,49 +110,57 @@ def image_processing(image: str, brightness: float = 1.0, contrast: float = 1.0)
|
|
| 109 |
onnx_path = "vlm_assets/yolov3-8.onnx"
|
| 110 |
names_path = "vlm_assets/obj.names"
|
| 111 |
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
""
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
-
|
| 138 |
|
| 139 |
-
|
| 140 |
|
| 141 |
@tool
|
| 142 |
-
def ocr_scan(
|
| 143 |
"""
|
| 144 |
Scan an image for text
|
| 145 |
Args:
|
| 146 |
-
|
| 147 |
Returns:
|
| 148 |
-
The text in the
|
| 149 |
"""
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
| 153 |
return scanned_text
|
| 154 |
|
|
|
|
| 155 |
|
| 156 |
|
| 157 |
|
|
|
|
| 8 |
from PIL import Image
|
| 9 |
from langchain_core.tools import tool as langchain_tool
|
| 10 |
from smolagents.tools import Tool, tool
|
| 11 |
+
from typing import List
|
| 12 |
|
| 13 |
def pre_processing(image: str, input_size=(416, 416))->np.ndarray:
|
| 14 |
|
|
|
|
| 110 |
onnx_path = "vlm_assets/yolov3-8.onnx"
|
| 111 |
names_path = "vlm_assets/obj.names"
|
| 112 |
|
| 113 |
+
class ObjectDetectionTool(Tool):
    """Tool that runs the ONNX detection model over a list of frames.

    The model and label file locations come from the module-level
    ``onnx_path`` and ``names_path`` values; they are not tool inputs.
    """

    description = "Detect objects in a list of frames (images)"
    name = "object_detection"
    # smolagents expects each input "type" (and output_type) to be one of its
    # authorized type strings, and the input keys to match forward()'s
    # parameters exactly, so only `frames` is declared here.
    inputs = {
        "frames": {
            "type": "array",
            "description": "The list of frames (images) to detect objects in",
        },
    }
    output_type = "array"

    def setup(self):
        """Load the ONNX session and the class labels once, before first use."""
        self.onnx_path = onnx_path
        self.names_path = names_path
        self.onnx_model = onnxruntime.InferenceSession(self.onnx_path)

        # One label per line; blank lines are kept so that indices stay
        # aligned with the file's line order.
        with open(self.names_path, 'r') as f:
            self.classes = [line.strip() for line in f]

        # Mark the tool as initialized so the session is not rebuilt on
        # every call.
        self.is_initialized = True

    def forward(self, frames: List[str]) -> List[List[str]]:
        """Detect objects in every frame.

        Args:
            frames: The frames (images) to analyse, in the format accepted by
                ``pre_processing``.

        Returns:
            One ``post_processing`` result per input frame, in input order
            (presumably a list of detected class names - confirm against
            ``post_processing``).
        """
        # The model's input name does not change between frames.
        input_name = self.onnx_model.get_inputs()[0].name

        detected_objects = []
        for frame in frames:
            img = pre_processing(frame)

            # Preprocess the image
            blob = cv2.dnn.blobFromImage(img, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
            onnx_output = self.onnx_model.run(None, {input_name: blob})

            detected_objects.append(post_processing(onnx_output, self.classes, img.shape))

        return detected_objects
|
| 146 |
|
| 147 |
@tool
def ocr_scan(frames: List[str]) -> List[str]:
    """
    Scan a list of images for text
    Args:
        frames: The list of frames (images) to scan for text, each a base64-encoded image
    Returns:
        The list of text in the images, one string per frame
    """
    scanned_text = []
    for frame in frames:
        image_data = base64.b64decode(frame)
        # Use a context manager so the decoded image is closed after OCR
        # instead of lingering until garbage collection.
        with Image.open(BytesIO(image_data)) as img:
            scanned_text.append(pytesseract.image_to_string(img))
    return scanned_text
|
| 162 |
|
| 163 |
+
# Module-level tool instance; mini_agents.py imports it by this name and
# passes it in the vlm_agent's `tools` list.
object_detection_tool = ObjectDetectionTool()
|
| 164 |
|
| 165 |
|
| 166 |
|