giulia-fontanella committed on
Commit
9e53814
·
verified ·
1 Parent(s): b6f6740

Update tools.py

Browse files
Files changed (1) hide show
  1. tools.py +100 -99
tools.py CHANGED
@@ -46,109 +46,110 @@ def read_python(file_path: str) -> str:
46
  except Exception as e:
47
  return f"Error reading Python file: {str(e)}"
48
 
49
-
50
- @tool
51
- def extract_text_from_image(img_path: str) -> str:
52
- """
53
- Extract text from an image file using a multimodal model.
54
-
55
- Args:
56
- img_path: A string representing the url of an image (e.g., PNG, JPEG).
57
-
58
- Returns:
59
- A single string containing the concatenated text extracted from the image.
60
- """
61
- all_text = ""
62
- try:
63
- # Read image and encode as base64
64
- with open(img_path, "rb") as image_file:
65
- image_bytes = image_file.read()
66
-
67
- image_base64 = base64.b64encode(image_bytes).decode("utf-8")
68
-
69
- # Prepare the prompt including the base64 image data
70
- message = [
71
- HumanMessage(
72
- content=[
73
- {
74
- "type": "text",
75
- "text": (
76
- "Extract all the text from this image. "
77
- "Return only the extracted text, no explanations."
78
- ),
79
- },
80
- {
81
- "type": "image_url",
82
- "image_url": {
83
- "url": f"data:image/png;base64,{image_base64}"
84
  },
85
- },
86
- ]
87
- )
88
- ]
89
-
90
- # Call the vision-capable model
91
- response = vision_llm.invoke(message)
92
-
93
- # Append extracted text
94
- all_text += response.content + "\n\n"
95
-
96
- return all_text.strip()
97
- except Exception as e:
98
- error_msg = f"Error extracting text: {str(e)}"
99
- print(error_msg)
100
- return ""
101
-
102
-
103
- @tool
104
- def describe_image(img_path: str, query: str) -> str:
105
- """
106
- Generate a detailed description of an image using a multimodal model.
107
- This function reads a image from an url, encodes it, and sends it to a
108
- vision-capable language model to obtain a comprehensive, natural language
109
- description of the image's content, including its objects, actions, and context,
110
- following a specific query.
111
 
112
- Args:
113
- img_path: A string representing the url of an image (e.g., PNG, JPEG).
114
- query: Information to extract from the image.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
- Returns:
117
- A single string containing a detailed description of the image.
118
- """
119
- try:
120
- # Read image and encode as base64
121
- with open(img_path, "rb") as image_file:
122
- image_bytes = image_file.read()
123
-
124
- image_base64 = base64.b64encode(image_bytes).decode("utf-8")
125
-
126
- # Prepare message payload
127
- message = [
128
- HumanMessage(
129
- content=[
130
- {
131
- "type": "text",
132
- "text": (
133
- f"Describe this image in rich detail. Include objects, people, setting, background elements, and any inferred actions or context. Avoid technical jargon. In particular, extract the following information: {query}" ),
134
- },
135
- {
136
- "type": "image_url",
137
- "image_url": {
138
- "url": f"data:image/png;base64,{image_base64}"
139
  },
140
- },
141
- ]
142
- )
143
- ]
144
- response = vision_llm.invoke(message)
145
- return response.content.strip()
146
-
147
- except Exception as e:
148
- error_msg = f"Error describing image: {str(e)}"
149
- print(error_msg)
150
- return ""
151
-
 
 
 
 
 
152
 
153
  @tool
154
  def wiki_search(query: str) -> str:
 
46
  except Exception as e:
47
  return f"Error reading Python file: {str(e)}"
48
 
49
def make_text_from_image_tool(vision_llm):
    """
    Build the `extract_text_from_image` tool bound to a given vision model.

    The model is captured by closure so the tool no longer depends on a
    module-level `vision_llm` global.

    Args:
        vision_llm: Model object whose `invoke(messages)` returns a response
            exposing a `.content` string.

    Returns:
        The `@tool`-decorated `extract_text_from_image` callable.
    """
    # Function-scope import: the file's top-level import block is not touched.
    import mimetypes

    @tool
    def extract_text_from_image(img_path: str) -> str:
        """
        Extract text from an image file using a multimodal model.

        Args:
            img_path: A string representing the local file path of an image (e.g., PNG, JPEG).

        Returns:
            A single string containing the concatenated text extracted from the image.
        """
        try:
            # Read image and encode as base64
            with open(img_path, "rb") as image_file:
                image_base64 = base64.b64encode(image_file.read()).decode("utf-8")

            # Label the payload with the real image type when the extension
            # reveals it; fall back to the previous hard-coded PNG otherwise.
            mime_type = mimetypes.guess_type(img_path)[0] or ""
            if not mime_type.startswith("image/"):
                mime_type = "image/png"

            # Prepare the prompt including the base64 image data
            message = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": (
                                "Extract all the text from this image. "
                                "Return only the extracted text, no explanations."
                            ),
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{image_base64}"
                            },
                        },
                    ]
                )
            ]

            # Call the vision-capable model
            response = vision_llm.invoke(message)
            return response.content.strip()
        except Exception as e:
            # Best-effort tool: report the failure on stdout and hand the
            # caller an empty result instead of raising.
            error_msg = f"Error extracting text: {str(e)}"
            print(error_msg)
            return ""

    # Without this return the factory yields None and no tool is registered.
    return extract_text_from_image
101
+
102
+
103
def make_describe_image_tool(vision_llm):
    """
    Build the `describe_image` tool bound to a given vision model.

    The model is captured by closure so the tool no longer depends on a
    module-level `vision_llm` global.

    Args:
        vision_llm: Model object whose `invoke(messages)` returns a response
            exposing a `.content` string.

    Returns:
        The `@tool`-decorated `describe_image` callable.
    """
    # Function-scope import: the file's top-level import block is not touched.
    import mimetypes

    @tool
    def describe_image(img_path: str, query: str) -> str:
        """
        Generate a detailed description of an image using a multimodal model.
        This function reads an image from a local file path, encodes it, and sends it to a
        vision-capable language model to obtain a comprehensive, natural language
        description of the image's content, including its objects, actions, and context,
        following a specific query.

        Args:
            img_path: A string representing the local file path of an image (e.g., PNG, JPEG).
            query: Information to extract from the image.

        Returns:
            A single string containing a detailed description of the image.
        """
        try:
            # Read image and encode as base64
            with open(img_path, "rb") as image_file:
                image_base64 = base64.b64encode(image_file.read()).decode("utf-8")

            # Label the payload with the real image type when the extension
            # reveals it; fall back to the previous hard-coded PNG otherwise.
            mime_type = mimetypes.guess_type(img_path)[0] or ""
            if not mime_type.startswith("image/"):
                mime_type = "image/png"

            # Prepare message payload
            message = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": (
                                "Describe this image in rich detail. "
                                "Include objects, people, setting, background elements, "
                                "and any inferred actions or context. "
                                "Avoid technical jargon. "
                                f"In particular, extract the following information: {query}"
                            ),
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{image_base64}"
                            },
                        },
                    ]
                )
            ]
            response = vision_llm.invoke(message)
            return response.content.strip()

        except Exception as e:
            # Best-effort tool: report the failure on stdout and hand the
            # caller an empty result instead of raising.
            error_msg = f"Error describing image: {str(e)}"
            print(error_msg)
            return ""

    # Without this return the factory yields None and no tool is registered.
    return describe_image
152
+
153
 
154
  @tool
155
  def wiki_search(query: str) -> str: