nikhmr1235 committed on
Commit
4409e88
·
verified ·
1 Parent(s): 480b629

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -1
app.py CHANGED
@@ -22,7 +22,7 @@ from langchain_openai import ChatOpenAI
22
  from openai import OpenAI
23
 
24
  # tools imported from helper.py
25
- from helper import repl_tool, get_travily_api_search_tool,audio_transcriber_tool,wikipedia_search_tool,file_saver_tool,wikipedia_full_content_tool,serpapi_Google_Search_tool, gemini_multimodal_tool
26
 
27
 
28
 
@@ -102,6 +102,86 @@ class BasicAgent:
102
  return self.invoke_with_retry(question)
103
 
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
 
107
  def run_and_submit_all( profile: gr.OAuthProfile | None):
@@ -135,6 +215,13 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
135
  return "OpenAI API key not found. Please set OPENAI_API_KEY environment variable.", None
136
  print(f"Using OpenAI API key: {openai_api_key[:4]}... (truncated for security)")
137
 
 
 
 
 
 
 
 
138
  #NMODEL
139
  #'''
140
  llm_client = ChatGoogleGenerativeAI(
@@ -159,6 +246,8 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
159
  return "Tavily API key not found. Please set TAVILY_API_KEY environment variable.", None
160
  print(f"Using Tavily API key: {tavily_api_key[:4]}... (truncated for security)")
161
 
 
 
162
  travily_api_search_tool = get_travily_api_search_tool(tavily_api_key)
163
  #tools = [travily_api_search_tool, repl_tool, file_saver_tool,audio_transcriber_tool,wikipedia_search_tool,wikipedia_full_content_tool]
164
  tools = [ repl_tool, file_saver_tool,audio_transcriber_tool,travily_api_search_tool, gemini_multimodal_tool]
 
22
  from openai import OpenAI
23
 
24
  # tools imported from helper.py
25
+ from helper import repl_tool, get_travily_api_search_tool,audio_transcriber_tool,wikipedia_search_tool,file_saver_tool,wikipedia_full_content_tool,serpapi_Google_Search_tool
26
 
27
 
28
 
 
102
  return self.invoke_with_retry(question)
103
 
104
 
105
+ import base64
106
+ from langchain.tools import Tool
107
+ from langchain_google_genai import ChatGoogleGenerativeAI
108
+ from langchain_core.messages import HumanMessage
109
+ import os
110
+
111
def analyze_image_with_gemini(args: dict) -> str:
    """
    Analyzes an image using Google's Gemini Multimodal LLM to answer a given question.
    This tool is designed for tasks requiring visual understanding, such as
    describing image content, identifying objects, or answering questions about
    information presented visually (e.g., charts, diagrams, chess boards).

    **Input Format (CRITICAL):**
    The input MUST be a JSON string with 'image_path' and 'question' keys.
    - 'image_path': The local file path to the image (e.g., 'path/to/my_image.png').
      This image MUST have been previously downloaded and saved locally using the 'file_saver' tool.
    - 'question': The question to answer based on the image content.

    Example: '{"image_path": "downloaded_image.png", "question": "What is depicted in this image?"}'
    Example: '{"image_path": "chess_board.jpg", "question": "What is the next best move in this chess position?"}'

    **DO NOT:**
    - Pass URLs directly to this tool; always use 'file_saver' first.
    - Ask questions unrelated to the image content.
    - Expect real-time actions or external website access.

    **Output:**
    The tool returns the answer generated by the Gemini Multimodal LLM based on the image and question.
    Returns an informative error message if the image file is not found,
    the API key is missing, or the LLM encounters an issue.
    """
    # NOTE: the docstring above doubles as the agent-facing tool description
    # (it is passed as `description=analyze_image_with_gemini.__doc__`), so it
    # is worded for the LLM and its text should be changed with care.
    import json
    import mimetypes

    # The fallback model named by the original code. The previous runtime
    # lookup through `ChatGoogleGenerativeAI.get_available_models(...)` is not
    # an attribute of the chat model class, so it raised on every call and the
    # tool could never reach the LLM.
    model_name = "gemini-2.0-flash"

    try:
        # Agents usually hand a tool its input as a JSON string, not a dict.
        if isinstance(args, str):
            try:
                args = json.loads(args)
            except json.JSONDecodeError as exc:
                return (
                    "Error: Input must be a JSON string with 'image_path' and "
                    f"'question' keys ({exc})."
                )

        if not isinstance(args, dict):
            return "Error: Input must be a JSON object with 'image_path' and 'question' keys."

        image_path = args.get("image_path")
        question = args.get("question")

        if not image_path or not question:
            return "Error: Both 'image_path' and 'question' must be provided."

        if not os.path.exists(image_path):
            return f"Error: Local image file not found at '{image_path}'. Did you save it with 'file_saver'?"

        google_api_key = os.getenv("GOOGLE_API_KEY")
        if not google_api_key:
            return "Error: GOOGLE_API_KEY not found in environment variables for multimodal tool."

        llm = ChatGoogleGenerativeAI(
            model=model_name,
            google_api_key=google_api_key,
            temperature=0.0,  # deterministic, factual answers
        )

        # Label the payload with the file's real image type instead of always
        # claiming PNG; fall back to PNG when the extension is unknown.
        mime_type, _ = mimetypes.guess_type(image_path)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/png"

        # Inline the image as a base64 data URL for the multimodal message.
        with open(image_path, "rb") as image_file:
            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")

        message = HumanMessage(
            content=[
                {"type": "text", "text": question},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime_type};base64,{image_base64}"},
                },
            ]
        )

        response = llm.invoke([message])
        answer = response.content
        # Message content may be a list of parts; keep the declared `str` return.
        return answer if isinstance(answer, str) else str(answer)

    except Exception as e:
        # Tool boundary: the agent expects a string back, so any failure is
        # reported as text rather than raised.
        return f"Error in gemini_multimodal_tool: {e}"
185
 
186
 
187
  def run_and_submit_all( profile: gr.OAuthProfile | None):
 
215
  return "OpenAI API key not found. Please set OPENAI_API_KEY environment variable.", None
216
  print(f"Using OpenAI API key: {openai_api_key[:4]}... (truncated for security)")
217
 
218
+
219
+ # Define the Tool object for the agent
220
+ gemini_multimodal_tool = Tool(
221
+ name="gemini_multimodal_tool",
222
+ description=analyze_image_with_gemini.__doc__, # Use the docstring as description
223
+ func=analyze_image_with_gemini,
224
+ )
225
  #NMODEL
226
  #'''
227
  llm_client = ChatGoogleGenerativeAI(
 
246
  return "Tavily API key not found. Please set TAVILY_API_KEY environment variable.", None
247
  print(f"Using Tavily API key: {tavily_api_key[:4]}... (truncated for security)")
248
 
249
+
250
+
251
  travily_api_search_tool = get_travily_api_search_tool(tavily_api_key)
252
  #tools = [travily_api_search_tool, repl_tool, file_saver_tool,audio_transcriber_tool,wikipedia_search_tool,wikipedia_full_content_tool]
253
  tools = [ repl_tool, file_saver_tool,audio_transcriber_tool,travily_api_search_tool, gemini_multimodal_tool]