nikhmr1235 committed on
Commit
169060d
·
verified ·
1 Parent(s): 9192353

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -181
app.py CHANGED
@@ -4,6 +4,7 @@ import gradio as gr
4
  import inspect
5
  import pandas as pd
6
  import time
 
7
  from langchain_google_genai import ChatGoogleGenerativeAI
8
  from langchain_community.tools import TavilySearchResults
9
  from langchain import hub # Used to pull predefined prompts from LangChain Hub
@@ -22,7 +23,7 @@ from langchain_openai import ChatOpenAI
22
  from openai import OpenAI
23
 
24
  # tools imported from helper.py
25
- from helper import repl_tool, get_travily_api_search_tool,audio_transcriber_tool,wikipedia_search_tool,file_saver_tool,wikipedia_full_content_tool,serpapi_Google_Search_tool
26
 
27
 
28
 
@@ -102,87 +103,6 @@ class BasicAgent:
102
  return self.invoke_with_retry(question)
103
 
104
 
105
- import base64
106
- from langchain.tools import Tool
107
- from langchain_google_genai import ChatGoogleGenerativeAI
108
- from langchain_core.messages import HumanMessage
109
- import os
110
-
111
- def analyze_image_with_gemini(args: dict) -> str:
112
- """
113
- Analyzes an image using Google's Gemini Multimodal LLM to answer a given question.
114
- This tool is designed for tasks requiring visual understanding, such as
115
- describing image content, identifying objects, or answering questions about
116
- information presented visually (e.g., charts, diagrams, chess boards).
117
-
118
- **Input Format (CRITICAL):**
119
- The input MUST be a JSON string with 'image_path' and 'question' keys.
120
- - 'image_path': The local file path to the image (e.g., 'path/to/my_image.png').
121
- This image MUST have been previously downloaded and saved locally using the 'file_saver' tool.
122
- - 'question': The question to answer based on the image content.
123
-
124
- Example: '{"image_path": "downloaded_image.png", "question": "What is depicted in this image?"}'
125
- Example: '{"image_path": "chess_board.jpg", "question": "What is the next best move in this chess position?"}'
126
-
127
- **DO NOT:**
128
- - Pass URLs directly to this tool; always use 'file_saver' first.
129
- - Ask questions unrelated to the image content.
130
- - Expect real-time actions or external website access.
131
-
132
- **Output:**
133
- The tool returns the answer generated by the Gemini Multimodal LLM based on the image and question.
134
- Returns an informative error message if the image file is not found,
135
- the API key is missing, or the LLM encounters an issue.
136
- """
137
- try:
138
- # Ensure the input is parsed if it comes as a string (common from LLMs)
139
- if isinstance(args, str):
140
- import json
141
- args = json.loads(args)
142
-
143
- image_path = args.get("image_path")
144
- question = args.get("question")
145
-
146
- if not image_path or not question:
147
- return "Error: Both 'image_path' and 'question' must be provided."
148
-
149
- if not os.path.exists(image_path):
150
- return f"Error: Local image file not found at '{image_path}'. Did you save it with 'file_saver'?"
151
-
152
- google_api_key = os.getenv("GOOGLE_API_KEY")
153
-
154
- if not google_api_key:
155
- return "Error: GOOGLE_API_KEY not found in environment variables for multimodal tool."
156
-
157
- # Initialize the multimodal LLM (Gemini-Pro-Vision is recommended for image understanding)
158
- # The model is currently hard-coded to 'gemini-2.0-flash'; the dynamic 'gemini-pro-vision' availability check below is commented out.
159
- llm = ChatGoogleGenerativeAI(
160
- #model="gemini-pro-vision" if "gemini-pro-vision" in ChatGoogleGenerativeAI.get_available_models(google_api_key) else "gemini-2.0-flash",
161
- model="gemini-2.0-flash",
162
- google_api_key=google_api_key,
163
- temperature=0.0 # Set temperature to 0 for more factual/deterministic responses
164
- )
165
-
166
- # Load the image as base64 for multimodal input
167
- with open(image_path, "rb") as f:
168
- image_bytes = f.read()
169
- # Encode image to base64
170
- image_base64 = base64.b64encode(image_bytes).decode('utf-8')
171
-
172
- # Create a multimodal message for the LLM
173
- message = HumanMessage(
174
- content=[
175
- {"type": "text", "text": question},
176
- {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
177
- ]
178
- )
179
-
180
- # Invoke the LLM
181
- response = llm.invoke([message])
182
- return response.content
183
-
184
- except Exception as e:
185
- return f"Error in gemini_multimodal_tool: {e}"
186
 
187
 
188
  def run_and_submit_all( profile: gr.OAuthProfile | None):
@@ -217,12 +137,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
217
  print(f"Using OpenAI API key: {openai_api_key[:4]}... (truncated for security)")
218
 
219
 
220
- # Define the Tool object for the agent
221
- gemini_multimodal_tool = Tool(
222
- name="gemini_multimodal_tool",
223
- description=analyze_image_with_gemini.__doc__, # Use the docstring as description
224
- func=analyze_image_with_gemini,
225
- )
226
  #NMODEL
227
  #'''
228
  llm_client = ChatGoogleGenerativeAI(
@@ -256,95 +171,6 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
256
  # Pull a predefined prompt from LangChain Hub
257
  # "hwchase17/react-chat" is a prompt template designed for ReAct-style conversational agents.
258
  #prompt = hub.pull("hwchase17/react-chat")
259
- '''
260
- prompt = PromptTemplate(
261
- input_variables=["input", "agent_scratchpad", "chat_history", "tool_names"],
262
- template="""
263
- You are a smart and helpful AI Agent/Assistant. You are allowed and encouraged to use one or more tools as needed to answer complex questions and perform tasks.
264
- It is CRUCIAL that you ALWAYS follow the exact format below. Do not deviate.
265
- NOTE: it is MANDATORY for you to be precise and concise in your response. Respond directly with ONLY the answer, without any introductory phrases or additional details.
266
- For example, if asked for the number of letters in the English alphabet, respond with '26'. Do NOT say "The number of letters is 26."
267
-
268
- You have access to the following tools:
269
- {tools}
270
-
271
- To use a tool, you MUST follow this precise format:
272
-
273
- Thought: I need to use a tool to find the answer.
274
- Action: [tool_name] # This will be one of [{tool_names}]
275
- Action Input: [input_for_the_tool]
276
- Observation: [result_from_the_tool]
277
-
278
- IMPORTANT NOTE ON TOOL USAGE:
279
- - If an 'Observation' from a tool does NOT directly contain the specific answer to your question, you MUST refine your query or switch to a different, more suitable tool (e.g., 'tavily_search' for broader or more current information if 'wikipedia_search_tool' was insufficient). Do NOT get stuck repeatedly using the same tool if it's not yielding the direct answer.
280
- - If the input contains the exact phrase "Attachment '{{file_name}}' available at: {{attachment_url}}" (where '{{file_name}}' and '{{attachment_url}}' are placeholders for actual values), consider the file type:
281
- - If the file type is binary/text (e.g., .xlsx, .docx, .mp3, .jpg, .pdf,.png), you MUST use the 'file_saver' tool to download and save it.
282
- For 'file_saver', the Action Input must be a JSON string like: '{{"url": "the_attachment_url", "local_filename": "the_file_name_from_attachment"}}'
283
- example: for input, Attachment '1f975693-876d-457b-a649-393859e79bf3.mp3' available at EXACT URL: https://agents-course-unit4-scoring.hf.space/files/1f975693-876d-457b-a649-393859e79bf3, Action Input for file_saver would be '{{"url": "https://agents-course-unit4-scoring.hf.space/files/1f975693-876d-457b-a649-393859e79bf3", "local_filename": "1f975693-876d-457b-a649-393859e79bf3.mp3"}}'
284
-
285
-
286
- IMPORTANT: When processing audio files (like .mp3) that have been saved using 'file_saver', the 'audio_transcriber_tool' MUST be used with the 'local_filename' of the saved audio file as its Action Input. Do NOT pass URLs or remote paths directly to 'audio_transcriber_tool'.
287
- For any incoming image files (e.g., .jpg, .png), it's crucial to download and save them locally using the 'file_saver' tool. Once the image is saved, you should then decide whether to utilize other available tools or your Multimodal LLM to formulate a response. If you have sufficient information and can provide a CONCISE response, or if no tool is needed, you MUST use this precise format:
288
-
289
- if you can use a LLM to answer the question, think step-by-step and then answer the question.
290
- Example: given a chess board image and asked to predict the next best move, if Multi-modal LLM is available, you can use it to answer the question.
291
-
292
- Thought: I have enough information, or no tool is needed.
293
- Final Answer: [your concise/short response here]
294
-
295
- NOTE: it is MANDATORY for you to be precise and concise in your response. Respond directly with ONLY the answer, without any introductory phrases or additional details.
296
- For example, if asked for the number of letters in the English alphabet, respond with '26'. Do NOT say "The number of letters is 26."
297
- VERY IMPORTANT: Your response MUST always start with 'Thought:'.
298
-
299
- Here are some examples of how you should respond:
300
-
301
- Example 1:
302
- Question: What is the capital of France?
303
- Thought: I need to use a tool to find the capital of France.
304
- Action: tavily_search_results
305
- Action Input: capital of France
306
- Observation: The capital of France is Paris.
307
- Thought: I have found the answer.
308
- Final Answer: Paris
309
-
310
- Example 2:
311
- Question: What is 2 + 2?
312
- Thought: This is a simple arithmetic question, no tool is needed.
313
- Final Answer: 4
314
-
315
- Example 3:
316
- Question: How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.
317
- Thought: The user is asking for specific information from Wikipedia, likely requiring a list or discography. The `travily_api_search_tool` is best for this to get the detailed section. After getting the content, I will need to parse it using `python_repl` to count the albums within the specified years.
318
- Action: serpapi_Google Search
319
- Action Input: Mercedes Sosa section: Discography
320
- Observation: [Discography text content]
321
- Thought: I have retrieved the discography text. Now I need to parse this text to identify and count studio albums released between 2000 and 2009. I will use the `python_repl` tool for this.
322
- Action: python_repl
323
- Action Input:
324
- ```python
325
- import re
326
- text = "[Discography text content from previous observation]" # Replace with actual text
327
- albums_2000_2009 = []
328
- # This is a simplified regex example; actual parsing might be more complex depending on text format
329
- pattern = r"\((\d{{4}})\s*(.*?)(?:\[|\n|$)"
330
- for match in re.finditer(pattern, text):
331
- year = int(match.group(1))
332
- if 2000 <= year <= 2009:
333
- albums_2000_2009.append(match.group(2).strip())
334
- print(len(albums_2000_2009))
335
- Observation: 3
336
- Thought: I have parsed the discography and counted the albums. I have found the answer.
337
- Final Answer: 3
338
- ---
339
- Previous conversation history:
340
- {chat_history}
341
-
342
- New input: {input}
343
- ---
344
- {agent_scratchpad}
345
- """
346
- )
347
- '''
348
 
349
  prompt = PromptTemplate(
350
  input_variables=["input", "agent_scratchpad", "chat_history", "tool_names"],
@@ -526,18 +352,20 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
526
  full_question_for_agent += f"\n\nAttachment '{file_name}' available at EXACT URL: {attachment_url}"
527
  print(f"Running agent on task {task_id}: {full_question_for_agent}",flush=True)
528
 
529
- '''
530
  allowed_ids = {
531
- "7bd855d8-463d-4ed5-93ca-5fe35145f733",
532
  "cca530fc-4052-43b2-b130-b30968d8aa44",
533
  #"1f975693-876d-457b-a649-393859e79bf3",
534
  #"99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
535
  #"4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
536
  #"8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
 
 
537
  }
538
  if task_id not in allowed_ids:
539
  continue
540
- '''
541
 
542
  try:
543
  submitted_answer = agent(full_question_for_agent)
@@ -571,8 +399,17 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
571
  f"Message: {result_data.get('message', 'No message received.')}"
572
  )
573
  print("Submission successful.")
 
 
 
 
 
 
 
 
 
574
  results_df = pd.DataFrame(results_log)
575
- return final_status, results_df
576
  except requests.exceptions.HTTPError as e:
577
  error_detail = f"Server responded with status {e.response.status_code}."
578
  try:
 
4
  import inspect
5
  import pandas as pd
6
  import time
7
+ import re
8
  from langchain_google_genai import ChatGoogleGenerativeAI
9
  from langchain_community.tools import TavilySearchResults
10
  from langchain import hub # Used to pull predefined prompts from LangChain Hub
 
23
  from openai import OpenAI
24
 
25
  # tools imported from helper.py
26
+ from helper import repl_tool, get_travily_api_search_tool,audio_transcriber_tool,wikipedia_search_tool,file_saver_tool,wikipedia_full_content_tool,serpapi_Google_Search_tool,gemini_multimodal_tool
27
 
28
 
29
 
 
103
  return self.invoke_with_retry(question)
104
 
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
 
108
  def run_and_submit_all( profile: gr.OAuthProfile | None):
 
137
  print(f"Using OpenAI API key: {openai_api_key[:4]}... (truncated for security)")
138
 
139
 
140
+
 
 
 
 
 
141
  #NMODEL
142
  #'''
143
  llm_client = ChatGoogleGenerativeAI(
 
171
  # Pull a predefined prompt from LangChain Hub
172
  # "hwchase17/react-chat" is a prompt template designed for ReAct-style conversational agents.
173
  #prompt = hub.pull("hwchase17/react-chat")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
  prompt = PromptTemplate(
176
  input_variables=["input", "agent_scratchpad", "chat_history", "tool_names"],
 
352
  full_question_for_agent += f"\n\nAttachment '{file_name}' available at EXACT URL: {attachment_url}"
353
  print(f"Running agent on task {task_id}: {full_question_for_agent}",flush=True)
354
 
355
+
356
  allowed_ids = {
357
+ #"7bd855d8-463d-4ed5-93ca-5fe35145f733",
358
  "cca530fc-4052-43b2-b130-b30968d8aa44",
359
  #"1f975693-876d-457b-a649-393859e79bf3",
360
  #"99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
361
  #"4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
362
  #"8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
363
+ "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
364
+ "3f57289b-8c60-48be-bd80-01f8099ca449",
365
  }
366
  if task_id not in allowed_ids:
367
  continue
368
+
369
 
370
  try:
371
  submitted_answer = agent(full_question_for_agent)
 
399
  f"Message: {result_data.get('message', 'No message received.')}"
400
  )
401
  print("Submission successful.")
402
+ # Step 1: Remove common problematic characters (like null bytes, non-breaking spaces, etc.)
403
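+ # This regex removes every character that is not printable ASCII, a newline, a carriage return, or a tab.
+ # NOTE: this also drops legitimate non-ASCII text (e.g. accented letters, emoji), not only control characters.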
404
+ # \x20-\x7E covers space through tilde (~)
405
+ # \n\r\t covers newlines, carriage returns, and tabs
406
+ # You might need to adjust this regex based on what 'wonky chars' you specifically observe.
407
+ cleaned_final_status = re.sub(r'[^\x20-\x7E\n\r\t]+', '', final_status)
408
+
409
+ # Step 2: Strip leading/trailing whitespace (including newlines from formatting)
410
+ cleaned_final_status = cleaned_final_status.strip()
411
  results_df = pd.DataFrame(results_log)
412
+ return cleaned_final_status, results_df
413
  except requests.exceptions.HTTPError as e:
414
  error_detail = f"Server responded with status {e.response.status_code}."
415
  try: