mrtom17 commited on
Commit
ed9d2b5
·
verified ·
1 Parent(s): eb7c373

Upload 5 files

Browse files
Files changed (5) hide show
  1. README.md +47 -10
  2. agent.py +265 -0
  3. app_safe.py +217 -0
  4. requirements.txt +13 -0
  5. tools.py +280 -0
README.md CHANGED
@@ -1,12 +1,49 @@
1
- ---
2
- title: Gaia Agent
3
- emoji: 🏃
4
- colorFrom: purple
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 5.36.2
8
- app_file: app.py
9
- pinned: false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # GAIA Benchmark Agent
2
+
3
+ This project is an AI agent designed to tackle the GAIA benchmark, featuring multi-step reasoning, tool use (web search, Wikipedia, data analysis, file handling), and a Gradio web interface for evaluation and submission.
4
+
5
+ ## Features
6
+ - LangGraph-based agent with robust tool integration
7
+ - Wikipedia, Tavily (web search), data analysis, and file handling tools
8
+ - Automatic file download for file-based questions
9
+ - Gradio interface for user interaction and answer submission
10
+ - Error handling and graceful fallback for recursion/tool loops
11
+
12
+ ## Setup & Deployment
13
+
14
+ ### 1. Install Dependencies
15
+ ```
16
+ pip install -r requirements.txt
17
+ ```
18
+
19
+ ### 2. Environment Variables
20
+ Create a `.env` file (not committed) or set these variables in your Hugging Face Space:
21
+ - `OPENAI_API_KEY` (for OpenAI LLM and transcription)
22
+ - `TAVILY_API_KEY` (for Tavily web search)
23
+ - (Optional) `SPACE_ID` (for Hugging Face Space integration)
24
+
25
+ ### 3. Run Locally
26
+ ```
27
+ python app_safe.py
28
+ ```
29
+ Or launch the Gradio interface as your main app file.
30
+
31
+ ### 4. Deploy to Hugging Face Spaces
32
+ - Push your code to a public Hugging Face Space repository.
33
+ - Set your API keys as secrets in the Space settings.
34
+ - The Gradio app will launch automatically.
35
+
36
+ ## Project Structure
37
+ - `app_safe.py` — Main Gradio app for full agent evaluation
38
+ - `agent.py` — Agent logic and tool orchestration
39
+ - `tools.py` — Tool definitions (Tavily, Wikipedia, data analysis, etc.)
40
+ - `requirements.txt` — All dependencies
41
+ - `README.md` — This file
42
+
43
+ ## Notes
44
+ - The agent will return a fallback answer if it cannot answer within the recursion/tool call limits.
45
+ - For best results, ensure all environment variables are set and dependencies are installed.
46
+
47
  ---
48
 
49
+ **Good luck on the GAIA benchmark!**
agent.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # agent.py
2
+ import os
3
+ import logging
4
+ from typing import TypedDict, Annotated, Any
5
+ from langgraph.graph import StateGraph, END, START
6
+ from langgraph.graph.message import add_messages
7
+ from dotenv import load_dotenv
8
+ from langgraph.prebuilt import ToolNode
9
+ from langchain_openai import ChatOpenAI
10
+ from langchain_core.messages import AnyMessage, HumanMessage, AIMessage, ToolMessage, SystemMessage
11
+ from tools import TOOLS # Your tools list should be defined here
12
+ import requests
13
+ import re
14
+ import json
15
+
16
# --- Logging Setup ---
load_dotenv()  # pull OPENAI_API_KEY / TAVILY_API_KEY etc. from a local .env, if present

# Log file lives next to this module so it is found regardless of the CWD.
LOG_FILE = os.path.join(os.path.dirname(__file__), "agent.log")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        # mode="w": the log is truncated on every run, keeping only the latest session.
        logging.FileHandler(LOG_FILE, mode="w", encoding="utf-8"),
    ],
)
logger = logging.getLogger("agent_logger")
28
+
29
# --- Token Counting Helper ---
def count_tokens(messages):
    """Approximate the total token count of a list of chat messages.

    Uses tiktoken's gpt-3.5-turbo encoding as a rough estimate (the agent
    itself may run a different model). Returns -1 when tiktoken is not
    installed or any counting error occurs.
    """
    try:
        import tiktoken
        encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
        return sum(
            len(encoder.encode(str(message.content)))
            for message in messages
            if hasattr(message, "content") and message.content
        )
    except ImportError:
        logger.warning("tiktoken not installed, skipping token count.")
        return -1
    except Exception as e:
        logger.warning(f"Token counting error: {e}")
        return -1
45
+
46
# LLM definition using GPT‑o3
# System prompt: forces bare, minimal answers (GAIA scoring is exact-match)
# and pushes the model to attempt a tool call for any file/audio/image question.
system_prompt = (
    "You are a helpful assistant. When answering, output ONLY the answer to the question, with no extra text, explanation, or formatting. "
    "If you call a tool and receive its output, use the tool output as the main source for your answer. "
    "You may analyze, summarize, or combine tool outputs if needed to answer the question, but do not ignore tool outputs or say you cannot access files or images. "
    "Do not include phrases like 'Final answer', 'The answer is', or any commentary. Output only the answer string. "
    "If a question involves a file, audio, or image, use the appropriate tool to access or process the file. Do not say you cannot access files—always attempt a tool call first. "
    "When you output your answer, use the least possible amount of words. If a single word or number suffices, output only that."
)
chat = ChatOpenAI(
    model="o3",  # GPT‑o3 model
    temperature=1,  # NOTE(review): o-series models appear to require the default temperature — confirm
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)

# Bind tools with the LLM so it can emit structured tool calls for TOOLS.
chat_with_tools = chat.bind_tools(TOOLS)
63
+
64
# Agent state: tracks conversation history
class AgentState(TypedDict):
    # Full message history; the `add_messages` reducer appends new messages
    # to the existing list on each graph step instead of overwriting it.
    messages: Annotated[list[AnyMessage], add_messages]
67
+
68
# Assistant node: single chat invocation
def assistant(state: AgentState) -> dict[str, list[AnyMessage]]:
    """LangGraph node: run a single LLM turn over the current history."""
    logger.info("[Agent] Thinking...")
    history = state["messages"]
    logger.info(f"[Agent] Messages so far: {[str(m) for m in history]}")
    reply = chat_with_tools.invoke(history)
    logger.info(f"[Agent] LLM response: {reply.content}")
    tool_requests = getattr(reply, "tool_calls", None)
    if tool_requests:
        logger.info(f"[Agent] Tool calls: {tool_requests}")
    # The add_messages reducer appends this reply to the state's history.
    return {"messages": [reply]}
77
+
78
# Condition: check if the assistant wants to use a tool again
def needs_tool(state: AgentState) -> str:
    """Routing condition: 'tools' when the last message carries tool calls, else 'end'."""
    last_message = state["messages"][-1]
    return "tools" if getattr(last_message, "tool_calls", None) else "end"
86
+
87
# Build the graph
def build_langgraph():
    """Compile the two-node agent graph (assistant <-> tools loop).

    The assistant node runs the LLM; whenever it emits tool calls the graph
    routes to the ToolNode, whose outputs feed straight back into the
    assistant. The loop terminates when the LLM replies without tool calls.
    """
    graph = StateGraph(AgentState)
    graph.add_node("assistant", assistant)
    graph.add_node("tools", ToolNode(TOOLS))
    graph.set_entry_point("assistant")
    graph.add_conditional_edges("assistant", needs_tool, {"tools": "tools", "end": END})
    graph.add_edge("tools", "assistant")
    return graph.compile()
100
+
101
# High-level solve function with logging and token counting
def solve(question: str) -> str:
    """Answer a single question with the LangGraph agent.

    Runs the compiled graph until the LLM stops requesting tools, while
    enforcing two safety valves:
      * a cap on google_search_tool invocations per question, and
      * a "give up" threshold for repeated identical tool calls.
    Returns the LLM's final answer string, or a fallback sentence when a
    limit is hit or the graph's recursion limit is exceeded.
    """
    logger.info(f"[User] {question}")
    graph = build_langgraph()
    state = {"messages": [SystemMessage(content=system_prompt), HumanMessage(content=question)]}
    step = 0
    all_messages = list(state["messages"])  # running transcript for token accounting
    # --- Track google_search_tool calls per question ---
    # NOTE(review): no tool named "google_search_tool" is visible in tools.py;
    # this counter only takes effect if such a tool is registered — verify.
    google_search_calls = 0
    MAX_GOOGLE_SEARCH_CALLS = 10
    # --- Track repeated tool calls for 'give up' condition ---
    tool_call_counts = {}
    GIVE_UP_THRESHOLD = 5
    fallback_answer = "Unable to determine from available data."
    recursion_fallback = "Unable to find the answer with the given data."
    try:
        while True:
            step += 1
            logger.info(f"--- Step {step} ---")
            # Run one step of the graph with recursion_limit set to 25.
            # NOTE(review): graph.invoke runs the graph to completion, so this
            # outer loop normally executes once; it only repeats if the final
            # message still carries tool calls — confirm intended.
            result = graph.invoke(state, {"recursion_limit": 25})
            # Only the messages produced by this invocation.
            new_msgs = result["messages"][len(state["messages"]):]
            for msg in new_msgs:
                if isinstance(msg, AIMessage):
                    logger.info(f"[Agent] {msg.content}")
                elif isinstance(msg, ToolMessage):
                    logger.info(f"[ToolMessage] {msg.content}")
                # Intercept tool calls and block google_search_tool after limit
                if hasattr(msg, "tool_call_id") and hasattr(msg, "name") and msg.name == "google_search_tool":
                    google_search_calls += 1
                    if google_search_calls > MAX_GOOGLE_SEARCH_CALLS:
                        # Replace tool output with refusal message
                        refusal = ToolMessage(
                            content="Google search tool call refused: limit of 10 calls per question reached.",
                            tool_call_id=msg.tool_call_id
                        )
                        result["messages"][result["messages"].index(msg)] = refusal
                        logger.info("[ToolMessage] Google search tool call refused: limit reached.")
                # --- Improved give up logic: track by tool name and arguments/query ---
                # NOTE(review): ToolMessages generally do not carry
                # additional_kwargs["tool_calls"] (that lives on the AIMessage
                # that requested the call), so tool_args is usually "" and the
                # counter effectively keys on tool name alone — verify.
                if hasattr(msg, "name") and hasattr(msg, "tool_call_id"):
                    tool_args = ""
                    if hasattr(msg, "additional_kwargs") and msg.additional_kwargs and "tool_calls" in msg.additional_kwargs:
                        tool_calls = msg.additional_kwargs["tool_calls"]
                        if tool_calls and isinstance(tool_calls, list):
                            # Get the first tool call's arguments (as string)
                            tool_args = tool_calls[0].get("function", {}).get("arguments", "")
                    tool_key = (msg.name, tool_args.strip().lower())
                    tool_call_counts[tool_key] = tool_call_counts.get(tool_key, 0) + 1
                    if tool_call_counts[tool_key] > GIVE_UP_THRESHOLD:
                        logger.info(f"[Agent] Give up condition met for tool {msg.name} with similar arguments: {tool_args}")
                        return fallback_answer
            all_messages.extend(new_msgs)
            state["messages"] = result["messages"]
            # Check if done: a final message without tool calls ends the loop.
            if not getattr(state["messages"][-1], "tool_calls", None):
                break
        logger.info(f"[Agent] Final answer: {state['messages'][-1].content}")
        token_count = count_tokens(all_messages)
        if token_count >= 0:  # -1 means tiktoken unavailable / counting failed
            logger.info(f"[Stats] Total tokens used: {token_count}")
        return state["messages"][-1].content
    except Exception as e:
        # Catch GraphRecursionError and return a fallback answer;
        # anything else is logged and re-raised for the caller.
        import langgraph.errors
        if isinstance(e, langgraph.errors.GraphRecursionError):
            logger.info("[Agent] Recursion limit reached, returning fallback answer.")
            return recursion_fallback
        else:
            logger.error(f"[Agent] Unexpected error: {e}")
            raise
171
+
172
def download_file(url, dest_path):
    """Download `url` to `dest_path`, streaming in 8 KiB chunks.

    Raises requests.HTTPError (via raise_for_status) on a non-2xx response.
    """
    # Use the response as a context manager so the underlying connection is
    # released even if an error occurs mid-stream (the original leaked the
    # streamed connection on exceptions).
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(dest_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    print(f"Downloaded {url} to {dest_path}")
179
+
180
# Example usage with logging

def _run_example(title, question, print_label=None):
    """Shared driver for the demo examples.

    Logs a banner with `title`, solves `question`, logs the result, and —
    when `print_label` is given — also echoes the Q/A pair to stdout with
    that label (matching the '*_real_*' examples' behaviour).
    """
    logger.info("\n" + "-"*20 + f" Running {title} " + "-"*20)
    answer = solve(question)
    logger.info(f"[Result] Q: {question}\nA: {answer}")
    logger.info("\n" + "-"*50 + "\n")
    if print_label is not None:
        print(f"[{print_label}] Q: {question}\nA: {answer}")

def web_search_example():
    _run_example("Web Search Example", "Tell me about the recent injury of Jamal Musiala")

def audio_transcription_example():
    _run_example("Audio Transcription Example", "Transcribe the audio in the file 'sample_audio.wav'.")

def image_captioning_example():
    _run_example("Image Captioning Example", "Describe the image in the file 'sample_image.jpg'.")

def python_file_reader_example():
    _run_example("Python File Reader Example", "Read the first 10 lines of the file 'project/agent.py'.")

def image_captioning_real_example():
    file_path = 'project/sample_image.jpg'
    if not os.path.exists(file_path):
        print(f"Test image file '{file_path}' not found. Please add a real image file to the project directory.")
        return
    _run_example("Image Captioning Real Example",
                 f"Describe the image in the file '{file_path}'.",
                 print_label="Image Captioning Real Example")

def python_file_reader_real_example():
    file_path = 'project/agent.py'
    _run_example("Python File Reader Real Example",
                 f"Read the first 10 lines of the file '{file_path}'.",
                 print_label="Python File Reader Real Example")

def python_file_execution_example():
    file_path = 'project/exercise.py'
    _run_example("Python File Execution Example",
                 f"What is the output of running the file '{file_path}'?",
                 print_label="Python File Execution Example")

def audio_transcription_real_example():
    file_path = 'project/sample_audio.wav'
    if not os.path.exists(file_path):
        print(f"Test audio file '{file_path}' not found. Please add a real audio file to the project directory.")
        return
    _run_example("Audio Transcription Real Example",
                 f"Transcribe the audio in the file '{file_path}'.",
                 print_label="Audio Transcription Real Example")

def react_single_word_example():
    _run_example("ReAct Single Word Example",
                 "What is the capital of France?",
                 print_label="ReAct Single Word Example")


if __name__ == "__main__":
    web_search_example()
app_safe.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import requests
4
+ import inspect
5
+ import pandas as pd
6
+ from agent import solve, download_file
7
+
8
+ # (Keep Constants as is)
9
+ # --- Constants ---
10
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
11
+
12
# --- Agent Wrapper ---
class LangGraphAgent:
    """Thin callable wrapper around `solve` that never raises.

    Any exception from the underlying agent is converted into an
    'AGENT ERROR: ...' string so the evaluation loop can keep going.
    """

    def __init__(self):
        print("LangGraphAgent initialized.")

    def __call__(self, question: str) -> str:
        preview = question[:50]
        print(f"LangGraphAgent received question (first 50 chars): {preview}...")
        try:
            result = solve(question)
        except Exception as exc:
            print(f"LangGraphAgent error: {exc}")
            result = f"AGENT ERROR: {exc}"
        print(f"LangGraphAgent returning answer: {result}")
        return result
25
+
26
def run_and_submit_all( profile: gr.OAuthProfile | None):
    """
    Fetches all questions, runs the LangGraphAgent on them, submits all answers,
    and displays the results.

    Returns a (status_text, results_dataframe_or_None) pair for the Gradio UI.
    """
    # --- Determine HF Space Runtime URL and Repo URL ---
    space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code

    # Gradio injects the OAuth profile when the user is logged in; bail out otherwise.
    if profile:
        username= f"{profile.username}"
        print(f"User logged in: {username}")
    else:
        print("User not logged in.")
        return "Please Login to Hugging Face with the button.", None

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # 1. Instantiate Agent ( modify this part to create your agent)
    try:
        agent = LangGraphAgent()
    except Exception as e:
        print(f"Error instantiating agent: {e}")
        return f"Error initializing agent: {e}", None
    # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    print(agent_code)

    # 2. Fetch Questions
    print(f"Fetching questions from: {questions_url}")
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            print("Fetched questions list is empty.")
            return "Fetched questions list is empty or invalid format.", None
        print(f"Fetched {len(questions_data)} questions.")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching questions: {e}")
        return f"Error fetching questions: {e}", None
    except requests.exceptions.JSONDecodeError as e:
        print(f"Error decoding JSON response from questions endpoint: {e}")
        print(f"Response text: {response.text[:500]}")
        return f"Error decoding server response for questions: {e}", None
    except Exception as e:
        print(f"An unexpected error occurred fetching questions: {e}")
        return f"An unexpected error occurred fetching questions: {e}", None

    # 3. Run your Agent
    results_log = []
    answers_payload = []
    print(f"Running agent on {len(questions_data)} questions...")
    answered_count = 0
    total_questions = len(questions_data)
    for item in questions_data:
        task_id = item.get("task_id")
        question_text = item.get("question")
        file_name = item.get("file_name")
        # --- File Handling: Download using /files/{task_id} endpoint if file_name is present ---
        if file_name:
            local_path = os.path.join(".", file_name)
            if not os.path.exists(local_path):
                file_api_url = f"{api_url}/files/{task_id}"
                print(f"Downloading file for task {task_id}: {file_api_url} -> {local_path}")
                try:
                    # Download failures are logged but not fatal: the agent is
                    # still asked the question (it may fail gracefully itself).
                    download_file(file_api_url, local_path)
                except Exception as e:
                    print(f"Failed to download file for task {task_id}: {e}")
            else:
                print(f"File already exists locally: {local_path}")
            # Append file name to the question prompt so the agent knows which
            # local file to open with its file/audio/image tools.
            question_text = f"{question_text} (File: {file_name})"
        if not task_id or question_text is None:
            print(f"Skipping item with missing task_id or question: {item}")
            continue
        try:
            submitted_answer = agent(question_text)
            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
        except Exception as e:
            # Recorded in the results table but NOT submitted for scoring.
            print(f"Error running agent on task {task_id}: {e}")
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
        answered_count += 1
        print(f"Answered {answered_count}/{total_questions} questions...")

    if not answers_payload:
        print("Agent did not produce any answers to submit.")
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    # 4. Prepare Submission
    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
    print(status_update)

    # 5. Submit
    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
    try:
        response = requests.post(submit_url, json=submission_data, timeout=60)
        response.raise_for_status()
        result_data = response.json()
        final_status = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        print("Submission successful.")
        results_df = pd.DataFrame(results_log)
        return final_status, results_df
    except requests.exceptions.HTTPError as e:
        error_detail = f"Server responded with status {e.response.status_code}."
        try:
            error_json = e.response.json()
            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
        except requests.exceptions.JSONDecodeError:
            error_detail += f" Response: {e.response.text[:500]}"
        status_message = f"Submission Failed: {error_detail}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
    except requests.exceptions.Timeout:
        status_message = "Submission Failed: The request timed out."
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
    except requests.exceptions.RequestException as e:
        status_message = f"Submission Failed: Network error - {e}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
    except Exception as e:
        status_message = f"An unexpected error occurred during submission: {e}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
164
+
165
+
166
# --- Build Gradio Interface using Blocks ---
with gr.Blocks() as demo:
    gr.Markdown("# Basic Agent Evaluation Runner")
    gr.Markdown(
        """
        **Instructions:**
        1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
        2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
        3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
        ---
        **Disclaimers:**
        Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
        This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
        """
    )

    # Login is required: Gradio injects the resulting OAuthProfile into
    # run_and_submit_all's `profile` parameter automatically.
    gr.LoginButton()

    run_button = gr.Button("Run Evaluation & Submit All Answers")

    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
    # Removed max_rows=10 from DataFrame constructor
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

    run_button.click(
        fn=run_and_submit_all,
        outputs=[status_output, results_table]
    )
194
+
195
if __name__ == "__main__":
    print("\n" + "-"*30 + " App Starting " + "-"*30)
    # Check for SPACE_HOST and SPACE_ID at startup for information
    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup

    if space_host_startup:
        print(f"✅ SPACE_HOST found: {space_host_startup}")
        print(f"   Runtime URL should be: https://{space_host_startup}.hf.space")
    else:
        print("ℹ️  SPACE_HOST environment variable not found (running locally?).")

    if space_id_startup: # Print repo URLs if SPACE_ID is found
        print(f"✅ SPACE_ID found: {space_id_startup}")
        print(f"   Repo URL: https://huggingface.co/spaces/{space_id_startup}")
        print(f"   Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
    else:
        print("ℹ️  SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")

    print("-"*(60 + len(" App Starting ")) + "\n")

    print("Launching Gradio Interface for Basic Agent Evaluation...")
    # debug=True surfaces tracebacks in the UI; share=False since Spaces
    # (or localhost) already provides the public URL.
    demo.launch(debug=True, share=False)
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ openai
3
+ tavily-python
4
+ pandas
5
+ requests
6
+ python-dotenv
7
+ langgraph
8
+ langchain
+ langchain-openai
+ tiktoken
9
+ wikipedia
10
+ sumy
11
+ transformers
12
+ torch
13
+ Pillow
tools.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+ from typing import Optional
3
+ import math
4
+ import requests
5
+ from langchain_core.tools import tool
6
+ import os
7
+
8
# --- Calculator Tool ---
class CalculatorInput(BaseModel):
    expression: str = Field(..., description="A mathematical expression to evaluate, e.g. '2 + 2 * 3'.")

@tool(args_schema=CalculatorInput, return_direct=True)
def calculator_tool(expression: str) -> str:
    """Evaluate a mathematical expression, e.g. '2 + 2 * 3'."""
    # SECURITY: eval on model-supplied text is dangerous even with
    # __builtins__ stripped (attribute-chain escapes exist). A production
    # deployment should use an AST-based arithmetic parser instead.
    try:
        return str(eval(expression, {"__builtins__": None, "math": math}, {}))
    except Exception as e:
        return f"Error: {e}"
21
+
22
# --- Wikipedia Search Tool ---
class WikipediaSearchInput(BaseModel):
    query: str = Field(..., description="The search query for Wikipedia.")
    sentences: Optional[int] = Field(3, description="Number of sentences to return from the summary.")

# We'll use the wikipedia library for this tool
# The dependency is optional: the tool degrades to an install hint if missing.
try:
    import wikipedia
except ImportError:
    wikipedia = None

@tool(args_schema=WikipediaSearchInput, return_direct=True)
def wikipedia_search_tool(query: str, sentences: int = 3) -> str:
    """Search Wikipedia for a summary of a topic."""
    if wikipedia is None:
        return "Wikipedia library not installed. Please install it with 'pip install wikipedia'."
    try:
        return wikipedia.summary(query, sentences=sentences)
    except Exception as e:
        return f"Wikipedia search error: {e}"
43
+
44
# --- Python Interpreter Tool ---
class PythonInterpreterInput(BaseModel):
    code: str = Field(..., description="Python code to execute. Should print or return the answer.")

@tool(args_schema=PythonInterpreterInput, return_direct=True)
def python_interpreter_tool(code: str) -> str:
    """Execute Python code and return the result. Use variable 'result' or print output."""
    import contextlib
    import io

    # SECURITY: exec of model-generated code is not sandboxed — stripping
    # __builtins__ is not a real isolation boundary. Use only in trusted envs.
    namespace = {}
    captured = io.StringIO()
    try:
        with contextlib.redirect_stdout(captured):
            exec(code, {"__builtins__": {}}, namespace)
        # Prefer an explicit 'result' variable; otherwise fall back to stdout.
        if 'result' in namespace:
            return str(namespace['result'])
        printed = captured.getvalue().strip()
        return printed if printed else "(No output)"
    except Exception as e:
        return f"Python execution error: {e}"
65
+
66
# --- Unit Conversion Tool ---
class UnitConversionInput(BaseModel):
    value: float = Field(..., description="The numeric value to convert.")
    from_unit: str = Field(..., description="The unit to convert from, e.g. 'meters'.")
    to_unit: str = Field(..., description="The unit to convert to, e.g. 'feet'.")

# Simple conversion table for demonstration.
# Values are either multiplicative factors or callables (for affine
# conversions such as temperature).
CONVERSION_FACTORS = {
    ("meters", "feet"): 3.28084,
    ("feet", "meters"): 0.3048,
    ("kilograms", "pounds"): 2.20462,
    ("pounds", "kilograms"): 0.453592,
    ("celsius", "fahrenheit"): lambda c: c * 9/5 + 32,
    ("fahrenheit", "celsius"): lambda f: (f - 32) * 5/9,
}

@tool(args_schema=UnitConversionInput, return_direct=True)
def unit_conversion_tool(value: float, from_unit: str, to_unit: str) -> str:
    """Convert between units (e.g., meters to feet, celsius to fahrenheit)."""
    pair = (from_unit.lower(), to_unit.lower())
    try:
        conversion = CONVERSION_FACTORS[pair]
        converted = conversion(value) if callable(conversion) else value * conversion
        return f"{value} {from_unit} = {converted} {to_unit}"
    except Exception:
        return f"Conversion from {from_unit} to {to_unit} not supported."
95
+
96
# --- Date/Time Calculation Tool ---
from datetime import datetime, timedelta

class DateTimeCalcInput(BaseModel):
    base_date: str = Field(..., description="The starting date in YYYY-MM-DD format. If blank, use today.")
    delta_days: int = Field(..., description="Number of days to add (positive) or subtract (negative).")

@tool(args_schema=DateTimeCalcInput, return_direct=True)
def date_time_calc_tool(base_date: str, delta_days: int) -> str:
    """Add or subtract days from a date (YYYY-MM-DD)."""
    try:
        # An empty base_date means "start from today".
        if base_date:
            start = datetime.strptime(base_date, "%Y-%m-%d")
        else:
            start = datetime.now()
        return (start + timedelta(days=delta_days)).strftime("%Y-%m-%d")
    except Exception as e:
        return f"Date calculation error: {e}"
111
+
112
# --- Text Summarization Tool ---
class SummarizationInput(BaseModel):
    text: str = Field(..., description="Text to summarize.")
    max_sentences: int = Field(3, description="Maximum number of sentences in the summary.")

# sumy is an optional dependency; the tool degrades to an install hint.
try:
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lsa import LsaSummarizer
except ImportError:
    PlaintextParser = Tokenizer = LsaSummarizer = None

@tool(args_schema=SummarizationInput, return_direct=True)
def summarization_tool(text: str, max_sentences: int = 3) -> str:
    """Summarize a long text into a few sentences."""
    if not (PlaintextParser and Tokenizer and LsaSummarizer):
        return "Summarization library not installed. Please install it with 'pip install sumy'."
    try:
        document = PlaintextParser.from_string(text, Tokenizer("english")).document
        selected = LsaSummarizer()(document, max_sentences)
        return " ".join(str(sentence) for sentence in selected)
    except Exception as e:
        return f"Summarization error: {e}"
136
+
137
# --- Tavily Search Tool ---
try:
    from tavily import TavilyClient
except ImportError:
    TavilyClient = None

class TavilySearchInput(BaseModel):
    query: str = Field(..., description="The search query to look up on the web.")
    num_results: int = Field(3, description="Number of results to return.")

@tool(args_schema=TavilySearchInput, return_direct=True)
def tavily_search_tool(query: str, num_results: int = 3) -> str:
    """Search the web for up-to-date information using Tavily API (official client).

    Returns Tavily's synthesized answer when available, otherwise the text of
    the top results, otherwise the raw response as a string.
    """
    api_key = os.getenv("TAVILY_API_KEY")
    if not api_key:
        return "Tavily API key not set. Please set TAVILY_API_KEY in your environment."
    if TavilyClient is None:
        return "Tavily Python client not installed. Please install it with 'pip install tavily'."
    try:
        tavily_client = TavilyClient(api_key=api_key)
        # include_answer=True asks Tavily to synthesize a short answer; without
        # it the 'answer' field checked below is never populated.
        response = tavily_client.search(query, max_results=num_results, include_answer=True)
        # response is a dict; try to return the 'answer' or the full response
        if isinstance(response, dict):
            if response.get("answer"):
                return response["answer"]
            elif response.get("results"):
                # BUGFIX: Tavily result items carry their text under the
                # 'content' key — the old code read a nonexistent 'snippet'
                # key, so every result came back as an empty string.
                snippets = [r.get("content") or r.get("snippet", "")
                            for r in response["results"][:num_results]]
                snippets = [s for s in snippets if s]
                return "\n".join(snippets) if snippets else str(response)
            else:
                return str(response)
        else:
            return str(response)
    except Exception as e:
        return f"Tavily search error: {e}"
171
+
172
# --- Audio Transcription Tool ---
class AudioTranscriptionInput(BaseModel):
    # Argument schema for audio_transcription_tool (consumed by the @tool decorator).
    file_path: str = Field(..., description="Path to the audio file to transcribe.")
@tool(args_schema=AudioTranscriptionInput, return_direct=True)
def audio_transcription_tool(file_path: str) -> str:
    """Transcribe an audio file using OpenAI's new API (>=1.0.0, gpt-4o-transcribe).

    Returns the transcript text, or an error message string on failure
    (this tool never raises).
    """
    import os
    # Fail fast with clear messages instead of letting the OpenAI client
    # raise opaque errors for a missing file or missing credentials.
    if not os.path.exists(file_path):
        return f"Audio transcription error: file not found: {file_path}"
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        return "OpenAI API key not set. Please set OPENAI_API_KEY in your environment."
    try:
        import openai
        client = openai.OpenAI(api_key=api_key)
        with open(file_path, "rb") as audio_file:
            # response_format="text" makes the API return a plain string.
            transcript = client.audio.transcriptions.create(
                file=audio_file,
                model="gpt-4o-transcribe",
                response_format="text",
            )
        return transcript
    except Exception as e:
        return f"Audio transcription error: {e}"
# --- Image Captioning Tool ---
class ImageCaptioningInput(BaseModel):
    # Argument schema for image_captioning_tool (consumed by the @tool decorator).
    file_path: str = Field(..., description="Path to the image file to caption.")
# Module-level cache so the heavyweight BLIP weights are loaded only once
# per process instead of on every tool invocation.
_BLIP_CACHE: dict = {}

def _load_blip():
    """Return a cached (processor, model) pair, loading BLIP on first use."""
    if "blip" not in _BLIP_CACHE:
        from transformers import BlipProcessor, BlipForConditionalGeneration
        processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
        model.eval()  # inference only; disables dropout etc.
        _BLIP_CACHE["blip"] = (processor, model)
    return _BLIP_CACHE["blip"]

@tool(args_schema=ImageCaptioningInput, return_direct=True)
def image_captioning_tool(file_path: str) -> str:
    """Generate a caption for an image using BLIP from transformers (requires transformers and torch).

    Returns the caption text, or an error message string on failure
    (this tool never raises).
    """
    try:
        from PIL import Image
        import torch
        processor, model = _load_blip()
        image = Image.open(file_path).convert("RGB")
        inputs = processor(image, return_tensors="pt")
        with torch.no_grad():
            out = model.generate(**inputs)
        caption = processor.decode(out[0], skip_special_tokens=True)
        return caption
    except Exception as e:
        return f"Image captioning error: {e}"
# --- Python File Reader Tool ---
class PythonFileReaderInput(BaseModel):
    # Argument schema for python_file_reader_tool (consumed by the @tool decorator).
    file_path: str = Field(..., description="Path to the Python file to read.")
    max_lines: Optional[int] = Field(None, description="Maximum number of lines to read from the file.")
@tool(args_schema=PythonFileReaderInput, return_direct=True)
def python_file_reader_tool(file_path: str, max_lines: Optional[int] = None) -> str:
    """Read and return the content of a Python file (optionally limited to max_lines).

    Returns the file contents (or its first max_lines lines), or an error
    message string on failure (this tool never raises).
    """
    from itertools import islice
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            if max_lines is None:
                return f.read()
            # islice stops cleanly at EOF; the previous next()-based loop
            # raised StopIteration for files shorter than max_lines, turning
            # a valid short file into an error return.
            return "".join(islice(f, max_lines))
    except Exception as e:
        return f"Python file read error: {e}"
# --- Data Analysis Tool ---
class DataAnalysisInput(BaseModel):
    # Argument schema for data_analysis_tool (consumed by the @tool decorator).
    file_path: str = Field(..., description="Path to the Excel or CSV file to analyze.")
    instruction: str = Field(..., description="Analysis instruction, e.g. 'summary', 'head', 'describe', or a column name.")
@tool(args_schema=DataAnalysisInput, return_direct=True)
def data_analysis_tool(file_path: str, instruction: str) -> str:
    """Analyze an Excel or CSV file using pandas. Instruction can be 'summary', 'head', 'describe', or a column name.

    Returns the requested view of the data as text, or an error message
    string on failure (this tool never raises).
    """
    import io
    import os
    import pandas as pd
    try:
        if not os.path.exists(file_path):
            return f"File not found: {file_path}"
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif file_path.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(file_path)
        else:
            return "Unsupported file type. Only .csv, .xlsx, and .xls are supported."
        instruction_lower = instruction.strip().lower()
        if instruction_lower == 'summary':
            # DataFrame.info() prints to stdout and returns None, so the old
            # str(df.info()) returned the literal string "None"; capture the
            # report into a buffer instead.
            buf = io.StringIO()
            df.info(buf=buf)
            return buf.getvalue()
        elif instruction_lower == 'head':
            return df.head().to_string()
        elif instruction_lower == 'describe':
            return df.describe().to_string()
        elif instruction in df.columns:
            return df[instruction].to_string()
        else:
            return f"Unknown instruction or column: {instruction}"
    except Exception as e:
        return f"Data analysis error: {e}"
# --- Tool List for LangGraph/LangChain ---
# All tools exposed to the agent; the model selects among these by tool
# name and argument schema during tool-calling.
TOOLS = [
    calculator_tool,
    tavily_search_tool,
    wikipedia_search_tool,
    python_interpreter_tool,
    unit_conversion_tool,
    date_time_calc_tool,
    summarization_tool,
    audio_transcription_tool,
    image_captioning_tool,
    python_file_reader_tool,
    data_analysis_tool,
]