Update app.py
Browse files
app.py
CHANGED
|
@@ -26,18 +26,47 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
|
| 26 |
|
| 27 |
@tool
|
| 28 |
def web_search(query: str) -> str:
|
| 29 |
-
"""Search the web using DuckDuckGo
|
| 30 |
try:
|
| 31 |
return DuckDuckGoSearchRun().run(query)
|
| 32 |
except Exception as e:
|
| 33 |
return f"Search error: {e}"
|
| 34 |
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
@tool
|
| 37 |
def wikipedia_search(query: str) -> str:
|
| 38 |
-
"""Search Wikipedia for encyclopedic knowledge
|
| 39 |
try:
|
| 40 |
-
wiki = WikipediaAPIWrapper(top_k_results=
|
| 41 |
return wiki.run(query)
|
| 42 |
except Exception as e:
|
| 43 |
return f"Wikipedia error: {e}"
|
|
@@ -46,9 +75,9 @@ def wikipedia_search(query: str) -> str:
|
|
| 46 |
@tool
|
| 47 |
def python_repl(code: str) -> str:
|
| 48 |
"""
|
| 49 |
-
Execute Python code for
|
| 50 |
-
Always use print() to
|
| 51 |
-
|
| 52 |
"""
|
| 53 |
import io, sys
|
| 54 |
old_stdout = sys.stdout
|
|
@@ -67,8 +96,8 @@ def python_repl(code: str) -> str:
|
|
| 67 |
@tool
|
| 68 |
def calculator(expression: str) -> str:
|
| 69 |
"""
|
| 70 |
-
Evaluate a simple
|
| 71 |
-
Examples: '2 + 2', '100 * 1.07 ** 5', 'math.sqrt(144)'
|
| 72 |
"""
|
| 73 |
try:
|
| 74 |
return str(eval(expression, {"math": math, "__builtins__": {}}))
|
|
@@ -79,8 +108,9 @@ def calculator(expression: str) -> str:
|
|
| 79 |
@tool
|
| 80 |
def get_task_file(task_id: str) -> str:
|
| 81 |
"""
|
| 82 |
-
Fetch the file attached to a GAIA task by its task_id.
|
| 83 |
-
Use this when the question mentions an attached file or
|
|
|
|
| 84 |
"""
|
| 85 |
try:
|
| 86 |
import requests as req
|
|
@@ -100,23 +130,44 @@ def get_task_file(task_id: str) -> str:
|
|
| 100 |
class AgentState(TypedDict):
|
| 101 |
messages: Annotated[list[AnyMessage], add_messages]
|
| 102 |
|
| 103 |
-
SYSTEM_PROMPT = """You are a
|
| 104 |
-
|
| 105 |
-
##
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
-
|
| 109 |
-
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
"""
|
| 121 |
|
| 122 |
def _tool_to_openai_schema(t) -> dict:
|
|
@@ -146,11 +197,16 @@ class BasicAgent:
|
|
| 146 |
|
| 147 |
# Mappa nome → funzione tool per esecuzione
|
| 148 |
self.tools_by_name = {t.name: t for t in self.tools_list}
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
# Schema OpenAI dei tool per passarli al client
|
| 156 |
self.tools_schema = [_tool_to_openai_schema(t) for t in self.tools_list]
|
|
@@ -204,12 +260,12 @@ class BasicAgent:
|
|
| 204 |
hf_messages = self._messages_to_hf_format([sys_msg] + state["messages"])
|
| 205 |
|
| 206 |
response = self.client.chat_completion(
|
| 207 |
-
model="Qwen/Qwen2.5-
|
| 208 |
messages=hf_messages,
|
| 209 |
tools=self.tools_schema,
|
| 210 |
tool_choice="auto",
|
| 211 |
-
max_tokens=
|
| 212 |
-
temperature=0,
|
| 213 |
)
|
| 214 |
|
| 215 |
choice = response.choices[0].message
|
|
@@ -230,25 +286,134 @@ class BasicAgent:
|
|
| 230 |
tool_calls=tool_calls,
|
| 231 |
)
|
| 232 |
return {"messages": [ai_message]}
|
| 233 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
def __call__(self, question: str) -> str:
|
| 235 |
-
print(f"Agent received question (first
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
try:
|
| 237 |
-
result = self.graph.invoke({
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
# Estrai FINAL ANSWER se presente, altrimenti ultima riga
|
| 244 |
-
match = re.search(r"FINAL ANSWER:\s*(.+?)(?:\n|$)", last_message, re.IGNORECASE)
|
| 245 |
-
answer = match.group(1).strip() if match else last_message.strip().split("\n")[-1]
|
| 246 |
-
|
| 247 |
-
print(f"Agent returning answer: {answer}")
|
| 248 |
-
return answer
|
| 249 |
except Exception as e:
|
| 250 |
-
|
| 251 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
|
| 253 |
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
| 254 |
"""
|
|
@@ -371,42 +536,44 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 371 |
return status_message, results_df
|
| 372 |
|
| 373 |
|
| 374 |
-
|
| 375 |
-
with gr.Blocks() as demo:
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
|
| 392 |
-
|
| 393 |
|
| 394 |
-
|
| 395 |
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
| 399 |
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
|
|
|
|
|
|
| 404 |
|
| 405 |
if __name__ == "__main__":
|
| 406 |
print("\n" + "-"*30 + " App Starting " + "-"*30)
|
|
|
|
| 407 |
# Check for SPACE_HOST and SPACE_ID at startup for information
|
| 408 |
space_host_startup = os.getenv("SPACE_HOST")
|
| 409 |
-
space_id_startup = os.getenv("SPACE_ID")
|
| 410 |
|
| 411 |
if space_host_startup:
|
| 412 |
print(f"✅ SPACE_HOST found: {space_host_startup}")
|
|
@@ -414,7 +581,7 @@ if __name__ == "__main__":
|
|
| 414 |
else:
|
| 415 |
print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
|
| 416 |
|
| 417 |
-
if space_id_startup:
|
| 418 |
print(f"✅ SPACE_ID found: {space_id_startup}")
|
| 419 |
print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
|
| 420 |
print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
|
|
@@ -424,4 +591,5 @@ if __name__ == "__main__":
|
|
| 424 |
print("-"*(60 + len(" App Starting ")) + "\n")
|
| 425 |
|
| 426 |
print("Launching Gradio Interface for Basic Agent Evaluation...")
|
| 427 |
-
demo
|
|
|
|
|
|
| 26 |
|
| 27 |
@tool
def web_search(query: str) -> str:
    """Search the web using DuckDuckGo for current facts, news, specific data, recent information, and verification. Returns top search results."""
    # Docstring above doubles as the tool description shown to the model.
    try:
        results = DuckDuckGoSearchRun().run(query)
    except Exception as e:
        # Best-effort tool: report the failure as text instead of raising.
        return f"Search error: {e}"
    return results
|
| 34 |
|
| 35 |
|
| 36 |
+
def _wikipedia_api_query(title: str) -> str:
    """Get plain text extract from English Wikipedia for a page title.

    Best-effort helper: returns the page's plain-text extract, or "" on any
    network, HTTP, or JSON-parse failure. (Previously only HTTPError was
    caught, so timeouts/connection errors and non-JSON bodies raised out of
    this otherwise best-effort function.)
    """
    import urllib.parse

    url = (
        "https://en.wikipedia.org/w/api.php"
        "?action=query&format=json&prop=extracts&explaintext=1&titles="
        + urllib.parse.quote(title)
    )
    # Wikipedia rejects requests without a descriptive User-Agent.
    headers = {"User-Agent": "Mozilla/5.0 (compatible; AgentBot/1.0; +https://example.com/bot)"}
    try:
        r = requests.get(url, timeout=15, headers=headers)
        r.raise_for_status()
        data = r.json()  # ValueError on a non-JSON body
    except (requests.exceptions.RequestException, ValueError) as exc:
        print(f"Warning: Wikipedia API error {exc}")
        return ""
    pages = data.get("query", {}).get("pages", {})
    if not pages:
        return ""
    # The API keys pages by numeric page id; take the first (only) entry.
    text = next(iter(pages.values())).get("extract", "")
    return text or ""
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
@tool
def wikipedia_api_query(title: str) -> str:
    """Get plain text extract from English Wikipedia for a page title."""
    # Thin @tool facade over the module-level helper so the same logic is
    # callable both by the agent and by plain Python code.
    extract = _wikipedia_api_query(title)
    return extract
|
| 63 |
+
|
| 64 |
+
|
| 65 |
@tool
def wikipedia_search(query: str) -> str:
    """Search Wikipedia for encyclopedic knowledge: historical facts, biographies, dates, definitions, figures, scientific information. Provides structured text summaries."""
    # Docstring above doubles as the tool description shown to the model.
    try:
        wrapper = WikipediaAPIWrapper(top_k_results=3, doc_content_chars_max=3000)
        summary = wrapper.run(query)
    except Exception as e:
        # Best-effort tool: surface the failure as text instead of raising.
        return f"Wikipedia error: {e}"
    return summary
|
|
|
|
| 75 |
@tool
|
| 76 |
def python_repl(code: str) -> str:
|
| 77 |
"""
|
| 78 |
+
Execute Python code for mathematical calculations, data processing, logic operations, and transformations.
|
| 79 |
+
Always use print() to output results.
|
| 80 |
+
Examples: print(2**10), print([1,2,3].count(2)), data=[1,2,3]; print(sum(data)/len(data))
|
| 81 |
"""
|
| 82 |
import io, sys
|
| 83 |
old_stdout = sys.stdout
|
|
|
|
| 96 |
@tool
|
| 97 |
def calculator(expression: str) -> str:
|
| 98 |
"""
|
| 99 |
+
Evaluate a mathematical expression quickly. Use for simple arithmetic and compound calculations.
|
| 100 |
+
Examples: '2 + 2', '100 * 1.07 ** 5', 'math.sqrt(144)', '(50 + 30) / 2'
|
| 101 |
"""
|
| 102 |
try:
|
| 103 |
return str(eval(expression, {"math": math, "__builtins__": {}}))
|
|
|
|
| 108 |
@tool
|
| 109 |
def get_task_file(task_id: str) -> str:
|
| 110 |
"""
|
| 111 |
+
Fetch the file or document attached to a GAIA task by its task_id.
|
| 112 |
+
Use this when the question mentions an attached file, document, PDF, or any attachment.
|
| 113 |
+
Returns text content for text/JSON files, or indicates binary file type.
|
| 114 |
"""
|
| 115 |
try:
|
| 116 |
import requests as req
|
|
|
|
| 130 |
class AgentState(TypedDict):
|
| 131 |
messages: Annotated[list[AnyMessage], add_messages]
|
| 132 |
|
| 133 |
+
SYSTEM_PROMPT = """You are a highly capable GAIA benchmark solver. Your goal is to answer questions accurately and precisely.
|
| 134 |
+
|
| 135 |
+
## How to Solve Questions - Step by Step
|
| 136 |
+
|
| 137 |
+
1. **Understand the Question**: Read carefully and identify:
|
| 138 |
+
- What type of answer is expected (number, text, list, date, etc.)
|
| 139 |
+
- Key constraints or special formats mentioned
|
| 140 |
+
- Whether a file or document is attached
|
| 141 |
+
|
| 142 |
+
2. **Choose Your Approach**:
|
| 143 |
+
- For arithmetic/math: Use `calculator` or `python_repl`
|
| 144 |
+
- For current facts/events: Use `web_search`
|
| 145 |
+
- For historical/encyclopedic knowledge: Use `wikipedia_search`
|
| 146 |
+
- For attached files: Use `get_task_file`
|
| 147 |
+
- For complex logic/data processing: Use `python_repl`
|
| 148 |
+
|
| 149 |
+
3. **Use Tools Effectively**:
|
| 150 |
+
- Search for key facts and verify information from multiple sources
|
| 151 |
+
- Extract relevant data from search results
|
| 152 |
+
- Perform calculations or transformations
|
| 153 |
+
- Cross-check results when possible
|
| 154 |
+
|
| 155 |
+
4. **Format Your Final Answer**:
|
| 156 |
+
- For numbers: just the number (e.g., "42", "3.14", "-5")
|
| 157 |
+
- For text: exact text without extra punctuation (e.g., "Paris", "Monday")
|
| 158 |
+
- For lists: comma-separated values (e.g., "item1, item2, item3")
|
| 159 |
+
- For dates: use the format specified in the question
|
| 160 |
+
- If completely unsure: respond with just "Unknown"
|
| 161 |
+
|
| 162 |
+
5. **End Response**:
|
| 163 |
+
After your reasoning, output a clean final answer on a new line:
|
| 164 |
+
FINAL ANSWER: <your answer>
|
| 165 |
+
|
| 166 |
+
## Important Rules
|
| 167 |
+
- Never make up facts - always search or calculate
|
| 168 |
+
- Verify key numbers and spelling with web search
|
| 169 |
+
- If a calculation is involved, always show the work
|
| 170 |
+
- Be concise in your reasoning but thorough in verification
|
| 171 |
"""
|
| 172 |
|
| 173 |
def _tool_to_openai_schema(t) -> dict:
|
|
|
|
| 197 |
|
| 198 |
# Mappa nome → funzione tool per esecuzione
|
| 199 |
self.tools_by_name = {t.name: t for t in self.tools_list}
|
| 200 |
+
|
| 201 |
+
hf_token = os.getenv("HF_TOKEN")
|
| 202 |
+
if not hf_token:
|
| 203 |
+
print("WARNING: HF_TOKEN non impostata. L'agente userà fallback locale e risposte molto limitate.")
|
| 204 |
+
self.client = None
|
| 205 |
+
else:
|
| 206 |
+
# InferenceClient diretto — usa la Serverless Inference API HF
|
| 207 |
+
self.client = InferenceClient(
|
| 208 |
+
api_key=hf_token,
|
| 209 |
+
)
|
| 210 |
|
| 211 |
# Schema OpenAI dei tool per passarli al client
|
| 212 |
self.tools_schema = [_tool_to_openai_schema(t) for t in self.tools_list]
|
|
|
|
| 260 |
hf_messages = self._messages_to_hf_format([sys_msg] + state["messages"])
|
| 261 |
|
| 262 |
response = self.client.chat_completion(
|
| 263 |
+
model="Qwen/Qwen2.5-72B-Instruct",
|
| 264 |
messages=hf_messages,
|
| 265 |
tools=self.tools_schema,
|
| 266 |
tool_choice="auto",
|
| 267 |
+
max_tokens=1000,
|
| 268 |
+
temperature=0.1,
|
| 269 |
)
|
| 270 |
|
| 271 |
choice = response.choices[0].message
|
|
|
|
| 286 |
tool_calls=tool_calls,
|
| 287 |
)
|
| 288 |
return {"messages": [ai_message]}
|
| 289 |
+
|
| 290 |
+
def _local_fallback_answer(self, question: str) -> str:
|
| 291 |
+
"""
|
| 292 |
+
Minimal fallback when inference client is unavailable.
|
| 293 |
+
Attempts basic arithmetic only, otherwise returns Unknown.
|
| 294 |
+
"""
|
| 295 |
+
q = question.lower().strip()
|
| 296 |
+
|
| 297 |
+
# Try simple arithmetic if it looks like a math problem
|
| 298 |
+
if re.search(r"(?:how\s+many|calculate|compute|what\s+is).*\d+", q):
|
| 299 |
+
try:
|
| 300 |
+
# Try to extract and evaluate a simple expression
|
| 301 |
+
numbers = re.findall(r"\d+\.?\d*", question)
|
| 302 |
+
if len(numbers) >= 2:
|
| 303 |
+
# Don't try to hardcode logic - just return Unknown
|
| 304 |
+
pass
|
| 305 |
+
except Exception:
|
| 306 |
+
pass
|
| 307 |
+
|
| 308 |
+
return "Unknown"
|
| 309 |
+
|
| 310 |
def __call__(self, question: str) -> str:
    """Answer a GAIA question.

    Pipeline: local fallback when no client is configured, a deterministic
    arithmetic shortcut, then the LangGraph agent with one retry on
    "Unknown". Returns a normalized short answer string.
    """
    print(f"Agent received question (first 100 chars): {question[:100]}...")

    if self.client is None:
        print("No HF InferenceClient configured; using local fallback logic")
        return self._local_fallback_answer(question)

    # Numeric math shortcut: explicit arithmetic detection reduces hallucination.
    # (Removed an unused `q_lower` local that was computed here and never read.)
    arithmetic = self._extract_arithmetic_expression(question)
    if arithmetic:
        # NOTE(review): `calculator` is a @tool-decorated object; calling it
        # directly relies on the tool's __call__ — confirm this works on the
        # installed langchain version (otherwise use calculator.invoke(...)).
        calc = calculator(arithmetic)
        if not calc.startswith("Calculation error"):
            normalized = self._normalize_answer(calc)
            print(f"Arithmetic shortcut using calculator: {arithmetic} -> {normalized}")
            return normalized

    # Main agent loop (LangGraph).
    answer = self._run_agent(question)

    # Retry once with an explicit step-by-step nudge when the first pass fails.
    if answer == "Unknown":
        print("Got Unknown on first pass; retrying with more explicit reasoning request")
        replay_question = question + "\n\nPlease reason step by step with tool calls and provide FINAL ANSWER only."  # gentle prompt nudge
        answer = self._run_agent(replay_question)

    print(f"Agent returning answer: '{answer}'")
    return answer
|
| 339 |
+
|
| 340 |
+
def _run_agent(self, question: str) -> str:
    """Run one pass of the LangGraph agent and extract a final answer.

    Returns "Unknown" on any error, except HTTP 402 (payment required),
    which is routed to the local fallback instead.
    """
    try:
        outcome = self.graph.invoke({"messages": [HumanMessage(content=question)]})
        final_msg = outcome["messages"][-1]
        if isinstance(final_msg, AIMessage):
            raw_text = final_msg.content
        else:
            raw_text = str(final_msg)
        print(f"Agent raw output (first 300 chars): {raw_text[:300]}...")
        return self._extract_answer(raw_text)
    except Exception as exc:
        error_message = str(exc)
        print(f"Agent error during run: {error_message}")
        # A 402 from the HF Inference API means the quota/plan ran out:
        # degrade gracefully rather than fail the whole question.
        if "402" in error_message or "Payment Required" in error_message:
            fallback = self._local_fallback_answer(question)
            print(f"Payment required detected; using local fallback answer: {fallback}")
            return fallback
        return "Unknown"
|
| 355 |
+
|
| 356 |
+
def _extract_answer(self, text: str) -> str:
    """
    Extract the final answer from agent output using multiple strategies.

    Tries the explicit "FINAL ANSWER:" marker first, then scans the last
    few non-empty lines, and falls back to "Unknown".
    """
    # Strategy 1: explicit "FINAL ANSWER:" marker anywhere in the text.
    marker = re.search(r"FINAL ANSWER:\s*(.+?)(?:\n|$)", text, re.IGNORECASE)
    if marker:
        tagged = self._normalize_answer(marker.group(1).strip())
        if tagged and tagged != "Unknown":
            return tagged

    # Strategy 2: walk the last few non-empty lines, newest first, skipping
    # lines that look like hedging/errors or restate the marker itself.
    rejected_phrases = ["i'm not sure", "error", "failed", "final answer"]
    stripped_lines = [ln.strip() for ln in text.split('\n') if ln.strip()]
    for candidate in reversed(stripped_lines[-4:]):
        lowered = candidate.lower()
        if any(phrase in lowered for phrase in rejected_phrases):
            continue
        normalized = self._normalize_answer(candidate)
        if normalized and normalized != "Unknown":
            return normalized

    # Fallback: nothing usable found.
    return "Unknown"
|
| 378 |
+
|
| 379 |
+
def _normalize_answer(self, answer: str) -> str:
|
| 380 |
+
"""Normalize answer text (strip punctuation, normalize choices etc.)."""
|
| 381 |
+
answer_clean = answer.strip().strip('"\'').rstrip('.?,;')
|
| 382 |
+
|
| 383 |
+
if not answer_clean:
|
| 384 |
+
return "Unknown"
|
| 385 |
+
|
| 386 |
+
# Multiple-choice token: take plain option text if present
|
| 387 |
+
mc = re.match(r"^([A-D])\s*[:\)]\s*(.+)$", answer_clean, re.IGNORECASE)
|
| 388 |
+
if mc:
|
| 389 |
+
return mc.group(2).strip()
|
| 390 |
+
|
| 391 |
+
# Numeric decision: enforce numeric format for numeric questions
|
| 392 |
+
if re.match(r"^-?\d+(\.\d+)?$", answer_clean):
|
| 393 |
+
return answer_clean
|
| 394 |
+
|
| 395 |
+
return answer_clean
|
| 396 |
+
|
| 397 |
+
def _extract_arithmetic_expression(self, question: str) -> Optional[str]:
|
| 398 |
+
"""Extract simple arithmetic expression candidate from a question for calculator use."""
|
| 399 |
+
m = re.search(r"([-+]?\d+(?:\.\d+)?(?:\s*[-+*/]\s*\d+(?:\.\d+)?)+)", question)
|
| 400 |
+
if not m:
|
| 401 |
+
return None
|
| 402 |
+
expr = m.group(1).replace("^", "**")
|
| 403 |
+
if re.search(r"[a-zA-Z]", expr):
|
| 404 |
+
return None
|
| 405 |
+
return expr
|
| 406 |
+
|
| 407 |
+
def _detect_question_type(self, question: str) -> str:
|
| 408 |
+
q = question.lower().strip()
|
| 409 |
+
if any(tok in q for tok in ["how many", "calculate", "compute", "sum", "difference", "times", "per cent", "%"]):
|
| 410 |
+
return "numeric"
|
| 411 |
+
if any(tok in q for tok in ["when", "year", "date", "born", "died"]):
|
| 412 |
+
return "date"
|
| 413 |
+
if any(tok in q for tok in ["which of the following", "option", "choose", "select"]):
|
| 414 |
+
return "multiple_choice"
|
| 415 |
+
return "factual"
|
| 416 |
+
|
| 417 |
|
| 418 |
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
| 419 |
"""
|
|
|
|
| 536 |
return status_message, results_df
|
| 537 |
|
| 538 |
|
| 539 |
+
def build_gradio_ui():
|
| 540 |
+
with gr.Blocks() as demo:
|
| 541 |
+
gr.Markdown("# Basic Agent Evaluation Runner")
|
| 542 |
+
gr.Markdown(
|
| 543 |
+
"""
|
| 544 |
+
**Instructions:**
|
| 545 |
|
| 546 |
+
1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
|
| 547 |
+
2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
|
| 548 |
+
3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
|
| 549 |
|
| 550 |
+
---
|
| 551 |
+
**Disclaimers:**
|
| 552 |
+
Once you click the "Submit" button, it can take quite some time (this is the time for the agent to go through all the questions).
|
| 553 |
+
This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance, to address the submit button's long delay, you could cache the answers and submit them in a separate action, or even answer the questions asynchronously.
|
| 554 |
+
"""
|
| 555 |
+
)
|
| 556 |
|
| 557 |
+
gr.LoginButton()
|
| 558 |
|
| 559 |
+
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 560 |
|
| 561 |
+
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
| 562 |
+
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
|
|
|
| 563 |
|
| 564 |
+
run_button.click(
|
| 565 |
+
fn=run_and_submit_all,
|
| 566 |
+
outputs=[status_output, results_table]
|
| 567 |
+
)
|
| 568 |
+
|
| 569 |
+
return demo
|
| 570 |
|
| 571 |
if __name__ == "__main__":
|
| 572 |
print("\n" + "-"*30 + " App Starting " + "-"*30)
|
| 573 |
+
|
| 574 |
# Check for SPACE_HOST and SPACE_ID at startup for information
|
| 575 |
space_host_startup = os.getenv("SPACE_HOST")
|
| 576 |
+
space_id_startup = os.getenv("SPACE_ID")
|
| 577 |
|
| 578 |
if space_host_startup:
|
| 579 |
print(f"✅ SPACE_HOST found: {space_host_startup}")
|
|
|
|
| 581 |
else:
|
| 582 |
print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
|
| 583 |
|
| 584 |
+
if space_id_startup:
|
| 585 |
print(f"✅ SPACE_ID found: {space_id_startup}")
|
| 586 |
print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
|
| 587 |
print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
|
|
|
|
| 591 |
print("-"*(60 + len(" App Starting ")) + "\n")
|
| 592 |
|
| 593 |
print("Launching Gradio Interface for Basic Agent Evaluation...")
|
| 594 |
+
demo = build_gradio_ui()
|
| 595 |
+
demo.launch(debug=True, share=False)
|