pmeyhoefer committed on
Commit
fcc0bb0
·
verified ·
1 Parent(s): 15fa167

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -92
app.py CHANGED
@@ -31,9 +31,15 @@ except Exception as e:
31
  logger.error(f"Failed to instantiate DuckDuckGoSearchTool: {e}. Web search will not work.")
32
  search_tool_instance = None
33
 
 
34
  @tool
35
  def web_search(query: str) -> str:
36
- """Performs a web search using DuckDuckGo."""
 
 
 
 
 
37
  logger.info(f"Executing web_search with query: '{query[:100]}...'")
38
  if search_tool_instance is None:
39
  return "Search Error: Tool not initialized."
@@ -46,50 +52,60 @@ def web_search(query: str) -> str:
46
  logger.exception(f"web_search failed for query: {query}")
47
  return f"Search Error: {e}"
48
 
 
49
  @tool
50
  def wikipedia_lookup(page_title: str) -> str:
51
- """Fetches the summary introduction text of an English Wikipedia page."""
 
 
 
 
 
52
  page_safe = page_title.replace(" ", "_")
53
  logger.info(f"Executing wikipedia_lookup for page: '{page_title}' (URL: {page_safe})")
54
  try:
55
  url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{page_safe}"
56
- headers = {'User-Agent': f'GAIAgent/1.1 ({os.getenv("SPACE_ID", "unknown")})'}
57
  r = requests.get(url, headers=headers, timeout=15)
58
  r.raise_for_status()
59
  data = r.json()
60
  extract = data.get("extract", "")
61
  if extract:
 
62
  return extract
63
  else:
64
  page_type = data.get("type", "standard")
65
  title = data.get("title", page_title)
66
  if page_type == "disambiguation":
 
67
  return f"Wikipedia Error: '{title}' is a disambiguation page. Try a more specific title."
68
  else:
 
69
  return f"Wikipedia Error: Page '{title}' found but has no summary."
70
  except requests.exceptions.HTTPError as e:
71
  if e.response.status_code == 404:
 
72
  return f"Wikipedia Error: Page '{page_safe}' not found."
73
  else:
 
74
  return f"Wikipedia Error: HTTP {e.response.status_code} for page '{page_safe}'."
75
  except Exception as e:
76
  logger.exception(f"wikipedia_lookup failed for page: {page_safe}")
77
  return f"Wikipedia Error: Unexpected error: {e}"
78
 
79
  # --- The ReACT Prompt (ensure this is the *only* main prompt definition) ---
80
- # Define the *exact* instructions for the LLM, listing the *actual* tool function names.
81
- REACT_INSTRUCTION_PROMPT = """You are a helpful assistant that answers questions using the provided tools.
82
 
83
  Available Tools:
84
- - web_search(query: str): Use this for searching the web.
85
- - wikipedia_lookup(page_title: str): Use this to get information from a specific English Wikipedia page (e.g., 'Berlin', 'Python_(programming_language)').
86
 
87
  Follow these steps:
88
- 1. Thought: Plan which tool to use.
89
  2. Action: Call ONE tool (e.g., web_search(query="...") or wikipedia_lookup(page_title="...")).
90
  3. Observation: Record the result.
91
- 4. Thought: Analyze the result. If answer found, prepare it. If not, plan next step.
92
- 5. Repeat Action/Observation/Thought until answer is found or determined impossible.
93
  6. Thought: Summarize findings based ONLY on observations.
94
  7. Final Answer: Provide the answer starting exactly with "FINAL ANSWER: " using the required format (number, short string, or comma-separated list).
95
 
@@ -117,9 +133,8 @@ except Exception as e:
117
 
118
  logger.info("Initializing CodeAgent...")
119
  try:
120
- # Pass the list of actual tool functions
121
  agent = CodeAgent(
122
- tools=[web_search, wikipedia_lookup],
123
  model=llm_model
124
  )
125
  logger.info("CodeAgent initialized OK.")
@@ -131,17 +146,16 @@ except Exception as e:
131
  def run_agent_on_question(question: str) -> str:
132
  """Runs the agent with the CORRECT prompt."""
133
  question = question.strip()
134
- if not question:
135
- return "AGENT_ERROR: Question cannot be empty."
136
 
137
  # *** CRITICAL: Construct the prompt HERE using the correct variable ***
138
  full_prompt = REACT_INSTRUCTION_PROMPT.strip() + "\n\nQUESTION: " + question
139
  logger.info(f"--- Running Agent for Question: '{question}' ---")
140
- # Add debug log to show the start of the prompt being used
141
- logger.info(f"DEBUG: Using prompt starting with: {full_prompt[:300]}...") # Log beginning of prompt
142
 
143
  try:
144
- raw_result = agent.run(full_prompt) # Pass the correctly constructed prompt
145
  logger.info(f"Agent run completed. Output length: {len(raw_result)}")
146
  return raw_result
147
  except Exception as e:
@@ -149,35 +163,15 @@ def run_agent_on_question(question: str) -> str:
149
  return f"AGENT_ERROR: Exception during run: {e}\n{traceback.format_exc()}"
150
 
151
  # --- Gradio Interface & Submission Logic ---
152
-
153
- # FIX: Define evaluate_and_submit WITHOUT the hf_profile argument initially
154
- # We will get the profile *inside* the function if needed.
155
  def evaluate_and_submit():
156
  """Gradio action: Fetches questions, runs agent, submits results."""
157
  logger.info("🚀 Starting evaluation run...")
158
-
159
- # Get profile info *inside* the function - this avoids the TypeError
160
- # Note: This requires the user to be logged in via the button *before* clicking Run.
161
- try:
162
- # This method of getting profile might need adjustment depending on Gradio version/context
163
- # Placeholder: Assuming we can get username some other way if direct profile access fails.
164
- # For now, let's hardcode or retrieve differently if `gr.OAuthProfile()` isn't available here.
165
- # Let's proceed without username for now if OAuthProfile is problematic.
166
- # A better approach might involve JavaScript interaction or different Gradio auth flow.
167
- username = os.getenv("HF_USERNAME", "unknown_user") # Fallback to env var or default
168
- if username == "unknown_user":
169
- logger.warning("Could not determine Hugging Face username reliably. Using fallback.")
170
- # Alternative: Could try reading from OAuth info if available in request context (advanced)
171
-
172
- except Exception as auth_e:
173
- logger.error(f"Could not get user profile: {auth_e}. Using fallback username.")
174
- username = "unknown_user_error"
175
-
176
-
177
  logger.info(f"Running as user (best effort): {username}")
178
 
179
  # 1. Fetch Questions
180
- questions = []
181
  try:
182
  resp = requests.get(f"{SUBMISSION_URL}/questions", timeout=20)
183
  resp.raise_for_status()
@@ -186,91 +180,58 @@ def evaluate_and_submit():
186
  logger.info(f"✅ Fetched {len(questions)} questions.")
187
  except Exception as e:
188
  logger.exception("Failed to fetch questions")
189
- return f"❌ Error fetching questions: {e}", pd.DataFrame() # Return empty DF on fetch error
190
 
191
- if not questions:
192
- return "ℹ️ No questions fetched.", pd.DataFrame()
193
 
194
  # 2. Run Agent & Collect Results
195
  results_log = []
196
  answers_payload = []
197
  for i, item in enumerate(questions):
198
- task_id = item.get("task_id")
199
- question_text = item.get("question")
200
  if not task_id or not question_text: continue
201
-
202
  logger.info(f"Processing Q {i+1}/{len(questions)} (ID: {task_id})...")
203
  raw_agent_output = run_agent_on_question(question_text)
204
-
205
- final_answer = "AGENT_ERROR: No 'FINAL ANSWER:' marker." # Default
206
- marker = "FINAL ANSWER:"
207
- if marker in raw_agent_output:
208
- final_answer = raw_agent_output.split(marker, 1)[1].strip()
209
- elif "AGENT_ERROR:" in raw_agent_output:
210
- final_answer = raw_agent_output # Submit the error
211
-
212
- results_log.append({
213
- "Task ID": task_id, "Question": question_text,
214
- "Submitted Answer": final_answer, "Full Output": raw_agent_output
215
- })
216
  answers_payload.append({"task_id": task_id, "submitted_answer": final_answer})
217
 
218
  results_df = pd.DataFrame(results_log)
219
- if not answers_payload:
220
- return "⚠️ Agent ran but produced no answers.", results_df
221
 
222
  # 3. Submit Answers
223
  logger.info(f"Submitting {len(answers_payload)} answers...")
224
- space_id = os.getenv("SPACE_ID", "SPACE_ID_NOT_SET")
225
- agent_code_url = f"https://huggingface.co/spaces/{space_id}/tree/main" if "NOT_SET" not in space_id else "URL_NA"
226
  submit_data = {"username": username, "agent_code": agent_code_url, "answers": answers_payload}
227
-
228
  try:
229
  response = requests.post(f"{SUBMISSION_URL}/submit", json=submit_data, timeout=90)
230
- response.raise_for_status()
231
- result = response.json()
232
  logger.info(f"✅ Submission successful! Response: {result}")
233
- score = result.get('score', 'N/A')
234
- score_str = f"{float(score):.2f}%" if isinstance(score, (int, float)) else str(score)
235
- status = (f"✅ Success! Score: {score_str} "
236
- f"({result.get('correct_count','?')}/{result.get('total_attempted','?')}). "
237
- f"Msg: {result.get('message','')}")
238
  return status, results_df
239
  except Exception as e:
240
  logger.exception("Submission failed")
241
  err_msg = f"❌ Submission Failed: {e}"
242
- if hasattr(e, 'response') and e.response is not None:
243
- err_msg += f" | Response: {e.response.text[:300]}"
244
  return err_msg, results_df
245
 
246
  # --- Build Gradio App ---
247
  logger.info("Setting up Gradio interface...")
248
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
249
- gr.Markdown("# 🚀 Agent Evaluation Runner 🚀")
250
- gr.Markdown("Ensure `GITHUB_TOKEN` secret is set. Click Run to start.")
251
- # Removed LoginButton to simplify and avoid TypeError for now
252
- # gr.LoginButton()
253
-
254
  run_button = gr.Button("▶️ Run Evaluation & Submit All Answers", variant="primary")
255
  status_textbox = gr.Textbox(label="📊 Status", lines=4, interactive=False)
256
- results_df_display = gr.DataFrame(
257
- label="📋 Detailed Log",
258
- headers=["Task ID", "Question", "Submitted Answer", "Full Output"],
259
- wrap=True, column_widths=["10%", "25%", "20%", "45%"]
260
- )
261
-
262
- # Connect button click to the function WITHOUT inputs arg for now
263
- run_button.click(
264
- fn=evaluate_and_submit,
265
- inputs=None, # No direct inputs from UI components
266
- outputs=[status_textbox, results_df_display]
267
- )
268
-
269
  logger.info("Gradio interface setup complete.")
270
 
271
  # --- Launch ---
272
  if __name__ == "__main__":
273
  logger.info("Launching Gradio application...")
274
- demo.launch(debug=True, share=False) # share=False is fine for HF Spaces internally
 
275
  logger.info("Gradio application launched.")
276
-
 
31
  logger.error(f"Failed to instantiate DuckDuckGoSearchTool: {e}. Web search will not work.")
32
  search_tool_instance = None
33
 
34
+ # *** FIX: Added Args description to docstrings ***
35
  @tool
36
  def web_search(query: str) -> str:
37
+ """
38
+ Performs a web search using DuckDuckGo. Use this for general questions or current info.
39
+
40
+ Args:
41
+ query (str): The search query string.
42
+ """
43
  logger.info(f"Executing web_search with query: '{query[:100]}...'")
44
  if search_tool_instance is None:
45
  return "Search Error: Tool not initialized."
 
52
  logger.exception(f"web_search failed for query: {query}")
53
  return f"Search Error: {e}"
54
 
55
+ # *** FIX: Added Args description to docstrings ***
56
  @tool
57
  def wikipedia_lookup(page_title: str) -> str:
58
+ """
59
+ Fetches the summary intro text of an English Wikipedia page. Use exact titles.
60
+
61
+ Args:
62
+ page_title (str): The exact title of the Wikipedia page (e.g., 'Albert Einstein').
63
+ """
64
  page_safe = page_title.replace(" ", "_")
65
  logger.info(f"Executing wikipedia_lookup for page: '{page_title}' (URL: {page_safe})")
66
  try:
67
  url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{page_safe}"
68
+ headers = {'User-Agent': f'GAIAgent/1.2 ({os.getenv("SPACE_ID", "unknown")})'}
69
  r = requests.get(url, headers=headers, timeout=15)
70
  r.raise_for_status()
71
  data = r.json()
72
  extract = data.get("extract", "")
73
  if extract:
74
+ logger.info(f"Wikipedia found summary ({len(extract)} chars) for '{page_title}'.")
75
  return extract
76
  else:
77
  page_type = data.get("type", "standard")
78
  title = data.get("title", page_title)
79
  if page_type == "disambiguation":
80
+ logger.warning(f"Wikipedia page '{title}' is disambiguation.")
81
  return f"Wikipedia Error: '{title}' is a disambiguation page. Try a more specific title."
82
  else:
83
+ logger.warning(f"Wikipedia page '{title}' found but has no summary.")
84
  return f"Wikipedia Error: Page '{title}' found but has no summary."
85
  except requests.exceptions.HTTPError as e:
86
  if e.response.status_code == 404:
87
+ logger.warning(f"Wikipedia page not found: {page_safe}")
88
  return f"Wikipedia Error: Page '{page_safe}' not found."
89
  else:
90
+ logger.error(f"Wikipedia HTTP error {e.response.status_code} for {page_safe}")
91
  return f"Wikipedia Error: HTTP {e.response.status_code} for page '{page_safe}'."
92
  except Exception as e:
93
  logger.exception(f"wikipedia_lookup failed for page: {page_safe}")
94
  return f"Wikipedia Error: Unexpected error: {e}"
95
 
96
  # --- The ReACT Prompt (ensure this is the *only* main prompt definition) ---
97
+ REACT_INSTRUCTION_PROMPT = """You are a helpful assistant using tools to answer questions.
 
98
 
99
  Available Tools:
100
+ - web_search(query: str): Searches the web. Use for general info or current events.
101
+ - wikipedia_lookup(page_title: str): Looks up a specific English Wikipedia page. Use exact titles (e.g., 'Berlin').
102
 
103
  Follow these steps:
104
+ 1. Thought: Plan which tool to use and why.
105
  2. Action: Call ONE tool (e.g., web_search(query="...") or wikipedia_lookup(page_title="...")).
106
  3. Observation: Record the result.
107
+ 4. Thought: Analyze result. If answered, prepare final answer. If not, plan next step.
108
+ 5. Repeat Action/Observation/Thought until answered or determined impossible.
109
  6. Thought: Summarize findings based ONLY on observations.
110
  7. Final Answer: Provide the answer starting exactly with "FINAL ANSWER: " using the required format (number, short string, or comma-separated list).
111
 
 
133
 
134
  logger.info("Initializing CodeAgent...")
135
  try:
 
136
  agent = CodeAgent(
137
+ tools=[web_search, wikipedia_lookup], # Pass the functions decorated with @tool
138
  model=llm_model
139
  )
140
  logger.info("CodeAgent initialized OK.")
 
146
  def run_agent_on_question(question: str) -> str:
147
  """Runs the agent with the CORRECT prompt."""
148
  question = question.strip()
149
+ if not question: return "AGENT_ERROR: Question cannot be empty."
 
150
 
151
  # *** CRITICAL: Construct the prompt HERE using the correct variable ***
152
  full_prompt = REACT_INSTRUCTION_PROMPT.strip() + "\n\nQUESTION: " + question
153
  logger.info(f"--- Running Agent for Question: '{question}' ---")
154
+ # *** Add more prominent logging to verify the prompt ***
155
+ logger.info(f"CRITICAL_DEBUG: Using prompt beginning:\n{full_prompt[:400]}\n...") # Log first 400 chars
156
 
157
  try:
158
+ raw_result = agent.run(full_prompt)
159
  logger.info(f"Agent run completed. Output length: {len(raw_result)}")
160
  return raw_result
161
  except Exception as e:
 
163
  return f"AGENT_ERROR: Exception during run: {e}\n{traceback.format_exc()}"
164
 
165
  # --- Gradio Interface & Submission Logic ---
166
+ # Using the version without direct profile input to avoid potential TypeErrors
 
 
167
  def evaluate_and_submit():
168
  """Gradio action: Fetches questions, runs agent, submits results."""
169
  logger.info("🚀 Starting evaluation run...")
170
+ username = os.getenv("HF_USERNAME", "unknown_user") # Fallback username
171
+ if username == "unknown_user": logger.warning("Could not get HF username reliably.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  logger.info(f"Running as user (best effort): {username}")
173
 
174
  # 1. Fetch Questions
 
175
  try:
176
  resp = requests.get(f"{SUBMISSION_URL}/questions", timeout=20)
177
  resp.raise_for_status()
 
180
  logger.info(f"✅ Fetched {len(questions)} questions.")
181
  except Exception as e:
182
  logger.exception("Failed to fetch questions")
183
+ return f"❌ Error fetching questions: {e}", pd.DataFrame()
184
 
185
+ if not questions: return "ℹ️ No questions fetched.", pd.DataFrame()
 
186
 
187
  # 2. Run Agent & Collect Results
188
  results_log = []
189
  answers_payload = []
190
  for i, item in enumerate(questions):
191
+ task_id = item.get("task_id"); question_text = item.get("question")
 
192
  if not task_id or not question_text: continue
 
193
  logger.info(f"Processing Q {i+1}/{len(questions)} (ID: {task_id})...")
194
  raw_agent_output = run_agent_on_question(question_text)
195
+ final_answer = "AGENT_ERROR: No 'FINAL ANSWER:' marker."
196
+ marker = "FINAL ANSWER:";
197
+ if marker in raw_agent_output: final_answer = raw_agent_output.split(marker, 1)[1].strip()
198
+ elif "AGENT_ERROR:" in raw_agent_output: final_answer = raw_agent_output
199
+ results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": final_answer, "Full Output": raw_agent_output})
 
 
 
 
 
 
 
200
  answers_payload.append({"task_id": task_id, "submitted_answer": final_answer})
201
 
202
  results_df = pd.DataFrame(results_log)
203
+ if not answers_payload: return "⚠️ Agent ran but produced no answers.", results_df
 
204
 
205
  # 3. Submit Answers
206
  logger.info(f"Submitting {len(answers_payload)} answers...")
207
+ space_id = os.getenv("SPACE_ID", "NA"); agent_code_url = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id != "NA" else "NA"
 
208
  submit_data = {"username": username, "agent_code": agent_code_url, "answers": answers_payload}
 
209
  try:
210
  response = requests.post(f"{SUBMISSION_URL}/submit", json=submit_data, timeout=90)
211
+ response.raise_for_status(); result = response.json()
 
212
  logger.info(f"✅ Submission successful! Response: {result}")
213
+ score = result.get('score', 'N/A'); score_str = f"{float(score):.2f}%" if isinstance(score, (int, float)) else str(score)
214
+ status = (f"✅ Success! Score: {score_str} ({result.get('correct_count','?')}/{result.get('total_attempted','?')}). Msg: {result.get('message','')}")
 
 
 
215
  return status, results_df
216
  except Exception as e:
217
  logger.exception("Submission failed")
218
  err_msg = f"❌ Submission Failed: {e}"
219
+ if hasattr(e, 'response') and e.response is not None: err_msg += f" | Response: {e.response.text[:300]}"
 
220
  return err_msg, results_df
221
 
222
  # --- Build Gradio App ---
223
  logger.info("Setting up Gradio interface...")
224
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
225
+ gr.Markdown("# 🚀 Agent Evaluation Runner 🚀\nEnsure `GITHUB_TOKEN` secret is set. Click Run to start.")
 
 
 
 
226
  run_button = gr.Button("▶️ Run Evaluation & Submit All Answers", variant="primary")
227
  status_textbox = gr.Textbox(label="📊 Status", lines=4, interactive=False)
228
+ results_df_display = gr.DataFrame(label="📋 Detailed Log", headers=["Task ID", "Question", "Submitted Answer", "Full Output"], wrap=True, column_widths=["10%", "25%", "20%", "45%"])
229
+ run_button.click(fn=evaluate_and_submit, inputs=None, outputs=[status_textbox, results_df_display])
 
 
 
 
 
 
 
 
 
 
 
230
  logger.info("Gradio interface setup complete.")
231
 
232
  # --- Launch ---
233
  if __name__ == "__main__":
234
  logger.info("Launching Gradio application...")
235
+ # Setting share=False as recommended for HF Spaces, debug=True for detailed Gradio logs
236
+ demo.launch(debug=True, share=False)
237
  logger.info("Gradio application launched.")