ahnhs2k commited on
Commit
1cdf0e9
·
1 Parent(s): fdc623e
Files changed (1) hide show
  1. app.py +301 -107
app.py CHANGED
@@ -1,155 +1,349 @@
1
  import os
2
- import re
3
  import time
 
4
  import gradio as gr
5
  import requests
 
6
  import pandas as pd
7
- from typing import TypedDict
8
- from langgraph.graph import StateGraph, START, END
9
  from langchain_openai import ChatOpenAI
10
  from langchain_core.messages import SystemMessage, HumanMessage
11
 
12
- # -------------------------
13
- # Constants
14
- # -------------------------
15
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
16
 
17
- # -------------------------
18
- # Utils
19
- # -------------------------
20
  def clean_answer(text: str) -> str:
21
  if not text:
22
  return ""
 
23
  s = text.strip()
24
- s = re.sub(r"^(Final answer:|Answer:)\s*", "", s, flags=re.I)
 
 
 
 
 
 
 
25
  s = s.splitlines()[0].strip()
26
- s = s.strip("\"'`")
27
- if s.endswith(".") and len(s) > 1:
28
- s = s[:-1]
 
 
 
 
 
 
 
 
29
  return s
30
 
31
- # -------------------------
32
- # State
33
- # -------------------------
34
- class AgentState(TypedDict):
35
- question: str
36
- answer: str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- # -------------------------
39
- # LLM
40
- # -------------------------
41
  llm = ChatOpenAI(
42
  model="gpt-4o-mini",
43
  temperature=0,
44
- max_tokens=128
45
  )
46
 
47
- SYSTEM_PROMPT = """
48
- You are solving GAIA benchmark questions.
49
 
50
- Rules (VERY IMPORTANT):
51
- - You MUST reason step by step internally.
52
- - NEVER show your reasoning.
53
- - Output ONLY the final answer.
54
- - No explanation.
55
- - No formatting.
56
- - No extra words.
57
  """
58
 
59
- # -------------------------
60
- # Node
61
- # -------------------------
62
- def solve_question(state: AgentState) -> dict:
63
- question = state["question"]
64
 
65
- messages = [
 
 
 
 
 
 
 
 
66
  SystemMessage(content=SYSTEM_PROMPT),
67
- HumanMessage(content=question)
68
  ]
 
 
 
 
 
 
69
 
70
- resp = llm.invoke(messages)
71
- answer = clean_answer(resp.content)
72
 
73
- return {"answer": answer}
 
74
 
75
- # -------------------------
76
- # Graph
77
- # -------------------------
78
- builder = StateGraph(AgentState)
79
- builder.add_node("solve", solve_question)
80
- builder.add_edge(START, "solve")
81
- builder.add_edge("solve", END)
82
- agent_graph = builder.compile()
83
 
84
- # -------------------------
85
- # Wrapper
86
- # -------------------------
 
 
 
 
 
87
  class BasicAgent:
 
 
 
88
  def __call__(self, question: str) -> str:
89
- result = agent_graph.invoke({"question": question, "answer": ""})
90
- return result["answer"]
91
 
92
- # -------------------------
93
- # Runner
94
- # -------------------------
95
- def run_and_submit_all(profile: gr.OAuthProfile | None):
96
- if not profile:
97
- return "Please login first.", None
 
98
 
99
- username = profile.username
100
- questions_url = f"{DEFAULT_API_URL}/questions"
101
- submit_url = f"{DEFAULT_API_URL}/submit"
102
 
103
- agent = BasicAgent()
 
 
 
 
 
 
104
 
105
- # fetch once
106
- questions = requests.get(questions_url).json()
 
 
 
 
107
 
108
- answers_payload = []
109
- log = []
 
110
 
111
- for q in questions:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  try:
113
- ans = agent(q["question"])
114
- answers_payload.append({
115
- "task_id": q["task_id"],
116
- "submitted_answer": ans
117
- })
118
- log.append({
119
- "task_id": q["task_id"],
120
- "answer": ans
121
- })
122
  except Exception as e:
123
- log.append({
124
- "task_id": q["task_id"],
125
- "answer": f"ERROR: {e}"
126
- })
127
-
128
- submission = {
129
- "username": username,
130
- "agent_code": "https://huggingface.co/spaces/ahnhs2k/Agents_Final_Assignment_",
131
- "answers": answers_payload
132
- }
133
-
134
- res = requests.post(submit_url, json=submission).json()
135
-
136
- status = (
137
- f"Score: {res.get('score')}% "
138
- f"({res.get('correct_count')}/{res.get('total_attempted')})\n"
139
- f"{res.get('message')}"
140
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
- return status, pd.DataFrame(log)
143
 
144
- # -------------------------
145
- # UI
146
- # -------------------------
147
  with gr.Blocks() as demo:
148
- gr.Markdown("# GAIA Level-1 Agent")
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  gr.LoginButton()
150
- btn = gr.Button("Run & Submit")
151
- out = gr.Textbox(lines=4)
152
- table = gr.DataFrame()
153
- btn.click(run_and_submit_all, outputs=[out, table])
154
 
155
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import inspect
import os
import random
import re
import time
from typing import Optional

import gradio as gr
import pandas as pd
import requests

from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage
12
 
13
# (Keep Constants as is)
# --- Constants ---
# Base URL of the GAIA scoring server this Space fetches questions from
# and submits answers to.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
16
 
17
+ # =========================================================
18
+ # Answer cleaning (EXACT MATCH 최적화)
19
+ # =========================================================
20
def clean_answer(text: str) -> str:
    """
    Normalize raw LLM output into a bare, single-line answer string
    suitable for GAIA's exact-match scoring.

    Args:
        text: Raw model output (may be empty).

    Returns:
        The cleaned answer, or "" when the input is empty/falsy.
    """
    # NOTE(review): this function uses `re`, which must be imported at the
    # top of the file (the old revision had `import re`).
    if not text:
        return ""

    s = text.strip()

    # Remove common prefixes/formatting ("Final answer:", "Answer:")
    s = re.sub(r"^(final\s*answer|answer)\s*:\s*", "", s, flags=re.IGNORECASE).strip()

    # Strip code-block / markdown backticks
    s = s.strip("`").strip()

    # If multiline, keep only the first line
    s = s.splitlines()[0].strip()

    # Drop surrounding quotes
    s = s.strip("\"'")

    # Drop a single trailing period — very conservatively, since touching
    # abbreviations or decimal points would be risky (skip when a digit
    # immediately precedes the period).
    if len(s) > 1 and s.endswith(".") and not re.search(r"\d\.$", s):
        s = s[:-1].strip()

    # Collapse redundant whitespace
    s = re.sub(r"\s+", " ", s).strip()

    return s
46
 
47
+ # =========================================================
48
+ # Robust request wrapper for GAIA server (429 대응)
49
+ # =========================================================
50
def get_with_backoff(url: str, timeout: int = 15, max_retries: int = 6) -> requests.Response:
    """
    GET *url*, retrying on 429 responses and network errors with
    exponential backoff plus jitter.

    Args:
        url: Target URL.
        timeout: Per-request timeout in seconds.
        max_retries: Total attempts before giving up.

    Returns:
        The successful `requests.Response`.

    Raises:
        requests.exceptions.RequestException: if the final attempt fails.
        RuntimeError: if every attempt was rate-limited (429).
    """
    attempt = 0
    while attempt < max_retries:
        try:
            resp = requests.get(url, timeout=timeout)
            if resp.status_code != 429:
                resp.raise_for_status()
                return resp
            # Rate-limited: exponential backoff with jitter, then retry.
            sleep_s = min(30, (2 ** attempt) + random.uniform(0, 1.5))
            print(f"[WARN] 429 Too Many Requests. Sleeping {sleep_s:.2f}s then retry {attempt+1}/{max_retries}...")
            time.sleep(sleep_s)
        except requests.exceptions.RequestException as err:
            # On the last attempt, surface the error to the caller.
            if attempt == max_retries - 1:
                raise
            sleep_s = min(20, (2 ** attempt) + random.uniform(0, 1.0))
            print(f"[WARN] GET failed: {err}. Sleeping {sleep_s:.2f}s then retry {attempt+1}/{max_retries}...")
            time.sleep(sleep_s)
        attempt += 1
    raise RuntimeError("get_with_backoff exhausted retries")
69
+
70
def post_with_backoff(url: str, json_data: dict, timeout: int = 60, max_retries: int = 5) -> requests.Response:
    """
    POST *json_data* to *url*, retrying on 429 responses and network errors
    with exponential backoff plus jitter.

    Args:
        url: Target URL.
        json_data: JSON-serializable payload.
        timeout: Per-request timeout in seconds.
        max_retries: Total attempts before giving up.

    Returns:
        The successful `requests.Response`.

    Raises:
        requests.exceptions.RequestException: if the final attempt fails.
        RuntimeError: if every attempt was rate-limited (429).
    """
    attempt = 0
    while attempt < max_retries:
        try:
            resp = requests.post(url, json=json_data, timeout=timeout)
            if resp.status_code != 429:
                resp.raise_for_status()
                return resp
            # Rate-limited: exponential backoff with jitter, then retry.
            sleep_s = min(30, (2 ** attempt) + random.uniform(0, 1.5))
            print(f"[WARN] 429 Too Many Requests (POST). Sleeping {sleep_s:.2f}s then retry {attempt+1}/{max_retries}...")
            time.sleep(sleep_s)
        except requests.exceptions.RequestException as err:
            # On the last attempt, surface the error to the caller.
            if attempt == max_retries - 1:
                raise
            sleep_s = min(20, (2 ** attempt) + random.uniform(0, 1.0))
            print(f"[WARN] POST failed: {err}. Sleeping {sleep_s:.2f}s then retry {attempt+1}/{max_retries}...")
            time.sleep(sleep_s)
        attempt += 1
    raise RuntimeError("post_with_backoff exhausted retries")
88
+
89
+ # =========================================================
90
+ # LLM setup (OpenAI)
91
+ # =========================================================
92
+ # Space Secrets에 OPENAI_API_KEY 필요
93
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
94
+ if OPENAI_API_KEY:
95
+ os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
96
 
 
 
 
97
  llm = ChatOpenAI(
98
  model="gpt-4o-mini",
99
  temperature=0,
100
+ max_tokens=96,
101
  )
102
 
103
+ SYSTEM_PROMPT = """You are solving GAIA benchmark questions.
 
104
 
105
+ Hard rules:
106
+ - Think step by step internally, but DO NOT reveal your reasoning.
107
+ - Verify arithmetic, units, dates, and entity names before finalizing.
108
+ - Output ONLY the final answer (exactly what should be matched).
109
+ - No explanation. No prefixes. No punctuation unless required by the answer itself.
110
+ - If the answer is a number/date/name, output it in the simplest canonical form.
 
111
  """
112
 
113
def build_user_prompt(question: str) -> str:
    """Wrap *question* in the standard GAIA user prompt (answer-only)."""
    parts = ["Question:", question, "", "Return ONLY the final answer.", ""]
    return "\n".join(parts)
119
+
120
+ # =========================================================
121
+ # 2-pass solve: (1) answer (2) self-check and possibly revise
122
+ # =========================================================
123
def solve_with_selfcheck(question: str) -> str:
    """
    Answer *question* with a two-pass LLM call: pass 1 produces an answer,
    pass 2 silently verifies (and possibly corrects) it.

    Returns the pass-2 answer, falling back to the pass-1 answer when
    pass 2 comes back empty.
    """
    # Pass 1: initial solve
    msg1 = [
        SystemMessage(content=SYSTEM_PROMPT),
        HumanMessage(content=build_user_prompt(question)),
    ]
    r1 = llm.invoke(msg1)
    a1 = clean_answer(getattr(r1, "content", "") or "")

    # Pass 2: self-check (verification only, kept short)
    # - GAIA requires "answer only", so the verification pass is also
    #   forced to output nothing but the final answer.
    check_prompt = f"""You previously answered: {a1}

Now do a silent verification. If the answer is wrong or not in canonical exact-match form, output the corrected final answer.
If it is correct, output exactly the same answer again.

Question:
{question}

Return ONLY the final answer.
"""
    msg2 = [
        SystemMessage(content=SYSTEM_PROMPT),
        HumanMessage(content=check_prompt),
    ]
    r2 = llm.invoke(msg2)
    a2 = clean_answer(getattr(r2, "content", "") or "")

    # If both are empty this is a failure; prefer a1 when only a2 is empty.
    if not a2 and a1:
        return a1
    return a2
155
+
156
+ # =========================================================
157
+ # Basic Agent Definition (템플릿 유지, 여기만 “진짜”로 바꿈)
158
+ # =========================================================
159
class BasicAgent:
    """Template-compatible callable agent wrapping the 2-pass LLM solver."""

    def __init__(self):
        print("BasicAgent initialized (LLM + self-check).")

    def __call__(self, question: str) -> str:
        """Answer *question*; on any LLM failure, log and return ""."""
        print(f"Agent received question (first 50 chars): {question[:50]}...")
        try:
            result = solve_with_selfcheck(question)
        except Exception as err:
            # An empty answer merely scores zero for this task, so log the
            # error and keep going instead of crashing the whole run.
            # (A different fallback could be plugged in here if desired.)
            print(f"[ERROR] LLM call failed: {err}")
            result = ""
        print(f"Agent returning answer: {result}")
        return result
 
176
 
177
def run_and_submit_all( profile: gr.OAuthProfile | None):
    """
    Fetches all questions, runs the BasicAgent on them, submits all answers,
    and displays the results.

    Args:
        profile: Logged-in Hugging Face profile, or None when not logged in.

    Returns:
        A (status_message, results_dataframe_or_None) tuple for the Gradio
        outputs.
    """
    # --- Determine HF Space Runtime URL and Repo URL ---
    space_id = os.getenv("SPACE_ID")  # Get the SPACE_ID for sending link to the code

    if profile:
        username = f"{profile.username}"
        print(f"User logged in: {username}")
    else:
        print("User not logged in.")
        return "Please Login to Hugging Face with the button.", None

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # 1. Instantiate Agent ( modify this part to create your agent)
    try:
        agent = BasicAgent()
    except Exception as e:
        print(f"Error instantiating agent: {e}")
        return f"Error initializing agent: {e}", None
    # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    print(agent_code)

    # 2. Fetch Questions
    # NOTE(review): get_with_backoff is defined above but unused here;
    # consider switching to it if the server rate-limits (429) this fetch.
    print(f"Fetching questions from: {questions_url}")
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            print("Fetched questions list is empty.")
            return "Fetched questions list is empty or invalid format.", None
        print(f"Fetched {len(questions_data)} questions.")
    # FIX: JSONDecodeError must be caught BEFORE RequestException — since
    # requests 2.27 it subclasses RequestException, so the original ordering
    # made this branch unreachable.
    except requests.exceptions.JSONDecodeError as e:
        print(f"Error decoding JSON response from questions endpoint: {e}")
        print(f"Response text: {response.text[:500]}")
        return f"Error decoding server response for questions: {e}", None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching questions: {e}")
        return f"Error fetching questions: {e}", None
    except Exception as e:
        print(f"An unexpected error occurred fetching questions: {e}")
        return f"An unexpected error occurred fetching questions: {e}", None

    # 3. Run your Agent
    results_log = []
    answers_payload = []
    print(f"Running agent on {len(questions_data)} questions...")
    for item in questions_data:
        task_id = item.get("task_id")
        question_text = item.get("question")
        if not task_id or question_text is None:
            print(f"Skipping item with missing task_id or question: {item}")
            continue
        try:
            submitted_answer = agent(question_text)
            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
        except Exception as e:
            # A per-task failure is recorded but must not abort the run.
            print(f"Error running agent on task {task_id}: {e}")
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})

    if not answers_payload:
        print("Agent did not produce any answers to submit.")
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    # 4. Prepare Submission
    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
    print(status_update)

    # 5. Submit
    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
    try:
        response = requests.post(submit_url, json=submission_data, timeout=60)
        response.raise_for_status()
        result_data = response.json()
        final_status = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        print("Submission successful.")
        results_df = pd.DataFrame(results_log)
        return final_status, results_df
    except requests.exceptions.HTTPError as e:
        # Try to extract the server's error detail; fall back to raw text.
        error_detail = f"Server responded with status {e.response.status_code}."
        try:
            error_json = e.response.json()
            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
        except requests.exceptions.JSONDecodeError:
            error_detail += f" Response: {e.response.text[:500]}"
        status_message = f"Submission Failed: {error_detail}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
    except requests.exceptions.Timeout:
        status_message = "Submission Failed: The request timed out."
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
    except requests.exceptions.RequestException as e:
        status_message = f"Submission Failed: Network error - {e}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
    except Exception as e:
        status_message = f"An unexpected error occurred during submission: {e}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
296
 
 
297
 
298
# --- Build Gradio Interface using Blocks ---
with gr.Blocks() as demo:
    gr.Markdown("# Basic Agent Evaluation Runner")
    gr.Markdown(
        """
        **Instructions:**
        1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
        2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
        3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
        ---
        **Disclaimers:**
        Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
        This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
        """
    )

    # OAuth login supplies the gr.OAuthProfile that run_and_submit_all needs.
    gr.LoginButton()

    run_button = gr.Button("Run Evaluation & Submit All Answers")

    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
    # Removed max_rows=10 from DataFrame constructor
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

    # No explicit inputs: Gradio injects the OAuth profile automatically.
    run_button.click(
        fn=run_and_submit_all,
        outputs=[status_output, results_table]
    )
326
+
327
if __name__ == "__main__":
    print("\n" + "-"*30 + " App Starting " + "-"*30)
    # Check for SPACE_HOST and SPACE_ID at startup for information
    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup

    if space_host_startup:
        print(f"✅ SPACE_HOST found: {space_host_startup}")
        print(f"   Runtime URL should be: https://{space_host_startup}.hf.space")
    else:
        print("ℹ️  SPACE_HOST environment variable not found (running locally?).")

    if space_id_startup: # Print repo URLs if SPACE_ID is found
        print(f"✅ SPACE_ID found: {space_id_startup}")
        print(f"   Repo URL: https://huggingface.co/spaces/{space_id_startup}")
        print(f"   Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
    else:
        print("ℹ️  SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")

    print("-"*(60 + len(" App Starting ")) + "\n")

    print("Launching Gradio Interface for Basic Agent Evaluation...")
    # debug=True surfaces tracebacks in the Space logs; share=False because
    # a Space already serves the app publicly.
    demo.launch(debug=True, share=False)