Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -34,7 +34,6 @@ def rate_limited_groq(api_key, prompt, system="", max_tokens=128):
|
|
| 34 |
return resp.json()["choices"][0]["message"]["content"].strip()
|
| 35 |
|
| 36 |
def download_task_file(task_id, hf_token=None):
|
| 37 |
-
"""Download with HF OAuth token for authentication."""
|
| 38 |
url = f"{DEFAULT_API_URL}/files/{task_id}"
|
| 39 |
headers = {}
|
| 40 |
if hf_token:
|
|
@@ -42,7 +41,7 @@ def download_task_file(task_id, hf_token=None):
|
|
| 42 |
try:
|
| 43 |
resp = requests.get(url, headers=headers, timeout=30)
|
| 44 |
print(f" File [{task_id[:8]}]: HTTP {resp.status_code}, "
|
| 45 |
-
f"size={len(resp.content)}, ct={resp.headers.get('content-type','?')[:
|
| 46 |
if resp.status_code != 200 or len(resp.content) == 0:
|
| 47 |
return None, None
|
| 48 |
cd = resp.headers.get("content-disposition", "")
|
|
@@ -62,7 +61,7 @@ def download_task_file(task_id, hf_token=None):
|
|
| 62 |
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=ext, prefix="gaia_")
|
| 63 |
tmp.write(resp.content)
|
| 64 |
tmp.close()
|
| 65 |
-
print(f" Saved: {fname} ({len(resp.content)} bytes)")
|
| 66 |
return tmp.name, fname
|
| 67 |
except Exception as e:
|
| 68 |
print(f" Download error: {e}")
|
|
@@ -131,9 +130,9 @@ def test_api():
|
|
| 131 |
except Exception as e:
|
| 132 |
return f"❌ {e}"
|
| 133 |
|
| 134 |
-
SYSTEM = """You are a GAIA benchmark agent. Exact match grading
|
| 135 |
Reply with ONLY the final answer. No explanation. No prefix. No "The answer is".
|
| 136 |
-
Give
|
| 137 |
|
| 138 |
class BasicAgent:
|
| 139 |
def __init__(self, hf_token=None):
|
|
@@ -141,7 +140,7 @@ class BasicAgent:
|
|
| 141 |
if not self.key:
|
| 142 |
raise RuntimeError("GROQ_API_KEY not set!")
|
| 143 |
self.hf_token = hf_token
|
| 144 |
-
print(f"Agent ready. Groq
|
| 145 |
|
| 146 |
def ask(self, prompt, max_tokens=128):
|
| 147 |
return clean_answer(rate_limited_groq(self.key, prompt, SYSTEM, max_tokens))
|
|
@@ -157,7 +156,7 @@ class BasicAgent:
|
|
| 157 |
file_ctx = ""
|
| 158 |
is_py = False
|
| 159 |
|
| 160 |
-
# Download file
|
| 161 |
if task_id:
|
| 162 |
lp, fn = download_task_file(task_id, self.hf_token)
|
| 163 |
if lp and fn:
|
|
@@ -183,7 +182,7 @@ class BasicAgent:
|
|
| 183 |
if results and "error" not in results.lower():
|
| 184 |
search_ctx = f"\n[Search]\n{results[:3500]}\n"
|
| 185 |
|
| 186 |
-
# Format hints
|
| 187 |
q = question.lower()
|
| 188 |
fmt = ""
|
| 189 |
if "studio album" in q:
|
|
@@ -196,20 +195,20 @@ class BasicAgent:
|
|
| 196 |
fmt = "\nSingle integer only."
|
| 197 |
elif "how many" in q:
|
| 198 |
fmt = "\nSingle integer only."
|
| 199 |
-
elif "ioc" in q
|
| 200 |
-
fmt = "\nIOC country code only (3 letters). If tied, alphabetically first
|
| 201 |
elif "excel" in q or ("sale" in q and "food" in q):
|
| 202 |
-
fmt = "\
|
| 203 |
elif "chess" in q:
|
| 204 |
-
fmt = "\nChess move in algebraic notation only (e.g. Qd8
|
| 205 |
elif "pitcher" in q and "number" in q:
|
| 206 |
-
fmt = "\nTwo last names
|
| 207 |
elif "wikipedia" in q and "nominat" in q:
|
| 208 |
fmt = "\nWikipedia username only."
|
| 209 |
elif "grocery" in q or ("shopping" in q and "list" in q):
|
| 210 |
fmt = "\nComma-separated list, alphabetical order."
|
| 211 |
elif "youtube" in q or "video" in q:
|
| 212 |
-
fmt = "\nExact short answer
|
| 213 |
|
| 214 |
prompt = (
|
| 215 |
f"Question: {question}"
|
|
@@ -232,15 +231,20 @@ class BasicAgent:
|
|
| 232 |
print(f" Error: {e}")
|
| 233 |
return ""
|
| 234 |
|
| 235 |
-
def run_and_submit_all(profile: gr.OAuthProfile | None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
space_id = os.getenv("SPACE_ID")
|
| 237 |
if not profile:
|
| 238 |
return "Please Login to Hugging Face.", None
|
| 239 |
|
| 240 |
username = profile.username
|
| 241 |
-
#
|
| 242 |
-
hf_token =
|
| 243 |
-
print(f"User: {username}
|
| 244 |
|
| 245 |
try:
|
| 246 |
agent = BasicAgent(hf_token=hf_token)
|
|
@@ -280,7 +284,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 280 |
|
| 281 |
try:
|
| 282 |
resp = requests.post(f"{DEFAULT_API_URL}/submit",
|
| 283 |
-
json={"username": username.strip(), "agent_code": agent_code,
|
|
|
|
| 284 |
timeout=60)
|
| 285 |
resp.raise_for_status()
|
| 286 |
r = resp.json()
|
|
@@ -292,7 +297,10 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 292 |
|
| 293 |
with gr.Blocks() as demo:
|
| 294 |
gr.Markdown("# Basic Agent Evaluation Runner")
|
| 295 |
-
gr.Markdown(
|
|
|
|
|
|
|
|
|
|
| 296 |
gr.LoginButton()
|
| 297 |
with gr.Row():
|
| 298 |
test_btn = gr.Button("🔬 Test Groq API", variant="secondary")
|
|
@@ -302,7 +310,12 @@ with gr.Blocks() as demo:
|
|
| 302 |
run_button = gr.Button("🚀 Run Evaluation & Submit All Answers", variant="primary")
|
| 303 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
| 304 |
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
| 305 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
|
| 307 |
if __name__ == "__main__":
|
| 308 |
key = os.getenv("GROQ_API_KEY", "")
|
|
|
|
| 34 |
return resp.json()["choices"][0]["message"]["content"].strip()
|
| 35 |
|
| 36 |
def download_task_file(task_id, hf_token=None):
|
|
|
|
| 37 |
url = f"{DEFAULT_API_URL}/files/{task_id}"
|
| 38 |
headers = {}
|
| 39 |
if hf_token:
|
|
|
|
| 41 |
try:
|
| 42 |
resp = requests.get(url, headers=headers, timeout=30)
|
| 43 |
print(f" File [{task_id[:8]}]: HTTP {resp.status_code}, "
|
| 44 |
+
f"size={len(resp.content)}, ct={resp.headers.get('content-type','?')[:50]}")
|
| 45 |
if resp.status_code != 200 or len(resp.content) == 0:
|
| 46 |
return None, None
|
| 47 |
cd = resp.headers.get("content-disposition", "")
|
|
|
|
| 61 |
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=ext, prefix="gaia_")
|
| 62 |
tmp.write(resp.content)
|
| 63 |
tmp.close()
|
| 64 |
+
print(f" Saved: {fname} ({len(resp.content)} bytes) -> {tmp.name}")
|
| 65 |
return tmp.name, fname
|
| 66 |
except Exception as e:
|
| 67 |
print(f" Download error: {e}")
|
|
|
|
| 130 |
except Exception as e:
|
| 131 |
return f"❌ {e}"
|
| 132 |
|
| 133 |
+
SYSTEM = """You are a GAIA benchmark agent. Exact match grading is used.
|
| 134 |
Reply with ONLY the final answer. No explanation. No prefix. No "The answer is".
|
| 135 |
+
Give only: a name, number, word, or short phrase."""
|
| 136 |
|
| 137 |
class BasicAgent:
|
| 138 |
def __init__(self, hf_token=None):
|
|
|
|
| 140 |
if not self.key:
|
| 141 |
raise RuntimeError("GROQ_API_KEY not set!")
|
| 142 |
self.hf_token = hf_token
|
| 143 |
+
print(f"Agent ready. Groq: {self.key[:8]}... | HF token: {'YES ✅' if hf_token else 'NO ❌'}")
|
| 144 |
|
| 145 |
def ask(self, prompt, max_tokens=128):
|
| 146 |
return clean_answer(rate_limited_groq(self.key, prompt, SYSTEM, max_tokens))
|
|
|
|
| 156 |
file_ctx = ""
|
| 157 |
is_py = False
|
| 158 |
|
| 159 |
+
# Download file using HF OAuth token
|
| 160 |
if task_id:
|
| 161 |
lp, fn = download_task_file(task_id, self.hf_token)
|
| 162 |
if lp and fn:
|
|
|
|
| 182 |
if results and "error" not in results.lower():
|
| 183 |
search_ctx = f"\n[Search]\n{results[:3500]}\n"
|
| 184 |
|
| 185 |
+
# Format hints per question type
|
| 186 |
q = question.lower()
|
| 187 |
fmt = ""
|
| 188 |
if "studio album" in q:
|
|
|
|
| 195 |
fmt = "\nSingle integer only."
|
| 196 |
elif "how many" in q:
|
| 197 |
fmt = "\nSingle integer only."
|
| 198 |
+
elif "ioc" in q:
|
| 199 |
+
fmt = "\nIOC country code only (3 letters e.g. USA). If tied on athlete count, alphabetically first."
|
| 200 |
elif "excel" in q or ("sale" in q and "food" in q):
|
| 201 |
+
fmt = "\nUSD with two decimal places, no $ sign, no commas (e.g. 8945.50)."
|
| 202 |
elif "chess" in q:
|
| 203 |
+
fmt = "\nChess move in algebraic notation only (e.g. Qd8)."
|
| 204 |
elif "pitcher" in q and "number" in q:
|
| 205 |
+
fmt = "\nTwo last names comma-separated. Lower jersey number pitcher first."
|
| 206 |
elif "wikipedia" in q and "nominat" in q:
|
| 207 |
fmt = "\nWikipedia username only."
|
| 208 |
elif "grocery" in q or ("shopping" in q and "list" in q):
|
| 209 |
fmt = "\nComma-separated list, alphabetical order."
|
| 210 |
elif "youtube" in q or "video" in q:
|
| 211 |
+
fmt = "\nExact short answer only."
|
| 212 |
|
| 213 |
prompt = (
|
| 214 |
f"Question: {question}"
|
|
|
|
| 231 |
print(f" Error: {e}")
|
| 232 |
return ""
|
| 233 |
|
| 234 |
+
def run_and_submit_all(profile: gr.OAuthProfile | None,
|
| 235 |
+
oauth_token: gr.OAuthToken | None):
|
| 236 |
+
"""
|
| 237 |
+
IMPORTANT: oauth_token gives us the actual HF bearer token
|
| 238 |
+
needed to download task files from the scoring API.
|
| 239 |
+
"""
|
| 240 |
space_id = os.getenv("SPACE_ID")
|
| 241 |
if not profile:
|
| 242 |
return "Please Login to Hugging Face.", None
|
| 243 |
|
| 244 |
username = profile.username
|
| 245 |
+
# Extract the actual token string
|
| 246 |
+
hf_token = oauth_token.token if oauth_token else None
|
| 247 |
+
print(f"User: {username} | HF token present: {'YES ✅' if hf_token else 'NO ❌'}")
|
| 248 |
|
| 249 |
try:
|
| 250 |
agent = BasicAgent(hf_token=hf_token)
|
|
|
|
| 284 |
|
| 285 |
try:
|
| 286 |
resp = requests.post(f"{DEFAULT_API_URL}/submit",
|
| 287 |
+
json={"username": username.strip(), "agent_code": agent_code,
|
| 288 |
+
"answers": answers_payload},
|
| 289 |
timeout=60)
|
| 290 |
resp.raise_for_status()
|
| 291 |
r = resp.json()
|
|
|
|
| 297 |
|
| 298 |
with gr.Blocks() as demo:
|
| 299 |
gr.Markdown("# Basic Agent Evaluation Runner")
|
| 300 |
+
gr.Markdown(
|
| 301 |
+
"**Setup:** Add `GROQ_API_KEY` in Space Settings → Secrets. "
|
| 302 |
+
"Free key at [console.groq.com](https://console.groq.com)"
|
| 303 |
+
)
|
| 304 |
gr.LoginButton()
|
| 305 |
with gr.Row():
|
| 306 |
test_btn = gr.Button("🔬 Test Groq API", variant="secondary")
|
|
|
|
| 310 |
run_button = gr.Button("🚀 Run Evaluation & Submit All Answers", variant="primary")
|
| 311 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
| 312 |
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
| 313 |
+
# Pass BOTH profile AND oauth_token so we can use the HF bearer token
|
| 314 |
+
run_button.click(
|
| 315 |
+
fn=run_and_submit_all,
|
| 316 |
+
|
| 317 |
+
outputs=[status_output, results_table]
|
| 318 |
+
)
|
| 319 |
|
| 320 |
if __name__ == "__main__":
|
| 321 |
key = os.getenv("GROQ_API_KEY", "")
|