import os
import gradio as gr
import requests
import json
import pandas as pd
from agent import BasicAgent
import traceback
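# NOTE: BasicAgent (defined in agent.py) is assumed to be callable as
# agent(question_text, task_id, file_name) and to return a short answer
# string, using "unknown" when no answer is found — see the call in
# run_and_submit_all() below.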
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
HF_TOKEN = os.getenv("HF_TOKEN_HERE")
if not HF_TOKEN:
    raise ValueError("HF_TOKEN_HERE is missing in Secrets!")

HEADERS = {
    "Authorization": f"Bearer {HF_TOKEN}",
    "Content-Type": "application/json"
}
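# The same bearer-token headers are reused for the GAIA metadata download
# (which may require an authorized token for the gated dataset), the scoring
# API's /questions endpoint, and the /submit call.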
VALIDATION_URL = "https://huggingface.co/datasets/gaia-benchmark/GAIA/resolve/main/2023/validation/metadata.jsonl"
def fetch_validation_questions():
    """Fetch validation questions with better error handling."""
    try:
        response = requests.get(VALIDATION_URL, headers=HEADERS, timeout=15)
        response.raise_for_status()
        lines = response.text.splitlines()
        questions = []
        for line in lines:
            if line.strip():
                try:
                    row = json.loads(line)
                    if row.get("Level") == 1:
                        questions.append({
                            "task_id": row.get("task_id", ""),
                            "question": row.get("Question", ""),
                            "file_name": row.get("file_name", "")
                        })
                except json.JSONDecodeError as e:
                    print(f"Error parsing line: {line[:50]}... Error: {e}")
                    continue
        print(f"Fetched {len(questions)} Level 1 validation questions.")
        return questions[:20]  # Limit to 20 for testing
    except Exception as e:
        print(f"Error fetching validation questions: {e}")
        print(f"Traceback: {traceback.format_exc()}")
        return []
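
# Example (local debugging sketch, assuming HF_TOKEN_HERE is exported and the
# token has access to the gated GAIA dataset):
#   qs = fetch_validation_questions()
#   print(qs[0]["question"] if qs else "no questions fetched")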


def run_and_submit_all(use_validation: bool, profile: gr.OAuthProfile | None = None):
    """Enhanced run function with better logging and error handling."""
    space_id = os.getenv("SPACE_ID") or "saandip5/Final_Assignment_Template"

    if profile:
        username = f"{profile.username}"
        print(f"User logged in: {username}")
    else:
        print("User not logged in.")
        return "Please Login to Hugging Face with the button.", None

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    print(f"Agent code link: {agent_code}")

    # Initialize agent with error handling
    try:
        agent = BasicAgent()
        print("Agent initialized successfully")
    except Exception as e:
        error_msg = f"Error initializing agent: {e}\n{traceback.format_exc()}"
        print(error_msg)
        return error_msg, None

    # Fetch questions
    if use_validation:
        print("Using validation dataset...")
        questions_data = fetch_validation_questions()
    else:
        print(f"Fetching test questions from: {questions_url}")
        try:
            response = requests.get(questions_url, headers=HEADERS, timeout=15)
            response.raise_for_status()
            questions_data = response.json()
            print(f"Fetched {len(questions_data)} test questions.")
        except requests.exceptions.RequestException as e:
            error_msg = f"Error fetching questions: {e}"
            print(error_msg)
            return error_msg, None
        except json.JSONDecodeError as e:
            error_msg = f"Error decoding JSON response: {e}"
            print(error_msg)
            return error_msg, None

    if not questions_data:
        error_msg = "Fetched questions list is empty."
        print(error_msg)
        return error_msg, None

    # Process questions
    results_log = []
    answers_payload = []
    successful_answers = 0

    print(f"\n{'='*60}")
    print(f"STARTING EVALUATION ON {len(questions_data)} QUESTIONS")
    print(f"{'='*60}")

    for i, item in enumerate(questions_data, 1):
        task_id = item.get("task_id")
        question_text = item.get("question")
        file_name = item.get("file_name", "")
        print(f"\n[{i}/{len(questions_data)}] Processing task: {task_id}")

        if not task_id or question_text is None:
            print(f"Skipping item with missing data: {item}")
            continue

        try:
            # Call agent with enhanced error handling
            submitted_answer = agent(question_text, task_id, file_name)
            if submitted_answer and submitted_answer != "unknown":
                successful_answers += 1
                print(f"  Answer: {submitted_answer}")
            else:
                print("  No answer found")

            answers_payload.append({
                "task_id": task_id,
                "submitted_answer": submitted_answer
            })
            results_log.append({
                "Task ID": task_id,
                "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
                "File": file_name,
                "Submitted Answer": submitted_answer,
                "Status": "Success" if submitted_answer != "unknown" else "❓ Unknown"
            })
        except Exception as e:
            error_msg = f"AGENT ERROR: {str(e)}"
            print(f"  Error processing task {task_id}: {e}")
            print(f"Traceback: {traceback.format_exc()}")
            results_log.append({
                "Task ID": task_id,
                "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
                "File": file_name,
                "Submitted Answer": error_msg,
                "Status": "Error"
            })

    print(f"\n{'='*60}")
    print("EVALUATION COMPLETE")
    print(f"Total questions: {len(questions_data)}")
    print(f"Successful answers: {successful_answers}")
    print(f"Success rate: {(successful_answers/len(questions_data)*100):.1f}%")
    print(f"{'='*60}")

    if not answers_payload:
        error_msg = "Agent did not produce any answers to submit."
        print(error_msg)
        return error_msg, pd.DataFrame(results_log)

    # Save results log
    try:
        with open("results_log.json", "w") as f:
            json.dump(results_log, f, indent=2)
        print("Saved results_log.json")
    except Exception as e:
        print(f"Error saving results_log.json: {e}")

    # Prepare submission
    submission_data = {
        "username": username.strip(),
        "agent_code": agent_code,
        "answers": answers_payload
    }
    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
    print(status_update)

    # Submit or return results
    if not use_validation:
        print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
        try:
            response = requests.post(submit_url, json=submission_data, headers=HEADERS, timeout=60)
            response.raise_for_status()
            result_data = response.json()
            final_status = (
                f"Submission Successful!\n"
                f"User: {result_data.get('username')}\n"
                f"Overall Score: {result_data.get('score', 'N/A')}% "
                f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
                f"Message: {result_data.get('message', 'No message received.')}\n\n"
                f"Processing Summary:\n"
                f"• Total questions processed: {len(questions_data)}\n"
                f"• Answers found (non-'unknown'): {successful_answers}\n"
                f"• Processing success rate: {(successful_answers/len(questions_data)*100):.1f}%"
            )
            print("Submission successful.")
            return final_status, pd.DataFrame(results_log)
        except requests.exceptions.HTTPError as e:
            error_detail = f"Server responded with status {e.response.status_code}."
            try:
                error_json = e.response.json()
                error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
            except ValueError:
                # Response body was not valid JSON; fall back to raw text.
                error_detail += f" Response: {e.response.text[:500]}"
            status_message = f"Submission Failed: {error_detail}"
            print(status_message)
            return status_message, pd.DataFrame(results_log)
        except Exception as e:
            status_message = f"Submission Failed: {e}\n{traceback.format_exc()}"
            print(status_message)
            return status_message, pd.DataFrame(results_log)
    else:
        print("Validation mode: Skipping submission, returning results.")
        validation_summary = (
            f"Validation Run Complete\n\n"
            f"Summary:\n"
            f"• Total questions processed: {len(questions_data)}\n"
            f"• Answers found (non-'unknown'): {successful_answers}\n"
            f"• Processing success rate: {(successful_answers/len(questions_data)*100):.1f}%\n\n"
            f"This gives you an estimate of potential performance.\n"
            f"Check the results table below for a detailed breakdown."
        )
        return validation_summary, pd.DataFrame(results_log)


# Gradio Interface
with gr.Blocks(title="GAIA Benchmark Agent Evaluation", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# GAIA Benchmark Agent Evaluation")
    gr.Markdown(
        """
        ### Instructions:
        1. **Setup**: Ensure `HF_TOKEN_HERE` is set in Space Secrets.
        2. **Development**: Clone this Space and modify `agent.py` with your logic.
        3. **Authentication**: Log in to Hugging Face below.
        4. **Testing**: Select 'Use Validation' for local testing, or leave it unchecked to submit against the test set.
        5. **Run**: Click 'Run Evaluation & Submit All Answers' to process the questions and submit.

        ### Important Notes:
        - **Validation Mode**: Use this to test your agent on known questions before submitting.
        - **Test Mode**: Submits to the actual benchmark (limited submissions per day).
        - **Processing Time**: May take several minutes depending on the number of questions.
        - **Debugging**: Check `results_log.json` if you need to debug failures.

        ### Current Goal: Improve accuracy
        """
    )

    gr.LoginButton()

    with gr.Row():
        use_validation = gr.Checkbox(
            label="🧪 Use Validation Set for Testing",
            value=True,  # Default to validation for safety
            info="Recommended: Test on the validation set first before submitting to the test set"
        )
        run_button = gr.Button(
            "🚀 Run Evaluation & Submit All Answers",
            variant="primary",
            size="lg"
        )

    status_output = gr.Textbox(
        label="Run Status / Submission Result",
        lines=10,
        interactive=False,
        show_copy_button=True
    )
    results_table = gr.DataFrame(
        label="Detailed Results: Questions and Agent Answers",
        wrap=True,
        interactive=False
    )

    run_button.click(
        fn=run_and_submit_all,
        inputs=[use_validation],
        outputs=[status_output, results_table]
    )
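
# Note: gr.LoginButton() provides the OAuth session; Gradio injects the
# gr.OAuthProfile into run_and_submit_all's `profile` parameter automatically,
# which is why only the checkbox is listed in `inputs` above.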


if __name__ == "__main__":
    print("\n" + "="*70)
    print(" GAIA BENCHMARK AGENT - STARTING UP ")
    print("="*70)

    space_host = os.getenv("SPACE_HOST")
    space_id = os.getenv("SPACE_ID") or "saandip5/Final_Assignment_Template"

    if space_host:
        print(f"SPACE_HOST found: {space_host}")
        print(f"Runtime URL: https://{space_host}.hf.space")
    else:
        print("SPACE_HOST not found (running locally?)")

    if space_id:
        print(f"SPACE_ID found: {space_id}")
        print(f"Repo URL: https://huggingface.co/spaces/{space_id}")
        print(f"Repo Tree URL: https://huggingface.co/spaces/{space_id}/tree/main")
    else:
        print("SPACE_ID not found (running locally?)")

    print("="*70)
    print("Launching Gradio Interface...")
    print("="*70 + "\n")

    demo.launch(debug=True, share=False)
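
# Local run (assumption, not part of the Space runtime): export HF_TOKEN_HERE
# in the environment and run `python app.py`; SPACE_HOST/SPACE_ID will be
# absent, which the startup banner above reports as "running locally?".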