Spaces:
Runtime error
Runtime error
Refactor run_gaia_evaluation to integrate LiteLLMModel and update agent initialization.
Browse files
app.py
CHANGED
|
@@ -1,18 +1,11 @@
|
|
| 1 |
import os
|
| 2 |
-
import gradio as gr
|
| 3 |
import requests
|
| 4 |
-
import pandas as pd
|
| 5 |
from dotenv import load_dotenv
|
|
|
|
| 6 |
|
| 7 |
-
from smolagents import CodeAgent, OpenAIServerModel, DuckDuckGoSearchTool
|
| 8 |
-
|
| 9 |
-
# Load environment variables (including OPENAI_API_KEY)
|
| 10 |
load_dotenv()
|
| 11 |
|
| 12 |
-
# --- Constants ---
|
| 13 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 14 |
-
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
| 15 |
-
|
| 16 |
INSTRUCTIONS = """You are a general AI assistant. I will ask you a question. Report your thoughts, and then provide your final answer.
|
| 17 |
|
| 18 |
CRITICAL FORMATTING RULES:
|
|
@@ -24,62 +17,34 @@ CRITICAL FORMATTING RULES:
|
|
| 24 |
- For strings: no extra spaces, no punctuation unless part of the answer, lowercase
|
| 25 |
- For numbers: just the number, no units, no commas, no currency symbols
|
| 26 |
- Provide ONLY the answer as your final response, nothing else
|
|
|
|
| 27 |
|
| 28 |
You have access to a web search tool to help you find accurate information. Use it when you need to look up facts."""
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
def __init__(self):
|
| 33 |
-
print("Initializing SmolAgent with OpenAI model...")
|
| 34 |
-
if not OPENAI_API_KEY:
|
| 35 |
-
raise ValueError("OPENAI_API_KEY not found. Please set it in your environment.")
|
| 36 |
-
|
| 37 |
-
# Initialize the OpenAI-backed model
|
| 38 |
-
self.model = OpenAIServerModel(
|
| 39 |
-
model_id="gpt-4o-mini", # or "gpt-4", "gpt-3.5-turbo", etc.
|
| 40 |
-
api_base="https://api.openai.com/v1",
|
| 41 |
-
api_key=OPENAI_API_KEY,
|
| 42 |
-
)
|
| 43 |
-
|
| 44 |
-
# Initialize the agent with tools and instructions
|
| 45 |
-
self.agent = CodeAgent(
|
| 46 |
-
tools=[DuckDuckGoSearchTool()],
|
| 47 |
-
model=self.model,
|
| 48 |
-
instructions=INSTRUCTIONS,
|
| 49 |
-
max_steps=7,
|
| 50 |
-
)
|
| 51 |
-
print("SmolAgent initialized with CodeAgent and DuckDuckGoSearchTool.")
|
| 52 |
-
|
| 53 |
-
def __call__(self, question: str) -> str:
|
| 54 |
-
print(f"\nπͺ Running on question:\n{question}\n")
|
| 55 |
-
try:
|
| 56 |
-
answer = self.agent.run(question)
|
| 57 |
-
print(f"✅ Agent's final answer: {answer}")
|
| 58 |
-
return str(answer)
|
| 59 |
-
except Exception as e:
|
| 60 |
-
import traceback
|
| 61 |
-
traceback.print_exc()
|
| 62 |
-
error_message = f"AGENT ERROR: {e}"
|
| 63 |
-
print(f"β {error_message}")
|
| 64 |
-
return error_message
|
| 65 |
-
|
| 66 |
-
def run_gaia_evaluation(username: str):
|
| 67 |
-
"""Run the complete GAIA evaluation and submit results"""
|
| 68 |
-
print("π GAIA Benchmark Evaluation with ChatGPT")
|
| 69 |
print("=" * 60)
|
| 70 |
-
|
|
|
|
| 71 |
if not username:
|
| 72 |
-
|
| 73 |
-
|
| 74 |
print(f"π€ User: {username}")
|
| 75 |
-
|
| 76 |
-
# Initialize the agent
|
| 77 |
-
try:
|
| 78 |
-
agent = SmolAgent()
|
| 79 |
-
except Exception as e:
|
| 80 |
-
return f"β Failed to initialize agent: {e}"
|
| 81 |
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
try:
|
| 84 |
resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=30)
|
| 85 |
resp.raise_for_status()
|
|
@@ -87,121 +52,51 @@ def run_gaia_evaluation(username: str):
|
|
| 87 |
questions = data if isinstance(data, list) else data.get("questions", [])
|
| 88 |
print(f"π Loaded {len(questions)} questions")
|
| 89 |
except requests.RequestException as e:
|
| 90 |
-
|
|
|
|
| 91 |
|
| 92 |
-
# Process questions
|
| 93 |
results = []
|
| 94 |
-
progress_log = []
|
| 95 |
-
|
| 96 |
for i, q in enumerate(questions):
|
| 97 |
task_id = q["task_id"]
|
| 98 |
text = q["question"]
|
| 99 |
-
progress_log.append(f"β Question {i+1}: {text}")
|
| 100 |
print(f"\nβ Question {i+1}: {text}")
|
| 101 |
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
except Exception as e:
|
| 119 |
-
error_msg = f"AGENT ERROR: {e}"
|
| 120 |
-
results.append({"task_id": task_id, "submitted_answer": error_msg})
|
| 121 |
-
progress_log.append(f"β Error: {error_msg}")
|
| 122 |
-
print(f"β Error: {error_msg}")
|
| 123 |
-
|
| 124 |
-
# Submit results
|
| 125 |
payload = {
|
| 126 |
"username": username,
|
| 127 |
-
"agent_code": "
|
| 128 |
"answers": results,
|
| 129 |
}
|
| 130 |
-
|
| 131 |
try:
|
| 132 |
-
print("π€ Submitting to GAIA leaderboard...")
|
| 133 |
post = requests.post(f"{DEFAULT_API_URL}/submit", json=payload, timeout=60)
|
| 134 |
post.raise_for_status()
|
| 135 |
res = post.json()
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
{'
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
π¬ Message: {res.get('message', 'N/A')}
|
| 145 |
-
{'=' * 60}
|
| 146 |
-
"""
|
| 147 |
-
|
| 148 |
-
# Combine progress log with final results
|
| 149 |
-
full_log = "\n".join(progress_log) + "\n" + result_summary
|
| 150 |
-
return full_log
|
| 151 |
-
|
| 152 |
except requests.RequestException as e:
|
| 153 |
-
|
| 154 |
done = sum(1 for r in results if not r["submitted_answer"].startswith("AGENT ERROR"))
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
# --- Gradio Interface ---
|
| 159 |
-
def create_interface():
|
| 160 |
-
with gr.Blocks(title="GAIA Benchmark with ChatGPT", theme=gr.themes.Soft()) as demo:
|
| 161 |
-
gr.Markdown("# π GAIA Benchmark Evaluation with ChatGPT")
|
| 162 |
-
gr.Markdown("This app runs the GAIA benchmark using ChatGPT (GPT-4o-mini) with web search capabilities.")
|
| 163 |
-
|
| 164 |
-
with gr.Row():
|
| 165 |
-
with gr.Column(scale=1):
|
| 166 |
-
username_input = gr.Textbox(
|
| 167 |
-
label="Hugging Face Username",
|
| 168 |
-
placeholder="Enter your HF username",
|
| 169 |
-
info="This will be used for the GAIA leaderboard submission"
|
| 170 |
-
)
|
| 171 |
-
|
| 172 |
-
run_button = gr.Button("π Run GAIA Evaluation", variant="primary", size="lg")
|
| 173 |
-
|
| 174 |
-
with gr.Column(scale=2):
|
| 175 |
-
output_area = gr.Textbox(
|
| 176 |
-
label="Results & Progress",
|
| 177 |
-
lines=20,
|
| 178 |
-
max_lines=50,
|
| 179 |
-
interactive=False
|
| 180 |
-
)
|
| 181 |
-
|
| 182 |
-
# Event handler
|
| 183 |
-
run_button.click(
|
| 184 |
-
fn=run_gaia_evaluation,
|
| 185 |
-
inputs=[username_input],
|
| 186 |
-
outputs=[output_area]
|
| 187 |
-
)
|
| 188 |
-
|
| 189 |
-
gr.Markdown("""
|
| 190 |
-
### How it works:
|
| 191 |
-
1. Enter your Hugging Face username
|
| 192 |
-
2. Click "Run GAIA Evaluation"
|
| 193 |
-
3. The agent will process all 20 GAIA questions using ChatGPT + web search
|
| 194 |
-
4. Results will be automatically submitted to the GAIA leaderboard
|
| 195 |
-
5. Your score will be displayed here
|
| 196 |
-
|
| 197 |
-
### Requirements:
|
| 198 |
-
- Set `OPENAI_API_KEY` in your environment variables
|
| 199 |
-
- Valid Hugging Face username for leaderboard submission
|
| 200 |
-
""")
|
| 201 |
-
|
| 202 |
-
return demo
|
| 203 |
-
|
| 204 |
-
# --- Main execution ---
|
| 205 |
if __name__ == "__main__":
|
| 206 |
-
|
| 207 |
-
demo.launch()
|
|
|
|
| 1 |
import os
|
|
|
|
| 2 |
import requests
|
|
|
|
| 3 |
from dotenv import load_dotenv
|
| 4 |
+
from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel
|
| 5 |
|
|
|
|
|
|
|
|
|
|
| 6 |
load_dotenv()
|
| 7 |
|
|
|
|
| 8 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
|
|
|
|
|
|
| 9 |
INSTRUCTIONS = """You are a general AI assistant. I will ask you a question. Report your thoughts, and then provide your final answer.
|
| 10 |
|
| 11 |
CRITICAL FORMATTING RULES:
|
|
|
|
| 17 |
- For strings: no extra spaces, no punctuation unless part of the answer, lowercase
|
| 18 |
- For numbers: just the number, no units, no commas, no currency symbols
|
| 19 |
- Provide ONLY the answer as your final response, nothing else
|
| 20 |
+
- Expand abbreviations like 'St.' to 'Saint' in city names
|
| 21 |
|
| 22 |
You have access to a web search tool to help you find accurate information. Use it when you need to look up facts."""
|
| 23 |
|
| 24 |
+
def run_gaia_evaluation():
|
| 25 |
+
print("π GAIA Benchmark Evaluation with Ollama")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
print("=" * 60)
|
| 27 |
+
|
| 28 |
+
username = os.getenv("HF_USERNAME")
|
| 29 |
if not username:
|
| 30 |
+
print("β Please set HF_USERNAME environment variable")
|
| 31 |
+
return
|
| 32 |
print(f"👤 User: {username}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
+
model = LiteLLMModel(
|
| 35 |
+
model_id="ollama_chat/gemma3",
|
| 36 |
+
api_base="http://localhost:11434",
|
| 37 |
+
num_ctx=8192,
|
| 38 |
+
temperature=0.1, # Low temperature for more deterministic answers
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
agent = CodeAgent(
|
| 42 |
+
tools=[DuckDuckGoSearchTool()],
|
| 43 |
+
model=model,
|
| 44 |
+
instructions=INSTRUCTIONS,
|
| 45 |
+
max_steps=10,
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
try:
|
| 49 |
resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=30)
|
| 50 |
resp.raise_for_status()
|
|
|
|
| 52 |
questions = data if isinstance(data, list) else data.get("questions", [])
|
| 53 |
print(f"π Loaded {len(questions)} questions")
|
| 54 |
except requests.RequestException as e:
|
| 55 |
+
print(f"β Error fetching questions: {e}")
|
| 56 |
+
return
|
| 57 |
|
|
|
|
| 58 |
results = []
|
|
|
|
|
|
|
| 59 |
for i, q in enumerate(questions):
|
| 60 |
task_id = q["task_id"]
|
| 61 |
text = q["question"]
|
|
|
|
| 62 |
print(f"\nβ Question {i+1}: {text}")
|
| 63 |
|
| 64 |
+
result = agent.run(text, reset=True)
|
| 65 |
+
result_str = str(result).strip()
|
| 66 |
+
|
| 67 |
+
# Take the last line as the answer (since agent should provide only the answer)
|
| 68 |
+
out = result_str.splitlines()[-1] if result_str else "AGENT ERROR: No response."
|
| 69 |
+
|
| 70 |
+
if out.startswith("{"):
|
| 71 |
+
out = "AGENT ERROR: No final answer."
|
| 72 |
+
|
| 73 |
+
out = out.strip().rstrip(".")
|
| 74 |
+
results.append({"task_id": task_id, "submitted_answer": out})
|
| 75 |
+
print(f"✅ Answer: '{out}'")
|
| 76 |
+
print(f"π Preview: {result_str[:200]}...")
|
| 77 |
+
|
| 78 |
+
# Submit answers automatically
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
payload = {
|
| 80 |
"username": username,
|
| 81 |
+
"agent_code": "ollama-gemma3-with-tools",
|
| 82 |
"answers": results,
|
| 83 |
}
|
|
|
|
| 84 |
try:
|
|
|
|
| 85 |
post = requests.post(f"{DEFAULT_API_URL}/submit", json=payload, timeout=60)
|
| 86 |
post.raise_for_status()
|
| 87 |
res = post.json()
|
| 88 |
+
print("\n" + "=" * 60)
|
| 89 |
+
print("π GAIA BENCHMARK RESULTS")
|
| 90 |
+
print("=" * 60)
|
| 91 |
+
print(f"👤 User: {res.get('username', username)}")
|
| 92 |
+
print(f"π Overall Score: {res.get('score', res.get('overall_score', 'N/A'))}%")
|
| 93 |
+
print(f"✅ Correct: {res.get('correct_count', res.get('num_correct', 'N/A'))}/{len(results)}")
|
| 94 |
+
print(f"💬 Message: {res.get('message', 'N/A')}")
|
| 95 |
+
print("=" * 60)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
except requests.RequestException as e:
|
| 97 |
+
print(f"β Error submitting: {e}")
|
| 98 |
done = sum(1 for r in results if not r["submitted_answer"].startswith("AGENT ERROR"))
|
| 99 |
+
print(f"Completed locally: {done}/{len(results)}")
|
| 100 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
if __name__ == "__main__":
|
| 102 |
+
run_gaia_evaluation()
|
|
|