# pmeyhoefer's picture
# Update app.py
# 35bdc51 verified
import os
import logging
import traceback
import gradio as gr
import requests
import pandas as pd
from smolagents import CodeAgent, tool
from smolagents.models import OpenAIServerModel
# Logging: timestamped records at INFO level and above.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Endpoints and model selection (MODEL_ID can be overridden via the environment).
SUBMISSION_URL = "https://agents-course-unit4-scoring.hf.space"
GITHUB_ENDPOINT = "https://models.github.ai/inference"
MODEL_ID = os.environ.get("MODEL_ID", "openai/gpt-4o-mini")

# Credentials: the token is mandatory, so fail at import time when it is
# missing or empty rather than on the first model call.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
if not GITHUB_TOKEN:
    raise ValueError("CRITICAL: GITHUB_TOKEN environment variable not set.")
@tool
def wikipedia_lookup(page_title: str) -> str:
    """
    Fetches the summary intro text of an English Wikipedia page. Use exact titles.

    Args:
        page_title (str): The exact title of the Wikipedia page (e.g., 'Albert Einstein').
    """
    # NOTE(review): the docstring above is presumably consumed by the `@tool`
    # decorator to describe the tool, so its wording is intentionally kept.
    #
    # Result contract: on success the page's "extract" text is returned. Every
    # failure (HTTP error, network error, missing summary, disambiguation page)
    # is reported as a string starting with "Wikipedia Error:" instead of being
    # raised.
    from urllib.parse import quote

    # Spaces become underscores, then the whole title is percent-encoded as a
    # single path segment. Without this, titles containing '/', '?', '#' or '%'
    # (e.g. 'AC/DC') would change the URL structure and hit the wrong endpoint.
    page_safe = quote(page_title.replace(" ", "_"), safe="")
    logger.info(f"Wikipedia lookup: '{page_title}'")
    try:
        url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{page_safe}"
        headers = {'User-Agent': f'GAIAgent/1.2 ({os.getenv("SPACE_ID", "unknown")})'}
        r = requests.get(url, headers=headers, timeout=15)
        r.raise_for_status()
        data = r.json()
        if extract := data.get("extract", ""):
            return extract
        # No summary text: explain why, using the canonical title when provided.
        title = data.get("title", page_title)
        if data.get("type") == "disambiguation":
            return f"Wikipedia Error: '{title}' is a disambiguation page. Try a more specific title."
        return f"Wikipedia Error: Page '{title}' found but has no summary."
    except requests.exceptions.HTTPError as e:
        # Guard against an HTTPError that carries no response object.
        status_code = getattr(e.response, "status_code", "unknown")
        return f"Wikipedia Error: {'Page not found' if status_code == 404 else f'HTTP {status_code}'} for '{page_title}'."
    except Exception as e:
        # Network failures, invalid JSON, unexpected payload shapes, etc.
        return f"Wikipedia Error: {e}"
# Agent prompt - mentions only the Wikipedia tool.
# run_agent_on_question() prepends this text (stripped) to every question.
# It asks for a reply that starts with "FINAL ANSWER: ", which is the marker
# evaluate_and_submit() looks for when extracting the answer to submit.
REACT_INSTRUCTION_PROMPT = """You are a helpful assistant using tools to answer questions.
Available Tools:
- wikipedia_lookup(page_title: str): Looks up a specific English Wikipedia page. Use exact titles (e.g., 'Berlin').
Follow these steps:
1. Thought: Plan which tool to use and why.
2. Action: Call the tool (e.g., wikipedia_lookup(page_title="...")).
3. Observation: Record the result.
4. Thought: Analyze result. If answered, prepare final answer. If not, plan next step.
5. Repeat Action/Observation/Thought until answered or determined impossible.
6. Thought: Summarize findings based ONLY on observations.
7. Final Answer: Provide the answer starting exactly with "FINAL ANSWER: " using the required format (number, short string, or comma-separated list).
Formatting Rules for FINAL ANSWER:
- Numbers: Just the number (e.g., `42`).
- Strings: Minimal words, no articles. Digits as words (e.g., `seven`).
- Lists: Comma-separated (e.g., `paris,london,three`).
Let's begin!
"""
# Build the model client and the agent once, when the module is imported.
# Any construction failure is logged with its traceback and re-raised as a
# RuntimeError chained to the original exception.
logger.info(f"Initializing LLM and agent: {MODEL_ID}")
try:
    llm_model = OpenAIServerModel(
        model_id=MODEL_ID,
        api_key=GITHUB_TOKEN,
        api_base=GITHUB_ENDPOINT,
    )
    # Only the Wikipedia tool is registered.
    agent = CodeAgent(tools=[wikipedia_lookup], model=llm_model)
except Exception as init_error:
    logger.exception("CRITICAL: Agent initialization failed")
    raise RuntimeError(f"Agent initialization failed: {init_error}") from init_error
else:
    logger.info("Agent initialization complete")
def run_agent_on_question(question: str) -> str:
    """Run the agent on a single question and return its output as text.

    Args:
        question: The question text. ``None``, empty, or whitespace-only
            input is rejected without calling the agent.

    Returns:
        The agent's result converted to ``str``, or a string starting with
        ``"AGENT_ERROR:"`` when the question is empty or the agent raises
        (in that case the message includes the formatted traceback).
    """
    # Treat None like an empty question instead of crashing on `.strip()`.
    question = (question or "").strip()
    if not question:
        return "AGENT_ERROR: Empty question"
    logger.info(f"Running agent on: '{question}'")
    try:
        result = agent.run(f"{REACT_INSTRUCTION_PROMPT.strip()}\n\nQUESTION: {question}")
    except Exception as e:
        logger.exception("Agent run failed")
        return f"AGENT_ERROR: {e}\n{traceback.format_exc()}"
    # The agent may hand back a non-string final answer (e.g. a number).
    # Callers do substring checks on the result, so always return text.
    return str(result)
def _extract_final_answer(raw_output: str) -> str:
    """Return the text to submit for one raw agent output."""
    marker = "FINAL ANSWER:"
    if marker in raw_output:
        # Keep everything after the first marker occurrence.
        return raw_output.split(marker, 1)[1].strip()
    if "AGENT_ERROR:" in raw_output:
        # Agent-side failures are submitted as-is.
        return raw_output
    return "AGENT_ERROR: No final answer found"


def evaluate_and_submit():
    """Fetch all questions, run the agent on each, and submit the answers.

    Returns:
        A ``(status_message, results_df)`` tuple. ``status_message`` is a
        human-readable summary (success with score, or the failure reason).
        ``results_df`` is a ``pandas.DataFrame`` with the columns "Task ID",
        "Question", "Submitted Answer" and "Full Output"; it is empty when
        the questions could not be fetched or none were received.
    """
    logger.info("🚀 Starting evaluation...")
    username = os.getenv("HF_USERNAME", "unknown_user")

    # Fetch questions
    try:
        fetch_response = requests.get(f"{SUBMISSION_URL}/questions", timeout=20)
        # Surface HTTP errors directly instead of parsing an error body.
        fetch_response.raise_for_status()
        questions = fetch_response.json()
        if not isinstance(questions, list):
            raise ValueError("Invalid response format")
        logger.info(f"✅ Fetched {len(questions)} questions")
    except Exception as e:
        logger.exception("Failed to fetch questions")
        return f"❌ Error fetching questions: {e}", pd.DataFrame()
    if not questions:
        return "ℹ️ No questions received", pd.DataFrame()

    # Process questions
    results_log = []
    answers_payload = []
    for i, item in enumerate(questions):
        if not isinstance(item, dict):
            logger.warning(f"Skipping Q{i+1}/{len(questions)}: unexpected item type")
            continue
        task_id, question_text = item.get("task_id"), item.get("question")
        if not task_id or not question_text:
            logger.warning(f"Skipping Q{i+1}/{len(questions)}: missing task_id or question")
            continue
        logger.info(f"Processing Q{i+1}/{len(questions)}: ID={task_id}")
        # Coerce to text so the substring checks below cannot raise TypeError
        # when the agent returns a non-string result.
        raw_output = str(run_agent_on_question(question_text))
        final_answer = _extract_final_answer(raw_output)
        results_log.append({
            "Task ID": task_id,
            "Question": question_text,
            "Submitted Answer": final_answer,
            "Full Output": raw_output,
        })
        answers_payload.append({"task_id": task_id, "submitted_answer": final_answer})

    results_df = pd.DataFrame(results_log)
    if not answers_payload:
        return "⚠️ No answers generated", results_df

    # Submit answers
    logger.info(f"Submitting {len(answers_payload)} answers...")
    space_id = os.getenv("SPACE_ID", "NA")
    agent_code_url = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id != "NA" else "NA"
    try:
        submit_response = requests.post(
            f"{SUBMISSION_URL}/submit",
            json={"username": username, "agent_code": agent_code_url, "answers": answers_payload},
            timeout=90,
        )
        # Without this check a 4xx/5xx reply carrying a JSON body would be
        # reported as a success with score "N/A".
        submit_response.raise_for_status()
        response = submit_response.json()
        score = response.get('score', 'N/A')
        score_str = f"{float(score):.2f}%" if isinstance(score, (int, float)) else str(score)
        return (f"✅ Success! Score: {score_str} "
                f"({response.get('correct_count','?')}/{response.get('total_attempted','?')}). "
                f"Msg: {response.get('message','')}"), results_df
    except Exception as e:
        logger.exception("Submission failed")
        err_msg = f"❌ Submission Failed: {e}"
        # Compare against None explicitly: a requests Response is falsy for
        # 4xx/5xx statuses, which is exactly when its text is useful here.
        error_response = getattr(e, "response", None)
        if error_response is not None:
            err_msg += f" | Response: {error_response.text[:300]}"
        return err_msg, results_df
# Gradio interface: a single button runs the full evaluation and shows the
# status message plus the per-question log returned by evaluate_and_submit().
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 Agent Evaluation Runner 🚀\nEnsure `GITHUB_TOKEN` secret is set. Click Run to start.")
    run_button = gr.Button("▶️ Run Evaluation & Submit All Answers", variant="primary")
    status_box = gr.Textbox(label="📊 Status", lines=4, interactive=False)
    results_display = gr.DataFrame(
        label="📋 Detailed Log",
        # Column names match the keys of the rows built in evaluate_and_submit().
        headers=["Task ID", "Question", "Submitted Answer", "Full Output"],
        wrap=True,
        column_widths=["10%", "25%", "20%", "45%"],
    )
    # The handler takes no inputs; its two return values fill the two outputs.
    run_button.click(fn=evaluate_and_submit, outputs=[status_box, results_display])

if __name__ == "__main__":
    logger.info("Launching Gradio application...")
    demo.launch(debug=True, share=False)