import os
import logging
import traceback
import gradio as gr
import requests
import pandas as pd
from smolagents import CodeAgent, tool
from smolagents.models import OpenAIServerModel
# --- Logging configuration ---
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger(__name__)

# --- Runtime constants: scoring service, model backend, credentials ---
SUBMISSION_URL = "https://agents-course-unit4-scoring.hf.space"
GITHUB_ENDPOINT = "https://models.github.ai/inference"
MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-4o-mini")

GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
if not GITHUB_TOKEN:
    # Fail fast at import time: the LLM backend is unreachable without a token.
    raise ValueError("CRITICAL: GITHUB_TOKEN environment variable not set.")
@tool
def wikipedia_lookup(page_title: str) -> str:
    """
    Fetches the summary intro text of an English Wikipedia page. Use exact titles.
    Args:
        page_title (str): The exact title of the Wikipedia page (e.g., 'Albert Einstein').
    """
    logger.info(f"Wikipedia lookup: '{page_title}'")
    # The REST API expects underscores instead of spaces in the page slug.
    slug = page_title.replace(" ", "_")
    try:
        endpoint = f"https://en.wikipedia.org/api/rest_v1/page/summary/{slug}"
        # Wikimedia asks API clients to identify themselves via User-Agent.
        ua_headers = {'User-Agent': f'GAIAgent/1.2 ({os.getenv("SPACE_ID", "unknown")})'}
        resp = requests.get(endpoint, headers=ua_headers, timeout=15)
        resp.raise_for_status()
        payload = resp.json()
        summary = payload.get("extract", "")
        if summary:
            return summary
        # No extract available: report why, using the canonical title if present.
        resolved_title = payload.get("title", page_title)
        if payload.get("type") == "disambiguation":
            return f"Wikipedia Error: '{resolved_title}' is a disambiguation page. Try a more specific title."
        return f"Wikipedia Error: Page '{resolved_title}' found but has no summary."
    except requests.exceptions.HTTPError as e:
        code = e.response.status_code
        reason = 'Page not found' if code == 404 else f'HTTP {code}'
        return f"Wikipedia Error: {reason} for '{page_title}'."
    except Exception as e:
        # Network failures, timeouts, bad JSON — surface as a tool-readable string.
        return f"Wikipedia Error: {e}"
# Agent prompt - updated to mention only Wikipedia tool.
# NOTE: this text is prepended verbatim to every question in
# run_agent_on_question(), so its wording is part of runtime behavior.
# In particular, evaluate_and_submit() parses the model output for the
# literal marker "FINAL ANSWER:" — keep that phrase intact if editing.
REACT_INSTRUCTION_PROMPT = """You are a helpful assistant using tools to answer questions.
Available Tools:
- wikipedia_lookup(page_title: str): Looks up a specific English Wikipedia page. Use exact titles (e.g., 'Berlin').
Follow these steps:
1. Thought: Plan which tool to use and why.
2. Action: Call the tool (e.g., wikipedia_lookup(page_title="...")).
3. Observation: Record the result.
4. Thought: Analyze result. If answered, prepare final answer. If not, plan next step.
5. Repeat Action/Observation/Thought until answered or determined impossible.
6. Thought: Summarize findings based ONLY on observations.
7. Final Answer: Provide the answer starting exactly with "FINAL ANSWER: " using the required format (number, short string, or comma-separated list).
Formatting Rules for FINAL ANSWER:
- Numbers: Just the number (e.g., `42`).
- Strings: Minimal words, no articles. Digits as words (e.g., `seven`).
- Lists: Comma-separated (e.g., `paris,london,three`).
Let's begin!
"""
# --- Build the LLM client and the agent around it (module-level singletons) ---
logger.info(f"Initializing LLM and agent: {MODEL_ID}")
try:
    # GitHub Models exposes an OpenAI-compatible inference endpoint.
    llm_model = OpenAIServerModel(
        model_id=MODEL_ID, api_key=GITHUB_TOKEN, api_base=GITHUB_ENDPOINT,
    )
    # Only Wikipedia tool
    agent = CodeAgent(model=llm_model, tools=[wikipedia_lookup])
    logger.info("Agent initialization complete")
except Exception as e:
    # A broken agent makes the whole app useless — abort startup loudly.
    logger.exception("CRITICAL: Agent initialization failed")
    raise RuntimeError(f"Agent initialization failed: {e}") from e
def run_agent_on_question(question: str) -> str:
    """Execute the global agent against one question and return its raw output.

    Returns a string beginning with "AGENT_ERROR: " when the question is
    blank or the agent run raises; callers detect that prefix downstream.
    """
    cleaned = question.strip()
    if not cleaned:
        return "AGENT_ERROR: Empty question"
    logger.info(f"Running agent on: '{cleaned}'")
    full_prompt = f"{REACT_INSTRUCTION_PROMPT.strip()}\n\nQUESTION: {cleaned}"
    try:
        return agent.run(full_prompt)
    except Exception as e:
        # Keep the traceback in-band so it lands in the results log.
        logger.exception("Agent run failed")
        return f"AGENT_ERROR: {e}\n{traceback.format_exc()}"
def evaluate_and_submit():
    """Fetch all questions, run the agent on each, and submit the answers.

    Returns:
        tuple[str, pd.DataFrame]: a human-readable status message and a
        DataFrame logging, per task: the question, the submitted answer,
        and the agent's full raw output. On early failure the DataFrame
        may be empty.
    """
    logger.info("🚀 Starting evaluation...")
    username = os.getenv("HF_USERNAME", "unknown_user")

    # --- Fetch questions ---
    try:
        resp = requests.get(f"{SUBMISSION_URL}/questions", timeout=20)
        # BUGFIX: check the HTTP status before parsing — previously an error
        # page would be fed straight into .json().
        resp.raise_for_status()
        questions = resp.json()
        if not isinstance(questions, list):
            raise ValueError("Invalid response format")
        # BUGFIX: this log message was a broken (line-wrapped, mojibake)
        # string literal that made the file a SyntaxError.
        logger.info(f"✅ Fetched {len(questions)} questions")
    except Exception as e:
        logger.exception("Failed to fetch questions")
        return f"❌ Error fetching questions: {e}", pd.DataFrame()
    if not questions:
        return "ℹ️ No questions received", pd.DataFrame()

    # --- Process questions ---
    results_log = []
    answers_payload = []
    for i, item in enumerate(questions):
        task_id, question_text = item.get("task_id"), item.get("question")
        if not task_id or not question_text:
            continue  # skip malformed entries rather than submitting junk
        logger.info(f"Processing Q{i+1}/{len(questions)}: ID={task_id}")
        raw_output = run_agent_on_question(question_text)
        # The prompt instructs the model to end with "FINAL ANSWER: <answer>".
        if "FINAL ANSWER:" in raw_output:
            final_answer = raw_output.split("FINAL ANSWER:", 1)[1].strip()
        elif "AGENT_ERROR:" in raw_output:
            final_answer = raw_output
        else:
            final_answer = "AGENT_ERROR: No final answer found"
        results_log.append({
            "Task ID": task_id,
            "Question": question_text,
            "Submitted Answer": final_answer,
            "Full Output": raw_output
        })
        answers_payload.append({"task_id": task_id, "submitted_answer": final_answer})
    results_df = pd.DataFrame(results_log)
    if not answers_payload:
        return "⚠️ No answers generated", results_df

    # --- Submit answers ---
    logger.info(f"Submitting {len(answers_payload)} answers...")
    space_id = os.getenv("SPACE_ID", "NA")
    agent_code_url = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id != "NA" else "NA"
    try:
        post_resp = requests.post(
            f"{SUBMISSION_URL}/submit",
            json={"username": username, "agent_code": agent_code_url, "answers": answers_payload},
            timeout=90
        )
        # BUGFIX: raise on HTTP errors so the except branch can report the
        # server's error body instead of .json() failing opaquely.
        post_resp.raise_for_status()
        response = post_resp.json()
        score = response.get('score', 'N/A')
        score_str = f"{float(score):.2f}%" if isinstance(score, (int, float)) else str(score)
        # BUGFIX: this return was a broken multi-line string literal (SyntaxError).
        return (f"✅ Success! Score: {score_str} "
                f"({response.get('correct_count','?')}/{response.get('total_attempted','?')}). "
                f"Msg: {response.get('message','')}"), results_df
    except Exception as e:
        err_msg = f"❌ Submission Failed: {e}"
        # BUGFIX: requests.Response is falsy for 4xx/5xx, so the old
        # `hasattr(e, 'response') and e.response` check dropped the server's
        # error body in exactly the cases it was meant for. Compare to None.
        err_resp = getattr(e, 'response', None)
        if err_resp is not None:
            err_msg += f" | Response: {err_resp.text[:300]}"
        return err_msg, results_df
# --- Gradio interface ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# π Agent Evaluation Runner π\nEnsure `GITHUB_TOKEN` secret is set. Click Run to start.")
    launch_btn = gr.Button("βΆοΈ Run Evaluation & Submit All Answers", variant="primary")
    status_output = gr.Textbox(label="π Status", lines=4, interactive=False)
    # Per-task log table; widths leave the most room for the raw agent output.
    log_table = gr.DataFrame(
        label="π Detailed Log",
        headers=["Task ID", "Question", "Submitted Answer", "Full Output"],
        wrap=True,
        column_widths=["10%", "25%", "20%", "45%"]
    )
    # One click drives the whole fetch -> answer -> submit pipeline.
    launch_btn.click(fn=evaluate_and_submit, outputs=[status_output, log_table])

if __name__ == "__main__":
    logger.info("Launching Gradio application...")
    demo.launch(debug=True, share=False)