|
|
import os |
|
|
import logging |
|
|
import traceback |
|
|
import gradio as gr |
|
|
import requests |
|
|
import pandas as pd |
|
|
from smolagents import CodeAgent, tool |
|
|
from smolagents.models import OpenAIServerModel |
|
|
|
|
|
|
|
|
# Configure application-wide logging once at import time so every record
# carries a timestamp and level name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger(__name__)  # module-scoped logger, stdlib convention
|
|
|
|
|
|
|
|
# Base URL of the scoring service used to fetch questions and submit answers.
SUBMISSION_URL = "https://agents-course-unit4-scoring.hf.space"

# Token for the GitHub Models inference endpoint; required for the LLM client.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
if not GITHUB_TOKEN:
    # Fail fast at import time: nothing below can work without credentials.
    raise ValueError("CRITICAL: GITHUB_TOKEN environment variable not set.")

# OpenAI-compatible inference endpoint (GitHub Models).
GITHUB_ENDPOINT = "https://models.github.ai/inference"
# Model identifier; overridable via the MODEL_ID environment variable.
MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-4o-mini")
|
|
|
|
|
@tool
def wikipedia_lookup(page_title: str) -> str:
    """
    Fetches the summary intro text of an English Wikipedia page. Use exact titles.

    Args:
        page_title (str): The exact title of the Wikipedia page (e.g., 'Albert Einstein').
    """
    # Wikipedia REST API expects underscores in place of spaces.
    safe_title = page_title.replace(" ", "_")
    logger.info(f"Wikipedia lookup: '{page_title}'")
    try:
        endpoint = f"https://en.wikipedia.org/api/rest_v1/page/summary/{safe_title}"
        # Wikipedia's API policy asks for an identifying User-Agent.
        request_headers = {'User-Agent': f'GAIAgent/1.2 ({os.getenv("SPACE_ID", "unknown")})'}
        response = requests.get(endpoint, headers=request_headers, timeout=15)
        response.raise_for_status()
        payload = response.json()

        summary = payload.get("extract", "")
        if summary:
            return summary

        # No extract: distinguish disambiguation pages from genuinely empty ones.
        resolved_title = payload.get("title", page_title)
        if payload.get("type") == "disambiguation":
            return f"Wikipedia Error: '{resolved_title}' is a disambiguation page. Try a more specific title."
        return f"Wikipedia Error: Page '{resolved_title}' found but has no summary."
    except requests.exceptions.HTTPError as e:
        code = e.response.status_code
        return f"Wikipedia Error: {'Page not found' if code == 404 else f'HTTP {code}'} for '{page_title}'."
    except Exception as e:
        # Tool errors are returned as strings so the agent can observe them.
        return f"Wikipedia Error: {e}"
|
|
|
|
|
|
|
|
# ReAct-style instruction prompt prepended to every question before it is
# handed to the agent. Downstream code parses the agent output for a line
# beginning with "FINAL ANSWER: ", so that contract must stay in sync here.
REACT_INSTRUCTION_PROMPT = """You are a helpful assistant using tools to answer questions.
Available Tools:
- wikipedia_lookup(page_title: str): Looks up a specific English Wikipedia page. Use exact titles (e.g., 'Berlin').
Follow these steps:
1. Thought: Plan which tool to use and why.
2. Action: Call the tool (e.g., wikipedia_lookup(page_title="...")).
3. Observation: Record the result.
4. Thought: Analyze result. If answered, prepare final answer. If not, plan next step.
5. Repeat Action/Observation/Thought until answered or determined impossible.
6. Thought: Summarize findings based ONLY on observations.
7. Final Answer: Provide the answer starting exactly with "FINAL ANSWER: " using the required format (number, short string, or comma-separated list).
Formatting Rules for FINAL ANSWER:
- Numbers: Just the number (e.g., `42`).
- Strings: Minimal words, no articles. Digits as words (e.g., `seven`).
- Lists: Comma-separated (e.g., `paris,london,three`).
Let's begin!
"""
|
|
|
|
|
|
|
|
logger.info(f"Initializing LLM and agent: {MODEL_ID}")
try:
    # OpenAI-compatible client pointed at the GitHub Models endpoint.
    llm_model = OpenAIServerModel(
        model_id=MODEL_ID,
        api_key=GITHUB_TOKEN,
        api_base=GITHUB_ENDPOINT,
    )
    # CodeAgent with the single Wikipedia tool defined above.
    agent = CodeAgent(tools=[wikipedia_lookup], model=llm_model)
    logger.info("Agent initialization complete")
except Exception as e:
    # Abort startup: the app is useless without a working agent.
    logger.exception("CRITICAL: Agent initialization failed")
    raise RuntimeError(f"Agent initialization failed: {e}") from e
|
|
|
|
|
def run_agent_on_question(question: str) -> str:
    """Run the agent on a single question and return its raw output.

    Blank questions and any exception raised during the run are reported
    as a string prefixed with ``AGENT_ERROR:`` rather than raised.
    """
    cleaned = question.strip()
    if not cleaned:
        return "AGENT_ERROR: Empty question"

    logger.info(f"Running agent on: '{cleaned}'")
    try:
        # Prepend the ReAct instructions so the agent follows the protocol.
        prompt = f"{REACT_INSTRUCTION_PROMPT.strip()}\n\nQUESTION: {cleaned}"
        return agent.run(prompt)
    except Exception as e:
        logger.exception("Agent run failed")
        return f"AGENT_ERROR: {e}\n{traceback.format_exc()}"
|
|
|
|
|
def _extract_final_answer(raw_output: str) -> str:
    """Extract the text after 'FINAL ANSWER:' from the agent's raw output.

    Agent errors are passed through verbatim; any other output lacking the
    marker is flagged so the submission payload is always well-formed.
    """
    if "FINAL ANSWER:" in raw_output:
        return raw_output.split("FINAL ANSWER:", 1)[1].strip()
    if "AGENT_ERROR:" in raw_output:
        return raw_output
    return "AGENT_ERROR: No final answer found"


def evaluate_and_submit():
    """Fetch all questions, run the agent on each, and submit the answers.

    Returns:
        tuple[str, pd.DataFrame]: A human-readable status message and a
        DataFrame logging task id, question, submitted answer, and the
        agent's full raw output for every processed question.
    """
    logger.info("🚀 Starting evaluation...")
    username = os.getenv("HF_USERNAME", "unknown_user")

    # --- Fetch questions -------------------------------------------------
    try:
        resp = requests.get(f"{SUBMISSION_URL}/questions", timeout=20)
        # Fail on HTTP errors instead of trying to JSON-decode an error page.
        resp.raise_for_status()
        questions = resp.json()
        if not isinstance(questions, list):
            raise ValueError("Invalid response format")
        logger.info(f"✅ Fetched {len(questions)} questions")
    except Exception as e:
        logger.exception("Failed to fetch questions")
        return f"❌ Error fetching questions: {e}", pd.DataFrame()

    if not questions:
        return "ℹ️ No questions received", pd.DataFrame()

    # --- Run the agent on every question ---------------------------------
    results_log = []
    answers_payload = []
    for i, item in enumerate(questions):
        task_id, question_text = item.get("task_id"), item.get("question")
        if not task_id or not question_text:
            # Skip malformed entries rather than failing the whole run.
            continue

        logger.info(f"Processing Q{i+1}/{len(questions)}: ID={task_id}")
        raw_output = run_agent_on_question(question_text)
        final_answer = _extract_final_answer(raw_output)

        results_log.append({
            "Task ID": task_id,
            "Question": question_text,
            "Submitted Answer": final_answer,
            "Full Output": raw_output,
        })
        answers_payload.append({"task_id": task_id, "submitted_answer": final_answer})

    results_df = pd.DataFrame(results_log)
    if not answers_payload:
        return "⚠️ No answers generated", results_df

    # --- Submit ----------------------------------------------------------
    logger.info(f"Submitting {len(answers_payload)} answers...")
    space_id = os.getenv("SPACE_ID", "NA")
    agent_code_url = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id != "NA" else "NA"

    try:
        submit_resp = requests.post(
            f"{SUBMISSION_URL}/submit",
            json={"username": username, "agent_code": agent_code_url, "answers": answers_payload},
            timeout=90,
        )
        # Surface HTTP failures as HTTPError (caught below) rather than
        # returning a confusing JSON-decode error.
        submit_resp.raise_for_status()
        response = submit_resp.json()

        score = response.get('score', 'N/A')
        score_str = f"{float(score):.2f}%" if isinstance(score, (int, float)) else str(score)
        return (f"✅ Success! Score: {score_str} "
                f"({response.get('correct_count','?')}/{response.get('total_attempted','?')}). "
                f"Msg: {response.get('message','')}"), results_df
    except Exception as e:
        err_msg = f"❌ Submission Failed: {e}"
        # NOTE: requests.Response is falsy for 4xx/5xx status codes, so a
        # plain truthiness check would drop the body exactly when it matters.
        if getattr(e, 'response', None) is not None:
            err_msg += f" | Response: {e.response.text[:300]}"
        return err_msg, results_df
|
|
|
|
|
|
|
|
# --- Gradio UI: one button triggers the full evaluate-and-submit run ------
# (Original labels contained mojibake-garbled emoji; repaired here.)
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 Agent Evaluation Runner 🚀\nEnsure `GITHUB_TOKEN` secret is set. Click Run to start.")
    run_button = gr.Button("▶️ Run Evaluation & Submit All Answers", variant="primary")
    # Status message returned as the first element of evaluate_and_submit().
    status_box = gr.Textbox(label="📊 Status", lines=4, interactive=False)
    # Per-question log DataFrame returned as the second element.
    results_display = gr.DataFrame(
        label="📋 Detailed Log",
        headers=["Task ID", "Question", "Submitted Answer", "Full Output"],
        wrap=True,
        column_widths=["10%", "25%", "20%", "45%"],
    )
    run_button.click(fn=evaluate_and_submit, outputs=[status_box, results_display])
|
|
|
|
|
if __name__ == "__main__":
    # Entry point: start the Gradio server (blocking call).
    logger.info("Launching Gradio application...")
    demo.launch(share=False, debug=True)
|
|
|