"""Gradio app that runs a Wikipedia-backed smolagents agent on the course
question set and submits the answers to the scoring service.

Importing this module has side effects: it requires the ``GITHUB_TOKEN``
environment variable, builds the LLM client and agent, and constructs the
Gradio UI. The UI is only launched when the file is executed as a script.
"""
import logging
import os
import traceback
from urllib.parse import quote

import gradio as gr
import pandas as pd
import requests
from smolagents import CodeAgent, tool
from smolagents.models import OpenAIServerModel

# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)

# Constants
SUBMISSION_URL = "https://agents-course-unit4-scoring.hf.space"
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
if not GITHUB_TOKEN:
    raise ValueError("CRITICAL: GITHUB_TOKEN environment variable not set.")
GITHUB_ENDPOINT = "https://models.github.ai/inference"
MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-4o-mini")

# Marker the prompt asks the model to put in front of its answer.
_FINAL_ANSWER_MARKER = "FINAL ANSWER:"
# Prefix used by this module for every agent-side failure string.
_AGENT_ERROR_MARKER = "AGENT_ERROR:"


@tool
def wikipedia_lookup(page_title: str) -> str:
    """
    Fetches the summary intro text of an English Wikipedia page. Use exact titles.

    Args:
        page_title (str): The exact title of the Wikipedia page (e.g., 'Albert Einstein').
    """
    # Failures are returned as "Wikipedia Error: ..." strings rather than
    # raised, so the agent can read them as an observation and retry.
    if not page_title or not page_title.strip():
        return "Wikipedia Error: Empty page title."

    # Percent-encode the whole title (including '/', '?', '#') so it stays a
    # single path segment; an unencoded '/' would address a different route.
    page_safe = quote(page_title.strip().replace(" ", "_"), safe="")
    logger.info("Wikipedia lookup: '%s'", page_title)
    try:
        url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{page_safe}"
        headers = {'User-Agent': f'GAIAgent/1.2 ({os.getenv("SPACE_ID", "unknown")})'}
        r = requests.get(url, headers=headers, timeout=15)
        r.raise_for_status()
        data = r.json()

        if extract := data.get("extract", ""):
            return extract

        title = data.get("title", page_title)
        if data.get("type") == "disambiguation":
            return f"Wikipedia Error: '{title}' is a disambiguation page. Try a more specific title."
        return f"Wikipedia Error: Page '{title}' found but has no summary."
    except requests.exceptions.HTTPError as e:
        # e.response is set by raise_for_status(); getattr keeps this safe if
        # an HTTPError ever arrives without one.
        status_code = getattr(e.response, "status_code", None)
        return f"Wikipedia Error: {'Page not found' if status_code == 404 else f'HTTP {status_code}'} for '{page_title}'."
    except Exception as e:
        # Deliberately broad: any failure becomes an observation for the agent.
        return f"Wikipedia Error: {e}"


# Agent prompt - updated to mention only Wikipedia tool
REACT_INSTRUCTION_PROMPT = """You are a helpful assistant using tools to answer questions.

Available Tools:
- wikipedia_lookup(page_title: str): Looks up a specific English Wikipedia page. Use exact titles (e.g., 'Berlin').

Follow these steps:
1. Thought: Plan which tool to use and why.
2. Action: Call the tool (e.g., wikipedia_lookup(page_title="...")).
3. Observation: Record the result.
4. Thought: Analyze result. If answered, prepare final answer. If not, plan next step.
5. Repeat Action/Observation/Thought until answered or determined impossible.
6. Thought: Summarize findings based ONLY on observations.
7. Final Answer: Provide the answer starting exactly with "FINAL ANSWER: " using the required format (number, short string, or comma-separated list).

Formatting Rules for FINAL ANSWER:
- Numbers: Just the number (e.g., `42`).
- Strings: Minimal words, no articles. Digits as words (e.g., `seven`).
- Lists: Comma-separated (e.g., `paris,london,three`).

Let's begin!
"""

# Initialize LLM and agent
logger.info("Initializing LLM and agent: %s", MODEL_ID)
try:
    llm_model = OpenAIServerModel(
        model_id=MODEL_ID,
        api_key=GITHUB_TOKEN,
        api_base=GITHUB_ENDPOINT,
    )
    agent = CodeAgent(
        tools=[wikipedia_lookup],  # Only Wikipedia tool
        model=llm_model,
    )
    logger.info("Agent initialization complete")
except Exception as e:
    logger.exception("CRITICAL: Agent initialization failed")
    raise RuntimeError(f"Agent initialization failed: {e}") from e


def run_agent_on_question(question: str) -> str:
    """Run the agent on a question and return its output as a string.

    Args:
        question: The question text. Surrounding whitespace is ignored.

    Returns:
        The agent's output converted to ``str``. If the question is empty, or
        the agent raises, a string starting with ``"AGENT_ERROR:"`` is
        returned instead (the latter includes the formatted traceback).
    """
    question = (question or "").strip()
    if not question:
        return "AGENT_ERROR: Empty question"

    logger.info("Running agent on: '%s'", question)
    try:
        result = agent.run(f"{REACT_INSTRUCTION_PROMPT.strip()}\n\nQUESTION: {question}")
    except Exception as e:
        logger.exception("Agent run failed")
        return f"AGENT_ERROR: {e}\n{traceback.format_exc()}"
    # The agent may hand back a non-string (e.g. a number); callers do
    # substring checks on the result, so normalise it here.
    return str(result)


def _extract_final_answer(raw_output: str) -> str:
    """Reduce raw agent output to the answer string that gets submitted."""
    if _FINAL_ANSWER_MARKER in raw_output:
        # Keep only what follows the first marker.
        return raw_output.partition(_FINAL_ANSWER_MARKER)[2].strip()
    if _AGENT_ERROR_MARKER in raw_output:
        return raw_output
    # No marker: the agent's return value is already its final answer, so
    # submit it as-is rather than replacing it with an error string.
    return raw_output.strip() or "AGENT_ERROR: No final answer found"


def evaluate_and_submit():
    """Fetch all questions, run the agent on each, and submit the answers.

    Returns:
        A ``(status_message, results_dataframe)`` tuple. The dataframe has the
        columns ``Task ID``, ``Question``, ``Submitted Answer`` and
        ``Full Output``; it is empty when no question could be processed.
        Network and parsing failures are reported in the status message
        rather than raised.
    """
    logger.info("🚀 Starting evaluation...")
    username = os.getenv("HF_USERNAME", "unknown_user")

    # Fetch questions
    try:
        questions_response = requests.get(f"{SUBMISSION_URL}/questions", timeout=20)
        # Surface HTTP errors instead of parsing an error body as questions.
        questions_response.raise_for_status()
        questions = questions_response.json()
        if not isinstance(questions, list):
            raise ValueError("Invalid response format")
        logger.info("✅ Fetched %d questions", len(questions))
    except Exception as e:
        logger.exception("Failed to fetch questions")
        return f"❌ Error fetching questions: {e}", pd.DataFrame()

    if not questions:
        return "ℹ️ No questions received", pd.DataFrame()

    # Process questions
    results_log = []
    answers_payload = []

    for i, item in enumerate(questions):
        if not isinstance(item, dict):
            # Malformed entry; skip it like an entry with missing fields.
            continue
        task_id, question_text = item.get("task_id"), item.get("question")
        if not task_id or not question_text:
            continue

        logger.info("Processing Q%d/%d: ID=%s", i + 1, len(questions), task_id)
        raw_output = run_agent_on_question(question_text)
        final_answer = _extract_final_answer(raw_output)

        results_log.append({
            "Task ID": task_id,
            "Question": question_text,
            "Submitted Answer": final_answer,
            "Full Output": raw_output,
        })
        answers_payload.append({"task_id": task_id, "submitted_answer": final_answer})

    results_df = pd.DataFrame(results_log)
    if not answers_payload:
        return "⚠️ No answers generated", results_df

    # Submit answers
    logger.info("Submitting %d answers...", len(answers_payload))
    space_id = os.getenv("SPACE_ID", "NA")
    agent_code_url = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id != "NA" else "NA"

    try:
        submit_response = requests.post(
            f"{SUBMISSION_URL}/submit",
            json={"username": username, "agent_code": agent_code_url, "answers": answers_payload},
            timeout=90,
        )
        submit_response.raise_for_status()
        response = submit_response.json()

        score = response.get('score', 'N/A')
        score_str = f"{float(score):.2f}%" if isinstance(score, (int, float)) else str(score)

        return (f"✅ Success! Score: {score_str} "
                f"({response.get('correct_count','?')}/{response.get('total_attempted','?')}). "
                f"Msg: {response.get('message','')}"), results_df
    except Exception as e:
        err_msg = f"❌ Submission Failed: {e}"
        # Compare against None explicitly: a requests.Response is falsy for
        # 4xx/5xx statuses, which is exactly when its body is worth showing.
        failed_response = getattr(e, "response", None)
        if failed_response is not None:
            err_msg += f" | Response: {failed_response.text[:300]}"
        return err_msg, results_df


# Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 Agent Evaluation Runner 🚀\nEnsure `GITHUB_TOKEN` secret is set. Click Run to start.")
    run_button = gr.Button("▶️ Run Evaluation & Submit All Answers", variant="primary")
    status_box = gr.Textbox(label="📊 Status", lines=4, interactive=False)
    results_display = gr.DataFrame(
        label="📋 Detailed Log",
        headers=["Task ID", "Question", "Submitted Answer", "Full Output"],
        wrap=True,
        column_widths=["10%", "25%", "20%", "45%"],
    )
    run_button.click(fn=evaluate_and_submit, outputs=[status_box, results_display])

if __name__ == "__main__":
    logger.info("Launching Gradio application...")
    demo.launch(debug=True, share=False)