File size: 7,683 Bytes
10e9b7d
475f56b
009368a
10e9b7d
eccf8e4
3c4371f
35bdc51
a0349ea
475f56b
d7730f0
009368a
475f56b
70658cb
d7730f0
009368a
a0349ea
 
 
 
 
475f56b
70658cb
009368a
d876082
 
 
 
 
 
009368a
d7730f0
70658cb
81d72bd
fcc0bb0
15fa167
 
d2d0f74
d7730f0
 
009368a
d7730f0
 
 
 
 
d2d0f74
d7730f0
 
70658cb
d7730f0
009368a
35bdc51
fcc0bb0
009368a
fcc0bb0
15fa167
fcc0bb0
35bdc51
15fa167
fcc0bb0
 
15fa167
 
009368a
15fa167
 
 
009368a
475f56b
ad7b1a7
d7730f0
 
475f56b
a0349ea
d2d0f74
a0349ea
 
475f56b
d7730f0
009368a
35bdc51
009368a
 
d7730f0
009368a
d7730f0
 
009368a
 
d7730f0
009368a
d7730f0
 
009368a
d7730f0
009368a
d7730f0
009368a
d7730f0
 
15fa167
 
d7730f0
 
a442fc4
d7730f0
 
31243f4
d7730f0
 
 
 
e777122
81d72bd
fcc0bb0
0cf07a2
d7730f0
 
81d72bd
d7730f0
009368a
 
d7730f0
81d72bd
d7730f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
009368a
 
 
d7730f0
 
009368a
d7730f0
15fa167
d7730f0
 
 
e80aab9
d7730f0
 
 
 
 
 
 
 
 
 
 
e777122
15fa167
d7730f0
 
15fa167
009368a
d7730f0
009368a
a0349ea
009368a
d7730f0
 
 
 
 
 
 
 
009368a
475f56b
009368a
fcc0bb0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import logging
import os
import traceback
import urllib.parse

import gradio as gr
import pandas as pd
import requests
from smolagents import CodeAgent, tool
from smolagents.models import OpenAIServerModel

# Configure root logging once at import; all module loggers inherit this format.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)

# Constants
# Base URL of the HF Agents-course scoring service (questions + submission).
SUBMISSION_URL = "https://agents-course-unit4-scoring.hf.space"
# Token used as the API key for the GitHub Models inference endpoint below.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
if not GITHUB_TOKEN:
    # Fail fast at import time: nothing in this app works without credentials.
    raise ValueError("CRITICAL: GITHUB_TOKEN environment variable not set.")
GITHUB_ENDPOINT = "https://models.github.ai/inference"
# Model is overridable via env; defaults to GPT-4o-mini on GitHub Models.
MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-4o-mini")

@tool
def wikipedia_lookup(page_title: str) -> str:
    """
    Fetches the summary intro text of an English Wikipedia page. Use exact titles.
    
    Args:
        page_title (str): The exact title of the Wikipedia page (e.g., 'Albert Einstein').
    """
    # The REST API uses underscores for spaces; percent-encode everything else so
    # titles containing '/', '?', '&', '#' or non-ASCII characters (e.g. 'AC/DC')
    # don't corrupt the URL path. A bare .replace(" ", "_") is not URL-safe.
    page_safe = urllib.parse.quote(page_title.replace(" ", "_"), safe="")
    logger.info(f"Wikipedia lookup: '{page_title}'")
    try:
        url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{page_safe}"
        # Wikimedia asks API clients to send an identifying User-Agent.
        headers = {'User-Agent': f'GAIAgent/1.2 ({os.getenv("SPACE_ID", "unknown")})'}
        r = requests.get(url, headers=headers, timeout=15)
        r.raise_for_status()
        data = r.json()
        
        # Happy path: the summary endpoint returns the intro text in "extract".
        if extract := data.get("extract", ""):
            return extract
        
        title = data.get("title", page_title)
        if data.get("type") == "disambiguation":
            return f"Wikipedia Error: '{title}' is a disambiguation page. Try a more specific title."
        return f"Wikipedia Error: Page '{title}' found but has no summary."
    except requests.exceptions.HTTPError as e:
        # raise_for_status() guarantees e.response is set here.
        status_code = e.response.status_code
        return f"Wikipedia Error: {'Page not found' if status_code == 404 else f'HTTP {status_code}'} for '{page_title}'."
    except Exception as e:
        # Errors are returned as strings (not raised) so the agent can observe
        # and react to them instead of crashing the run.
        return f"Wikipedia Error: {e}"

# ReAct-style system prompt injected before every question. The text below is
# runtime behavior (the model reads it verbatim) — edit with care, especially
# the "FINAL ANSWER: " marker, which evaluate_and_submit() parses for.
REACT_INSTRUCTION_PROMPT = """You are a helpful assistant using tools to answer questions.
Available Tools:
- wikipedia_lookup(page_title: str): Looks up a specific English Wikipedia page. Use exact titles (e.g., 'Berlin').
Follow these steps:
1. Thought: Plan which tool to use and why.
2. Action: Call the tool (e.g., wikipedia_lookup(page_title="...")).
3. Observation: Record the result.
4. Thought: Analyze result. If answered, prepare final answer. If not, plan next step.
5. Repeat Action/Observation/Thought until answered or determined impossible.
6. Thought: Summarize findings based ONLY on observations.
7. Final Answer: Provide the answer starting exactly with "FINAL ANSWER: " using the required format (number, short string, or comma-separated list).
Formatting Rules for FINAL ANSWER:
- Numbers: Just the number (e.g., `42`).
- Strings: Minimal words, no articles. Digits as words (e.g., `seven`).
- Lists: Comma-separated (e.g., `paris,london,three`).
Let's begin!
"""

# Initialize LLM and agent at import time so a misconfiguration (bad token,
# unreachable endpoint) aborts startup instead of failing per-question later.
logger.info(f"Initializing LLM and agent: {MODEL_ID}")
try:
    # GitHub Models exposes an OpenAI-compatible API, so the GitHub token is
    # passed as the OpenAI api_key against the GitHub inference base URL.
    llm_model = OpenAIServerModel(
        model_id=MODEL_ID,
        api_key=GITHUB_TOKEN,
        api_base=GITHUB_ENDPOINT
    )
    
    agent = CodeAgent(
        tools=[wikipedia_lookup],  # Only Wikipedia tool
        model=llm_model
    )
    logger.info("Agent initialization complete")
except Exception as e:
    logger.exception("CRITICAL: Agent initialization failed")
    # Re-raise with context so the Space shows a clear startup failure.
    raise RuntimeError(f"Agent initialization failed: {e}") from e

def run_agent_on_question(question: str) -> str:
    """Run the module-level agent on one question and return its raw output.

    Returns the agent's answer text, or a string prefixed with
    "AGENT_ERROR:" when the question is empty or the run raises.
    """
    question = question.strip()
    if not question:
        return "AGENT_ERROR: Empty question"

    logger.info(f"Running agent on: '{question}'")
    prompt = f"{REACT_INSTRUCTION_PROMPT.strip()}\n\nQUESTION: {question}"
    try:
        result = agent.run(prompt)
    except Exception as e:
        # Errors are reported in-band so the caller can log them per task.
        logger.exception("Agent run failed")
        return f"AGENT_ERROR: {e}\n{traceback.format_exc()}"
    return result

def _extract_final_answer(raw_output: str) -> str:
    """Extract the text after the 'FINAL ANSWER:' marker from raw agent output.

    Falls back to the raw error string for agent failures, or to a generic
    error marker when the agent never produced a final answer.
    """
    if "FINAL ANSWER:" in raw_output:
        return raw_output.split("FINAL ANSWER:", 1)[1].strip()
    if "AGENT_ERROR:" in raw_output:
        return raw_output
    return "AGENT_ERROR: No final answer found"


def evaluate_and_submit():
    """Fetch all questions, run the agent on each, submit answers, and report.

    Returns:
        A (status_message, results_dataframe) tuple suitable for the Gradio
        outputs: the message summarizes success/failure, and the DataFrame
        logs every processed question and answer.
    """
    logger.info("πŸš€ Starting evaluation...")
    username = os.getenv("HF_USERNAME", "unknown_user")
    
    # Fetch questions
    try:
        resp = requests.get(f"{SUBMISSION_URL}/questions", timeout=20)
        # Surface HTTP errors directly instead of a confusing JSON decode error
        # when the server returns an error page.
        resp.raise_for_status()
        questions = resp.json()
        if not isinstance(questions, list):
            raise ValueError("Invalid response format")
        logger.info(f"βœ… Fetched {len(questions)} questions")
    except Exception as e:
        logger.exception("Failed to fetch questions")
        return f"❌ Error fetching questions: {e}", pd.DataFrame()

    if not questions:
        return "ℹ️ No questions received", pd.DataFrame()

    # Process questions
    results_log = []
    answers_payload = []
    
    for i, item in enumerate(questions):
        task_id, question_text = item.get("task_id"), item.get("question")
        if not task_id or not question_text:
            # Skip malformed entries rather than aborting the whole run.
            continue
            
        logger.info(f"Processing Q{i+1}/{len(questions)}: ID={task_id}")
        raw_output = run_agent_on_question(question_text)
        final_answer = _extract_final_answer(raw_output)
            
        results_log.append({
            "Task ID": task_id,
            "Question": question_text,
            "Submitted Answer": final_answer,
            "Full Output": raw_output
        })
        answers_payload.append({"task_id": task_id, "submitted_answer": final_answer})

    results_df = pd.DataFrame(results_log)
    if not answers_payload:
        return "⚠️ No answers generated", results_df

    # Submit answers
    logger.info(f"Submitting {len(answers_payload)} answers...")
    space_id = os.getenv("SPACE_ID", "NA")
    agent_code_url = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id != "NA" else "NA"
    
    try:
        http_resp = requests.post(
            f"{SUBMISSION_URL}/submit",
            json={"username": username, "agent_code": agent_code_url, "answers": answers_payload},
            timeout=90
        )
        # A failed POST now reports its HTTP status instead of a JSON error.
        http_resp.raise_for_status()
        response = http_resp.json()
        
        score = response.get('score', 'N/A')
        score_str = f"{float(score):.2f}%" if isinstance(score, (int, float)) else str(score)
        return (f"βœ… Success! Score: {score_str} "
                f"({response.get('correct_count','?')}/{response.get('total_attempted','?')}). "
                f"Msg: {response.get('message','')}"), results_df
    except Exception as e:
        err_msg = f"❌ Submission Failed: {e}"
        # BUGFIX: must compare against None — requests.Response.__bool__ returns
        # response.ok, so an error response is falsy and the truthiness check
        # `and e.response:` silently dropped the server's error body.
        if hasattr(e, 'response') and e.response is not None:
            err_msg += f" | Response: {e.response.text[:300]}"
        return err_msg, results_df

# Gradio interface: a single button triggers the full evaluate-and-submit run;
# evaluate_and_submit() returns (status_message, results_dataframe), matching
# the two outputs wired below.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# πŸš€ Agent Evaluation Runner πŸš€\nEnsure `GITHUB_TOKEN` secret is set. Click Run to start.")
    run_button = gr.Button("▢️ Run Evaluation & Submit All Answers", variant="primary")
    status_box = gr.Textbox(label="πŸ“Š Status", lines=4, interactive=False)
    # Column headers mirror the dict keys built in evaluate_and_submit().
    results_display = gr.DataFrame(
        label="πŸ“‹ Detailed Log", 
        headers=["Task ID", "Question", "Submitted Answer", "Full Output"],
        wrap=True, 
        column_widths=["10%", "25%", "20%", "45%"]
    )
    run_button.click(fn=evaluate_and_submit, outputs=[status_box, results_display])

if __name__ == "__main__":
    logger.info("Launching Gradio application...")
    # debug=True for verbose server logs; share=False since HF Spaces already
    # exposes the app publicly.
    demo.launch(debug=True, share=False)