File size: 10,580 Bytes
93b72dc
 
 
 
 
 
 
 
d1dcd56
93b72dc
d1dcd56
 
 
93b72dc
 
 
 
 
 
 
d1dcd56
 
 
93b72dc
 
 
d1dcd56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93b72dc
 
 
d1dcd56
 
 
 
93b72dc
 
 
 
d1dcd56
93b72dc
 
 
d1dcd56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93b72dc
 
 
d1dcd56
 
 
 
 
 
93b72dc
 
d1dcd56
 
 
 
 
 
 
 
 
 
 
 
 
 
93b72dc
d1dcd56
93b72dc
 
d1dcd56
 
 
93b72dc
 
 
d1dcd56
93b72dc
d1dcd56
93b72dc
 
 
 
d1dcd56
93b72dc
 
 
 
 
 
 
 
 
 
 
d1dcd56
93b72dc
 
 
 
d1dcd56
93b72dc
d1dcd56
93b72dc
 
 
 
 
 
 
d1dcd56
93b72dc
 
 
d1dcd56
 
 
93b72dc
 
 
 
 
 
d1dcd56
93b72dc
 
 
d1dcd56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93b72dc
d1dcd56
93b72dc
 
 
 
 
 
 
d1dcd56
93b72dc
 
 
 
 
e95a92e
d1dcd56
93b72dc
 
 
d1dcd56
93b72dc
 
d1dcd56
93b72dc
 
 
 
d1dcd56
93b72dc
d1dcd56
93b72dc
 
 
d1dcd56
93b72dc
 
d1dcd56
 
93b72dc
d1dcd56
93b72dc
d1dcd56
93b72dc
 
d1dcd56
93b72dc
 
d1dcd56
93b72dc
d1dcd56
 
 
 
93b72dc
d1dcd56
93b72dc
d1dcd56
 
 
93b72dc
 
d1dcd56
93b72dc
d1dcd56
 
93b72dc
 
d1dcd56
 
93b72dc
d1dcd56
 
93b72dc
 
d1dcd56
 
93b72dc
d1dcd56
 
93b72dc
 
 
d1dcd56
 
 
93b72dc
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
import os
import gradio as gr
import requests
import pandas as pd
import tempfile
import json
import logging
from typing import Optional
from dotenv import load_dotenv

load_dotenv()

from agent_enhanced import GAIAAgent, is_ollama_available, is_production

DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def fetch_questions(api_url: str = DEFAULT_API_URL) -> list:
    """Retrieve the full question list from the GAIA scoring API.

    Retries up to three times on any failure; returns an empty list when
    every attempt fails. Also echoes each question (task id, truncated
    text, attached file if any) to stdout for quick inspection.
    """
    for attempt in range(3):
        try:
            resp = requests.get(f"{api_url}/questions", timeout=30)
            resp.raise_for_status()
            questions = resp.json()

            # Debug dump of the whole question set.
            separator = "=" * 80
            print("\n" + separator)
            print("ALL QUESTIONS WITH TASK IDs:")
            print(separator)
            for idx, item in enumerate(questions, 1):
                tid = item.get("task_id", "N/A")
                text = item.get("question", "N/A")
                attachment = item.get("file_name", "")
                print(f"\n[{idx}] Task ID: {tid}")
                suffix = "..." if len(text) > 200 else ""
                print(f"    Question: {text[:200]}{suffix}")
                if attachment:
                    print(f"    File: {attachment}")
            print("\n" + separator)
            print(f"Total questions: {len(questions)}")
            print(separator + "\n")

            return questions
        except Exception as e:
            logger.warning(f"Attempt {attempt + 1} failed: {e}")
    return []


def fetch_random_question(api_url: str = DEFAULT_API_URL) -> dict:
    """Fetch one random question; returns {} after three failed attempts."""
    attempt = 0
    while attempt < 3:
        try:
            resp = requests.get(f"{api_url}/random-question", timeout=30)
            resp.raise_for_status()
            return resp.json()
        except Exception as e:
            logger.warning(f"Attempt {attempt + 1} failed: {e}")
        attempt += 1
    return {}


def fetch_file(task_id: str, api_url: str = DEFAULT_API_URL) -> Optional[str]:
    """Download the attachment for *task_id* into a fresh temp directory.

    Returns:
        The local file path on success; None when the task has no file
        (HTTP 404) or the download fails for any reason.
    """
    try:
        response = requests.get(f"{api_url}/files/{task_id}", timeout=30)
        if response.status_code == 200:
            content_disposition = response.headers.get('content-disposition', '')
            filename = f"task_{task_id}_file"
            if 'filename=' in content_disposition:
                # Keep only the filename token: drop trailing header
                # parameters ("; charset=...") and surrounding quotes.
                raw = content_disposition.split('filename=')[1].split(';')[0].strip().strip('"')
                # Sanitize: the header is server-supplied, so strip any
                # directory components to prevent path traversal out of
                # the temp dir.
                raw = os.path.basename(raw.replace('\\', '/'))
                if raw:
                    filename = raw

            temp_dir = tempfile.mkdtemp()
            file_path = os.path.join(temp_dir, filename)

            with open(file_path, 'wb') as f:
                f.write(response.content)

            logger.info(f"Downloaded: {file_path}")
            return file_path
        elif response.status_code == 404:
            # 404 means the task simply has no attachment.
            return None
    except Exception as e:
        logger.error(f"File fetch failed: {e}")
    return None


def submit_answers(username: str, agent_code: str, answers: list, api_url: str = DEFAULT_API_URL) -> dict:
    """POST the answer set to the scoring endpoint and return its JSON reply.

    Raises requests.HTTPError on a non-2xx response.
    """
    body = {
        "username": username,
        "agent_code": agent_code,
        "answers": answers,
    }
    resp = requests.post(f"{api_url}/submit", json=body, timeout=60)
    resp.raise_for_status()
    return resp.json()


def get_env_status() -> str:
    """Describe which LLM backend the agent will use, as a Markdown string."""
    if is_production():
        return "☁️ **Production Mode** (HuggingFace Spaces) - Using OpenAI GPT-4o"
    if is_ollama_available():
        return "🏠 **Local Mode** - Using Ollama"
    if os.environ.get("OPENAI_API_KEY"):
        return "☁️ **Local + OpenAI** - Using OpenAI GPT-4o"
    return "⚠️ **No Backend** - Set OPENAI_API_KEY or start Ollama"


def run_agent_on_questions(progress=gr.Progress()):
    """Run the agent over every benchmark question.

    Returns:
        (DataFrame, list): a per-question results table and the answers
        payload shaped for the /submit endpoint; or (error string, None)
        when initialization or the question fetch fails.
    """
    try:
        progress(0, desc="Initializing agent...")

        agent = GAIAAgent()

        progress(0.05, desc="Fetching questions...")
        questions = fetch_questions()

        if not questions:
            return "Error: Failed to fetch questions.", None

        total = len(questions)
        results = []
        answers_for_submission = []

        for i, q in enumerate(questions):
            progress((i + 1) / total, desc=f"Question {i+1}/{total}...")

            task_id = q.get("task_id", "")
            question_text = q.get("question", "")

            # Download the attachment only when the question declares one.
            file_path = None
            if q.get("file_name"):
                file_path = fetch_file(task_id)

            try:
                answer = agent.run(question_text, task_id, file_path)
            except Exception as e:
                logger.error(f"Error on question {i+1}: {e}")
                answer = f"Error: {str(e)}"

            results.append({
                "Task ID": task_id,
                "Question": question_text,
                "Answer": answer,
                "Status": "βœ“" if answer and not answer.startswith("Error:") and answer != "Unable to determine answer" else "βœ—"
            })

            answers_for_submission.append({
                "task_id": task_id,
                "submitted_answer": answer
            })

            # Best-effort cleanup of the downloaded file and its temp dir;
            # a failed removal must not abort the benchmark run, but only
            # filesystem errors are tolerated (no bare except).
            if file_path and os.path.exists(file_path):
                try:
                    os.remove(file_path)
                    os.rmdir(os.path.dirname(file_path))
                except OSError:
                    pass

        df = pd.DataFrame(results)
        progress(1.0, desc="Complete!")
        return df, answers_for_submission

    except Exception as e:
        logger.error(f"Error: {e}")
        return f"Error: {str(e)}", None


def test_single_question():
    """Fetch one random question, run the agent on it, and report the result.

    Returns:
        (question, answer, task_id, status) strings for the Gradio UI;
        on failure the first element carries the error message and the
        rest are empty.
    """
    try:
        agent = GAIAAgent()
        question_data = fetch_random_question()

        if not question_data:
            return "Error: Failed to fetch question.", "", "", ""

        task_id = question_data.get("task_id", "")
        question_text = question_data.get("question", "")

        # Download the attachment only when the question declares one.
        file_path = None
        if question_data.get("file_name"):
            file_path = fetch_file(task_id)

        answer = agent.run(question_text, task_id, file_path)

        # Best-effort cleanup; only filesystem errors are tolerated so a
        # failed delete cannot mask an unrelated crash (was a bare except).
        if file_path and os.path.exists(file_path):
            try:
                os.remove(file_path)
                os.rmdir(os.path.dirname(file_path))
            except OSError:
                pass

        status = "βœ“ Valid" if answer and not answer.startswith("Error") else "⚠️ Check answer"
        return question_text, answer, task_id, status

    except Exception as e:
        logger.error(f"Error: {e}")
        return f"Error: {str(e)}", "", "", ""


def submit_to_leaderboard(username: str, space_url: str, answers_json: str):
    """Validate inputs and submit the answer set to the course leaderboard.

    Returns a Markdown string: either a validation message or the
    formatted submission results.
    """
    if not username or not space_url or not answers_json:
        return "Please fill in all fields and run the agent first."

    try:
        # The answers may arrive as a JSON string (from the textbox) or as
        # an already-parsed list (from state).
        answers = json.loads(answers_json) if isinstance(answers_json, str) else answers_json

        if not isinstance(answers, list) or len(answers) == 0:
            return "Error: Run the benchmark first."

        # The API expects a link to the Space's code tree.
        if not space_url.endswith("/tree/main"):
            space_url = space_url.rstrip("/") + "/tree/main"

        result = submit_answers(username, space_url, answers)
        logger.info("Submission response: %s", result)  # was a stray debug print
        score = result.get("score", 0)
        correct = result.get("correct_count", 0)
        total = result.get("total_attempted", 0)

        # NOTE(review): this assumes "score" is a 0-1 fraction (hence the
        # :.1% format and the 0.3 threshold). If the API returns a 0-100
        # percentage, both the display and the certificate check are off
        # by 100x -- confirm against the scoring API's response schema.
        cert_msg = "πŸ† **Congratulations!** Score above 30% - Certificate earned!" if score > 0.3 else "❌ Need >30% for certificate."

        return f"""
## Submission Results

**Score:** {score:.1%}
**Correct:** {correct}/{total}

{cert_msg}

[View Leaderboard](https://huggingface.co/spaces/agents-course/Students_leaderboard)
"""
    except Exception as e:
        logger.error(f"Submission error: {e}")
        return f"Error: {str(e)}"


# ============ GRADIO APP ============
# Three-tab UI: single-question smoke test, full benchmark run, and
# leaderboard submission. Answers flow from the benchmark tab into the
# submit tab via answers_state -> answers_in.
with gr.Blocks(title="GAIA Agent", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# πŸ€– GAIA Benchmark Agent

**Tools:** πŸ” Web Search | πŸ“š Wikipedia | 🐍 Python | πŸ“„ Files | πŸ”’ Calculator | 🌐 Webpages | πŸ‘οΈ Vision (OpenAI)
""")
    
    # Banner showing which LLM backend is active (evaluated once at startup).
    env_status = gr.Markdown(get_env_status())
    
    with gr.Tabs():
        with gr.TabItem("πŸ§ͺ Test Single"):
            test_btn = gr.Button("Fetch & Solve Random Question", variant="primary")
            test_q = gr.Textbox(label="Question", lines=4, interactive=False)
            test_a = gr.Textbox(label="Answer", lines=2, interactive=False)
            test_id = gr.Textbox(label="Task ID", interactive=False)
            test_status = gr.Textbox(label="Status", interactive=False)
            
            test_btn.click(test_single_question, outputs=[test_q, test_a, test_id, test_status])
        
        with gr.TabItem("πŸš€ Full Benchmark"):
            run_btn = gr.Button("Run on All Questions", variant="primary")
            results_df = gr.Dataframe(label="Results")
            # Holds the raw answers list between the run and the submit tab.
            answers_state = gr.State()
            
            run_btn.click(run_agent_on_questions, outputs=[results_df, answers_state])
        
        with gr.TabItem("πŸ“€ Submit"):
            gr.Markdown("### Submit to Leaderboard")
            
            with gr.Row():
                username_in = gr.Textbox(label="HF Username", placeholder="your-username")
                space_url_in = gr.Textbox(label="Space URL", placeholder="https://huggingface.co/spaces/you/space")
            
            answers_in = gr.Textbox(label="Answers JSON (auto-filled)", lines=8)
            submit_btn = gr.Button("Submit", variant="primary")
            submit_result = gr.Markdown()
            
            def format_answers(a):
                """Render the stored answers list as pretty JSON for the textbox."""
                return json.dumps(a, indent=2) if a else ""
            
            # Mirror the benchmark's answers into the editable JSON textbox.
            answers_state.change(format_answers, inputs=[answers_state], outputs=[answers_in])
            submit_btn.click(submit_to_leaderboard, inputs=[username_in, space_url_in, answers_in], outputs=[submit_result])
    
    gr.Markdown("""
---
**Setup:**
- Local: `ollama serve` + `ollama pull qwen2.5:32b`  
- Production: Set `OPENAI_API_KEY` in `.env` or HF Secrets
""")

# Bind to all interfaces so the app is reachable inside HF Spaces / Docker.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)