Spaces:
Runtime error
Runtime error
| import os | |
| import requests | |
| from dotenv import load_dotenv | |
| from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel | |
# Load environment variables (e.g. HF_USERNAME) from a local .env file, if present.
load_dotenv()

# Base URL of the Hugging Face Agents-course Unit 4 scoring service
# (serves /questions and accepts /submit).
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

# System prompt handed to the agent. GAIA scoring uses exact string matching,
# so the formatting rules below are part of the scoring contract — do not
# reword this string casually.
INSTRUCTIONS = """You are a general AI assistant. I will ask you a question. Report your thoughts, and then provide your final answer.
CRITICAL FORMATTING RULES:
- Your final answer should be a number OR as few words as possible OR a comma separated list of numbers and/or strings
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise
- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise
- If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string
- Be extremely precise with spelling and formatting - the evaluation uses exact matching
- For strings: no extra spaces, no punctuation unless part of the answer, lowercase
- For numbers: just the number, no units, no commas, no currency symbols
- Provide ONLY the answer as your final response, nothing else
- Expand abbreviations like 'St.' to 'Saint' in city names
You have access to a web search tool to help you find accurate information. Use it when you need to look up facts."""
| def run_gaia_evaluation(): | |
| print("π GAIA Benchmark Evaluation with Ollama") | |
| print("=" * 60) | |
| username = os.getenv("HF_USERNAME") | |
| if not username: | |
| print("β Please set HF_USERNAME environment variable") | |
| return | |
| print(f"π€ User: {username}") | |
| model = LiteLLMModel( | |
| model_id="ollama_chat/gemma3", | |
| api_base="http://localhost:11434", | |
| num_ctx=8192, | |
| temperature=0.1, # Low temperature for more deterministic answers | |
| ) | |
| agent = CodeAgent( | |
| tools=[DuckDuckGoSearchTool()], | |
| model=model, | |
| instructions=INSTRUCTIONS, | |
| max_steps=10, | |
| ) | |
| try: | |
| resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=30) | |
| resp.raise_for_status() | |
| data = resp.json() | |
| questions = data if isinstance(data, list) else data.get("questions", []) | |
| print(f"π Loaded {len(questions)} questions") | |
| except requests.RequestException as e: | |
| print(f"β Error fetching questions: {e}") | |
| return | |
| results = [] | |
| for i, q in enumerate(questions): | |
| task_id = q["task_id"] | |
| text = q["question"] | |
| print(f"\nβ Question {i+1}: {text}") | |
| result = agent.run(text, reset=True) | |
| result_str = str(result).strip() | |
| # Take the last line as the answer (since agent should provide only the answer) | |
| out = result_str.splitlines()[-1] if result_str else "AGENT ERROR: No response." | |
| if out.startswith("{"): | |
| out = "AGENT ERROR: No final answer." | |
| out = out.strip().rstrip(".") | |
| results.append({"task_id": task_id, "submitted_answer": out}) | |
| print(f"β Answer: '{out}'") | |
| print(f"π Preview: {result_str[:200]}...") | |
| # Submit answers automatically | |
| payload = { | |
| "username": username, | |
| "agent_code": "ollama-gemma3-with-tools", | |
| "answers": results, | |
| } | |
| try: | |
| post = requests.post(f"{DEFAULT_API_URL}/submit", json=payload, timeout=60) | |
| post.raise_for_status() | |
| res = post.json() | |
| print("\n" + "=" * 60) | |
| print("π GAIA BENCHMARK RESULTS") | |
| print("=" * 60) | |
| print(f"π€ User: {res.get('username', username)}") | |
| print(f"π Overall Score: {res.get('score', res.get('overall_score', 'N/A'))}%") | |
| print(f"β Correct: {res.get('correct_count', res.get('num_correct', 'N/A'))}/{len(results)}") | |
| print(f"π¬ Message: {res.get('message', 'N/A')}") | |
| print("=" * 60) | |
| except requests.RequestException as e: | |
| print(f"β Error submitting: {e}") | |
| done = sum(1 for r in results if not r["submitted_answer"].startswith("AGENT ERROR")) | |
| print(f"Completed locally: {done}/{len(results)}") | |
if __name__ == "__main__":
    # Script entry point: run the full GAIA evaluation when executed directly.
    run_gaia_evaluation()