"""Run the GAIA benchmark questions through a local Ollama-backed agent.

The script fetches the question set from the scoring service, answers each
question with a ``smolagents`` ``CodeAgent`` that has a DuckDuckGo search
tool, and submits the collected answers under the ``HF_USERNAME`` account.
"""

import os

import requests
from dotenv import load_dotenv
from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel

load_dotenv()

DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

# Prefix shared by every placeholder answer recorded when the agent fails;
# the local summary at the end of the run counts answers without it.
_AGENT_ERROR_PREFIX = "AGENT ERROR"

# NOTE(review): the bullet list below is a single run of text separated by
# " - "; it looks like the original line breaks were lost. The value is kept
# exactly as found -- confirm against the intended prompt before reflowing.
INSTRUCTIONS = (
    "You are a general AI assistant. I will ask you a question. "
    "Report your thoughts, and then provide your final answer. "
    "CRITICAL FORMATTING RULES: "
    "- Your final answer should be a number OR as few words as possible "
    "OR a comma separated list of numbers and/or strings "
    "- If you are asked for a number, don't use comma to write your number "
    "neither use units such as $ or percent sign unless specified otherwise "
    "- If you are asked for a string, don't use articles, neither "
    "abbreviations (e.g. for cities), and write the digits in plain text "
    "unless specified otherwise "
    "- If you are asked for a comma separated list, apply the above rules "
    "depending of whether the element to be put in the list is a number "
    "or a string "
    "- Be extremely precise with spelling and formatting - the evaluation "
    "uses exact matching "
    "- For strings: no extra spaces, no punctuation unless part of the "
    "answer, lowercase "
    "- For numbers: just the number, no units, no commas, no currency symbols "
    "- Provide ONLY the answer as your final response, nothing else "
    "- Expand abbreviations like 'St.' to 'Saint' in city names "
    "You have access to a web search tool to help you find accurate "
    "information. \n"
    "Use it when you need to look up facts."
)


def _build_agent():
    """Create the search-enabled CodeAgent backed by the local Ollama model."""
    model = LiteLLMModel(
        model_id="ollama_chat/gemma3",
        api_base="http://localhost:11434",
        num_ctx=8192,
        temperature=0.1,  # Low temperature for more deterministic answers
    )
    return CodeAgent(
        tools=[DuckDuckGoSearchTool()],
        model=model,
        instructions=INSTRUCTIONS,
        max_steps=10,
    )


def _fetch_questions():
    """Download the question list from the scoring service.

    Returns:
        The list of question entries, or ``None`` when the request fails or
        the response body is not valid JSON. The service may answer with a
        bare list or with a dict holding the list under ``"questions"``;
        any other payload shape yields an empty list.
    """
    try:
        resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=30)
        resp.raise_for_status()
        data = resp.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers requests versions whose .json() raises the plain
        # json decoding error rather than a RequestException subclass.
        print(f"❌ Error fetching questions: {e}")
        return None

    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        return data.get("questions", [])
    return []


def _extract_answer(result_str):
    """Reduce the agent's stripped output text to the answer to submit.

    The last line is taken as the answer because the agent is instructed to
    finish with the answer alone. Empty output, or a last line that starts
    with ``{``, is replaced by an ``AGENT ERROR`` placeholder. Surrounding
    whitespace and trailing periods are removed from whatever is returned.
    """
    if result_str:
        out = result_str.splitlines()[-1]
    else:
        out = f"{_AGENT_ERROR_PREFIX}: No response."
    if out.startswith("{"):
        out = f"{_AGENT_ERROR_PREFIX}: No final answer."
    return out.strip().rstrip(".")


def _answer_question(agent, text):
    """Run the agent on one question.

    Returns:
        A ``(answer, raw_text)`` tuple. When the agent raises, ``answer`` is
        an ``AGENT ERROR`` placeholder naming the exception type and
        ``raw_text`` is empty.
    """
    try:
        result = agent.run(text, reset=True)
    except Exception as e:
        # Per-question boundary: a single failure must not abort the run and
        # throw away the answers already collected.
        print(f"❌ Agent failed: {e}")
        return f"{_AGENT_ERROR_PREFIX}: {type(e).__name__}", ""

    result_str = str(result).strip()
    return _extract_answer(result_str), result_str


def _submit_answers(username, results):
    """POST the collected answers to the scoring service and print the score."""
    payload = {
        "username": username,
        "agent_code": "ollama-gemma3-with-tools",
        "answers": results,
    }
    try:
        post = requests.post(f"{DEFAULT_API_URL}/submit", json=payload, timeout=60)
        post.raise_for_status()
        res = post.json()
    except (requests.RequestException, ValueError) as e:
        print(f"❌ Error submitting: {e}")
        return

    print("\n" + "=" * 60)
    print("🏆 GAIA BENCHMARK RESULTS")
    print("=" * 60)
    print(f"👤 User: {res.get('username', username)}")
    print(f"📊 Overall Score: {res.get('score', res.get('overall_score', 'N/A'))}%")
    print(f"✅ Correct: {res.get('correct_count', res.get('num_correct', 'N/A'))}/{len(results)}")
    print(f"💬 Message: {res.get('message', 'N/A')}")
    print("=" * 60)


def run_gaia_evaluation():
    """Answer every benchmark question and submit the results.

    Reads ``HF_USERNAME`` from the environment and returns early, after
    printing a message, when it is unset, when the questions cannot be
    fetched, or when there is nothing to answer. A failure on an individual
    question is recorded as an ``AGENT ERROR`` answer and the run continues.
    """
    print("🚀 GAIA Benchmark Evaluation with Ollama")
    print("=" * 60)

    username = os.getenv("HF_USERNAME")
    if not username:
        print("❌ Please set HF_USERNAME environment variable")
        return
    print(f"👤 User: {username}")

    agent = _build_agent()

    questions = _fetch_questions()
    if questions is None:
        return
    print(f"📋 Loaded {len(questions)} questions")
    if not questions:
        # Nothing to answer; do not post an empty submission.
        return

    results = []
    for number, q in enumerate(questions, start=1):
        task_id = q.get("task_id") if isinstance(q, dict) else None
        text = q.get("question") if isinstance(q, dict) else None
        if task_id is None or text is None:
            print(f"\n❌ Question {number}: malformed entry, skipped")
            continue

        print(f"\n❓ Question {number}: {text}")
        out, result_str = _answer_question(agent, text)
        results.append({"task_id": task_id, "submitted_answer": out})
        print(f"✅ Answer: '{out}'")
        print(f"📝 Preview: {result_str[:200]}...")

    if not results:
        print("❌ No answers to submit")
        return

    # Submit answers automatically
    _submit_answers(username, results)

    done = sum(
        1
        for r in results
        if not r["submitted_answer"].startswith(_AGENT_ERROR_PREFIX)
    )
    print(f"Completed locally: {done}/{len(results)}")


if __name__ == "__main__":
    run_gaia_evaluation()