| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "id": "4594782d", | |
| "metadata": {}, | |
| "source": [ | |
| "# AIMO 3 submission (local runtime)\n", | |
| "\n", | |
| "Run this notebook with the working directory set to this folder. Expected layout:\n", | |
| "\n", | |
| "| Path | Description |\n", | |
| "|------|-------------|\n", | |
| "| `wheels.tar.gz` | Offline pip wheels archive (from Kaggle `aimo-3-utils`) |\n", | |
| "| `model/` | Fine-tuned model weights (same files as `gpt-oss-120b` on Kaggle) |\n", | |
| "| `test.csv` | Optional; used by `run_local_gateway` when not in competition rerun mode |\n", | |
| "| `setup/` | Created on first run when `wheels.tar.gz` is extracted |\n", | |
| "\n", | |
| "Paths are resolved via `BASE_DIR` in the paths cell (defaults to the current working directory)." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "930bac8a", | |
| "metadata": { | |
| "_kg_hide-output": true, | |
| "execution": { | |
| "iopub.execute_input": "2026-03-02T23:31:07.265460Z", | |
| "iopub.status.busy": "2026-03-02T23:31:07.264869Z", | |
| "iopub.status.idle": "2026-03-02T23:32:11.935746Z", | |
| "shell.execute_reply": "2026-03-02T23:32:11.935164Z" | |
| }, | |
| "papermill": { | |
| "duration": 64.675441, | |
| "end_time": "2026-03-02T23:32:11.936641", | |
| "exception": false, | |
| "start_time": "2026-03-02T23:31:07.261200", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Uninstall these four packages from the kernel's environment; --yes skips pip's confirmation prompt.\n", | |
| "# NOTE(review): presumably they clash with the offline wheels installed later - confirm before running on a shared local environment.\n", | |
| "%pip uninstall --yes 'keras' 'matplotlib' 'scikit-learn' 'tensorflow'\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "1142e246", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2026-03-02T23:32:11.942163Z", | |
| "iopub.status.busy": "2026-03-02T23:32:11.941759Z", | |
| "iopub.status.idle": "2026-03-02T23:32:11.944262Z", | |
| "shell.execute_reply": "2026-03-02T23:32:11.943943Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.006029, | |
| "end_time": "2026-03-02T23:32:11.945005", | |
| "exception": false, | |
| "start_time": "2026-03-02T23:32:11.938976", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import warnings\n", | |
| "# Ignore every warning raised in this kernel from this point on.\n", | |
| "warnings.simplefilter('ignore')\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "b8f32bba", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2026-03-02T23:32:11.949690Z", | |
| "iopub.status.busy": "2026-03-02T23:32:11.949507Z", | |
| "iopub.status.idle": "2026-03-02T23:32:11.951646Z", | |
| "shell.execute_reply": "2026-03-02T23:32:11.951310Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.005356, | |
| "end_time": "2026-03-02T23:32:11.952363", | |
| "exception": false, | |
| "start_time": "2026-03-02T23:32:11.947007", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import os\n", | |
| "import sys\n", | |
| "import subprocess\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "local-paths-config", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from pathlib import Path\n", | |
| "\n", | |
| "# Local runtime: place wheels.tar.gz, model/, and test.csv next to this notebook.\n", | |
| "BASE_DIR = Path.cwd().resolve()\n", | |
| "if not (BASE_DIR / \"submission.ipynb\").exists():\n", | |
| " _here = Path(__file__).resolve().parent if \"__file__\" in dir() else None\n", | |
| " if _here and (_here / \"submission.ipynb\").exists():\n", | |
| " BASE_DIR = _here\n", | |
| "\n", | |
| "WHEELS_ARCHIVE = BASE_DIR / \"wheels.tar.gz\"\n", | |
| "SETUP_DIR = BASE_DIR / \"setup\"\n", | |
| "TIKTOKEN_DIR = SETUP_DIR / \"tiktoken_encodings\"\n", | |
| "MODEL_PATH = BASE_DIR / \"model\"\n", | |
| "TEST_CSV = BASE_DIR / \"test.csv\"\n", | |
| "\n", | |
| "print(f\"BASE_DIR={BASE_DIR}\")\n", | |
| "print(f\"WHEELS_ARCHIVE={WHEELS_ARCHIVE} (exists={WHEELS_ARCHIVE.exists()})\")\n", | |
| "print(f\"MODEL_PATH={MODEL_PATH} (exists={MODEL_PATH.exists()})\")\n", | |
| "print(f\"TEST_CSV={TEST_CSV} (exists={TEST_CSV.exists()})\")\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "a99307a7", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2026-03-02T23:32:11.957453Z", | |
| "iopub.status.busy": "2026-03-02T23:32:11.957284Z", | |
| "iopub.status.idle": "2026-03-02T23:32:11.960258Z", | |
| "shell.execute_reply": "2026-03-02T23:32:11.959901Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.006493, | |
| "end_time": "2026-03-02T23:32:11.961037", | |
| "exception": false, | |
| "start_time": "2026-03-02T23:32:11.954544", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def set_env(input_archive, temp_dir):\n", | |
| "\n", | |
| " if not os.path.exists(temp_dir):\n", | |
| " os.makedirs(temp_dir, exist_ok=True)\n", | |
| " \n", | |
| " subprocess.run(['tar', '-xzf', input_archive, '-C', temp_dir], check=True)\n", | |
| " \n", | |
| " subprocess.run([\n", | |
| " sys.executable, \n", | |
| " '-m', \n", | |
| " 'pip', \n", | |
| " 'install', \n", | |
| " '--no-index', \n", | |
| " '--find-links', \n", | |
| " f'{temp_dir}/wheels', \n", | |
| " 'unsloth', \n", | |
| " 'trl', \n", | |
| " 'vllm', \n", | |
| " 'openai_harmony'\n", | |
| " ], check=True)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "465a0d30", | |
| "metadata": { | |
| "_kg_hide-output": true, | |
| "execution": { | |
| "iopub.execute_input": "2026-03-02T23:32:11.966030Z", | |
| "iopub.status.busy": "2026-03-02T23:32:11.965890Z", | |
| "iopub.status.idle": "2026-03-02T23:35:40.222509Z", | |
| "shell.execute_reply": "2026-03-02T23:35:40.222085Z" | |
| }, | |
| "papermill": { | |
| "duration": 208.260154, | |
| "end_time": "2026-03-02T23:35:40.223481", | |
| "exception": false, | |
| "start_time": "2026-03-02T23:32:11.963327", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "set_env(\n", | |
| " input_archive=str(WHEELS_ARCHIVE),\n", | |
| " temp_dir=str(SETUP_DIR)\n", | |
| ")\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "3d150baf", | |
| "metadata": { | |
| "_kg_hide-output": true, | |
| "execution": { | |
| "iopub.execute_input": "2026-03-02T23:35:40.233022Z", | |
| "iopub.status.busy": "2026-03-02T23:35:40.232854Z", | |
| "iopub.status.idle": "2026-03-02T23:35:40.255130Z", | |
| "shell.execute_reply": "2026-03-02T23:35:40.254775Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.028035, | |
| "end_time": "2026-03-02T23:35:40.255949", | |
| "exception": false, | |
| "start_time": "2026-03-02T23:35:40.227914", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "subprocess.run(['ls', str(TIKTOKEN_DIR)], shell=(os.name == 'nt'))\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "3bebe614", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2026-03-02T23:35:40.264925Z", | |
| "iopub.status.busy": "2026-03-02T23:35:40.264781Z", | |
| "iopub.status.idle": "2026-03-02T23:35:40.267403Z", | |
| "shell.execute_reply": "2026-03-02T23:35:40.267030Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.008155, | |
| "end_time": "2026-03-02T23:35:40.268225", | |
| "exception": false, | |
| "start_time": "2026-03-02T23:35:40.260070", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "os.environ['TRANSFORMERS_NO_TF'] = '1'\n", | |
| "os.environ['TRANSFORMERS_NO_FLAX'] = '1'\n", | |
| "os.environ['CUDA_VISIBLE_DEVICES'] = '0'\n", | |
| "os.environ['TOKENIZERS_PARALLELISM'] = 'false'\n", | |
| "os.environ['TRITON_PTXAS_PATH'] = '/usr/local/cuda/bin/ptxas'\n", | |
| "os.environ['TIKTOKEN_ENCODINGS_BASE'] = str(TIKTOKEN_DIR)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "f2172fa9", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2026-03-02T23:35:40.277500Z", | |
| "iopub.status.busy": "2026-03-02T23:35:40.277357Z", | |
| "iopub.status.idle": "2026-03-02T23:35:47.734677Z", | |
| "shell.execute_reply": "2026-03-02T23:35:47.734143Z" | |
| }, | |
| "papermill": { | |
| "duration": 7.463299, | |
| "end_time": "2026-03-02T23:35:47.736102", | |
| "exception": false, | |
| "start_time": "2026-03-02T23:35:40.272803", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import gc\n", | |
| "import re\n", | |
| "import math\n", | |
| "import time\n", | |
| "import queue\n", | |
| "import threading\n", | |
| "import contextlib\n", | |
| "from typing import Optional\n", | |
| "from jupyter_client import KernelManager\n", | |
| "from collections import Counter, defaultdict\n", | |
| "from concurrent.futures import as_completed, ThreadPoolExecutor\n", | |
| "\n", | |
| "import pandas as pd\n", | |
| "import polars as pl\n", | |
| "\n", | |
| "from openai import OpenAI\n", | |
| "\n", | |
| "from openai_harmony import (\n", | |
| " HarmonyEncodingName, \n", | |
| " load_harmony_encoding, \n", | |
| " SystemContent, \n", | |
| " ReasoningEffort, \n", | |
| " ToolNamespaceConfig, \n", | |
| " Author, \n", | |
| " Message, \n", | |
| " Role, \n", | |
| " TextContent, \n", | |
| " Conversation\n", | |
| ")\n", | |
| "\n", | |
| "from transformers import set_seed\n", | |
| "import kaggle_evaluation.aimo_3_inference_server\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "731c6203", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2026-03-02T23:35:47.746428Z", | |
| "iopub.status.busy": "2026-03-02T23:35:47.745982Z", | |
| "iopub.status.idle": "2026-03-02T23:35:47.750715Z", | |
| "shell.execute_reply": "2026-03-02T23:35:47.750346Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.010861, | |
| "end_time": "2026-03-02T23:35:47.751434", | |
| "exception": false, | |
| "start_time": "2026-03-02T23:35:47.740573", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "class CFG:\n", | |
| " \n", | |
| " system_prompt = (\n", | |
| " 'You are an elite mathematical problem solver with expertise at the International '\n", | |
| " 'Mathematical Olympiad (IMO) level. Your goal is to find the correct answer through '\n", | |
| " 'rigorous mathematical reasoning.\\n\\n'\n", | |
| " \n", | |
| " '# Problem-Solving Approach:\\n'\n", | |
| " '1. UNDERSTAND: Carefully read and rephrase the problem in your own words. '\n", | |
| " 'Identify what is given, what needs to be found, and any constraints.\\n'\n", | |
| " '2. EXPLORE: Consider multiple solution strategies. Think about relevant theorems, '\n", | |
| " 'techniques, patterns, or analogous problems. Don\\'t commit to one approach immediately.\\n'\n", | |
| " '3. PLAN: Select the most promising approach and outline key steps before executing.\\n'\n", | |
| " '4. EXECUTE: Work through your solution methodically. Show all reasoning steps clearly.\\n'\n", | |
| " '5. VERIFY: Check your answer by substituting back, testing edge cases, or using '\n", | |
| " 'alternative methods. Ensure logical consistency throughout.\\n\\n'\n", | |
| " \n", | |
| " '# Mathematical Reasoning Principles:\\n'\n", | |
| " '- Break complex problems into smaller, manageable sub-problems\\n'\n", | |
| " '- Look for patterns, symmetries, and special cases that provide insight\\n'\n", | |
| " '- Use concrete examples to build intuition before generalizing\\n'\n", | |
| " '- Consider extreme cases and boundary conditions\\n'\n", | |
| " '- If stuck, try working backwards from the desired result\\n'\n", | |
| " '- Be willing to restart with a different approach if needed\\n\\n'\n", | |
| " \n", | |
| " '# Verification Requirements:\\n'\n", | |
| " '- Cross-check arithmetic and algebraic manipulations\\n'\n", | |
| " '- Verify that your solution satisfies all problem constraints\\n'\n", | |
| " '- Test your answer with simple cases or special values when possible\\n'\n", | |
| " '- Ensure dimensional consistency and reasonableness of the result\\n\\n'\n", | |
| " \n", | |
| " '# Output Format:\\n'\n", | |
| " 'The final answer must be a non-negative integer between 0 and 99999.\\n'\n", | |
| " 'Place your final numerical answer inside \\\\boxed{}, e.g., \\\\boxed{42}\\n\\n'\n", | |
| " \n", | |
| " 'Think step-by-step and show your complete reasoning process. Quality of reasoning '\n", | |
| " 'is as important as the final answer.'\n", | |
| " )\n", | |
| " \n", | |
| " tool_prompt = (\n", | |
| " 'Use this tool to execute Python code for:\\n'\n", | |
| " '- Complex calculations that would be error-prone by hand\\n'\n", | |
| " '- Numerical verification of analytical results\\n'\n", | |
| " '- Generating examples or testing conjectures\\n'\n", | |
| " '- Visualizing problem structure when helpful\\n'\n", | |
| " '- Brute-force verification for small cases\\n\\n'\n", | |
| " \n", | |
| " 'The environment is a stateful Jupyter notebook. Code persists between executions.\\n'\n", | |
| " 'Always use print() to display results. Write clear, well-commented code.\\n\\n'\n", | |
| " \n", | |
| " 'Remember: Code should support your mathematical reasoning, not replace it. '\n", | |
| " 'Explain what you\\'re computing and why before running code.'\n", | |
| " )\n", | |
| " \n", | |
| " preference_prompt = (\n", | |
| " 'You have access to `math`, `numpy`, and `sympy` for:\\n\\n'\n", | |
| " \n", | |
| " '# Symbolic Computation (sympy):\\n'\n", | |
| " '- Algebraic manipulation and simplification\\n'\n", | |
| " '- Solving equations and systems of equations\\n'\n", | |
| " '- Symbolic differentiation and integration\\n'\n", | |
| " '- Number theory functions (primes, divisors, modular arithmetic)\\n'\n", | |
| " '- Polynomial operations and factorization\\n'\n", | |
| " '- Working with mathematical expressions symbolically\\n\\n'\n", | |
| " \n", | |
| " '# Numerical Computation (numpy):\\n'\n", | |
| " '- Array operations and linear algebra\\n'\n", | |
| " '- Efficient numerical calculations for large datasets\\n'\n", | |
| " '- Matrix operations and eigenvalue problems\\n'\n", | |
| " '- Statistical computations\\n\\n'\n", | |
| " \n", | |
| " '# Mathematical Functions (math):\\n'\n", | |
| " '- Standard mathematical functions (trig, log, exp)\\n'\n", | |
| " '- Constants like pi and e\\n'\n", | |
| " '- Basic operations for single values\\n\\n'\n", | |
| " \n", | |
| " 'Best Practices:\\n'\n", | |
| " '- Use sympy for exact symbolic answers when possible\\n'\n", | |
| " '- Use numpy for numerical verification and large-scale computation\\n'\n", | |
| " '- Combine symbolic and numerical approaches: derive symbolically, verify numerically\\n'\n", | |
| " '- Document your computational strategy clearly\\n'\n", | |
| " '- Validate computational results against known cases or theoretical bounds'\n", | |
| " )\n", | |
| " \n", | |
| " served_model_name = 'gpt-oss'\n", | |
| " model_path = str(MODEL_PATH)\n", | |
| " \n", | |
| " kv_cache_dtype = 'fp8_e4m3'\n", | |
| " dtype = 'auto'\n", | |
| "\n", | |
| " high_problem_timeout = 900\n", | |
| " base_problem_timeout = 300\n", | |
| "\n", | |
| " notebook_limit = 17400\n", | |
| " server_timeout = 180\n", | |
| "\n", | |
| " session_timeout = 960\n", | |
| " jupyter_timeout = 6\n", | |
| " sandbox_timeout = 3\n", | |
| "\n", | |
| " stream_interval = 200\n", | |
| " context_tokens = 65536\n", | |
| " buffer_tokens = 512\n", | |
| " search_tokens = 32\n", | |
| " top_logprobs = 5\n", | |
| " batch_size = 256\n", | |
| " early_stop = 4\n", | |
| " attempts = 8\n", | |
| " workers = 16\n", | |
| " turns = 128\n", | |
| " seed = 42\n", | |
| "\n", | |
| " gpu_memory_utilization = 0.96\n", | |
| " temperature = 1.0\n", | |
| " min_p = 0.02\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "059bc178", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2026-03-02T23:35:47.760209Z", | |
| "iopub.status.busy": "2026-03-02T23:35:47.760067Z", | |
| "iopub.status.idle": "2026-03-02T23:35:47.763772Z", | |
| "shell.execute_reply": "2026-03-02T23:35:47.763427Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.009471, | |
| "end_time": "2026-03-02T23:35:47.764879", | |
| "exception": false, | |
| "start_time": "2026-03-02T23:35:47.755408", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Seed the random number generators through transformers.set_seed using CFG.seed.\n", | |
| "set_seed(CFG.seed)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "ea5919bf", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2026-03-02T23:35:47.774175Z", | |
| "iopub.status.busy": "2026-03-02T23:35:47.774044Z", | |
| "iopub.status.idle": "2026-03-02T23:35:47.777139Z", | |
| "shell.execute_reply": "2026-03-02T23:35:47.776813Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.008651, | |
| "end_time": "2026-03-02T23:35:47.777858", | |
| "exception": false, | |
| "start_time": "2026-03-02T23:35:47.769207", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "class AIMO3Template:\n", | |
| "\n", | |
| " def __init__(self):\n", | |
| "\n", | |
| " pass\n", | |
| "\n", | |
| " def get_system_content(self, system_prompt: str, tool_config: ToolNamespaceConfig) -> SystemContent:\n", | |
| "\n", | |
| " return (\n", | |
| " SystemContent.new()\n", | |
| " .with_model_identity(system_prompt)\n", | |
| " .with_reasoning_effort(reasoning_effort=ReasoningEffort.HIGH)\n", | |
| " .with_tools(tool_config)\n", | |
| " )\n", | |
| "\n", | |
| " def apply_chat_template(\n", | |
| " self, \n", | |
| " system_prompt: str, \n", | |
| " user_prompt: str, \n", | |
| " tool_config: ToolNamespaceConfig\n", | |
| " ) -> list[Message]:\n", | |
| "\n", | |
| " system_content = self.get_system_content(system_prompt, tool_config) \n", | |
| " system_message = Message.from_role_and_content(Role.SYSTEM, system_content)\n", | |
| "\n", | |
| " user_message = Message.from_role_and_content(Role.USER, user_prompt)\n", | |
| "\n", | |
| " return [system_message, user_message]\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "fee141b2", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2026-03-02T23:35:47.786739Z", | |
| "iopub.status.busy": "2026-03-02T23:35:47.786603Z", | |
| "iopub.status.idle": "2026-03-02T23:35:47.795724Z", | |
| "shell.execute_reply": "2026-03-02T23:35:47.795368Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.014551, | |
| "end_time": "2026-03-02T23:35:47.796430", | |
| "exception": false, | |
| "start_time": "2026-03-02T23:35:47.781879", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "class AIMO3Sandbox:\n", | |
| "\n", | |
| " _port_lock = threading.Lock()\n", | |
| " _next_port = 50000\n", | |
| "\n", | |
| " @classmethod\n", | |
| " def _get_next_ports(cls, count: int = 5) -> list[int]:\n", | |
| "\n", | |
| " with cls._port_lock:\n", | |
| " ports = list(range(cls._next_port, cls._next_port + count))\n", | |
| " cls._next_port += count\n", | |
| "\n", | |
| " return ports\n", | |
| "\n", | |
| " def __init__(self, timeout: float):\n", | |
| "\n", | |
| " self._default_timeout = timeout\n", | |
| " self._owns_kernel = False\n", | |
| " self._client = None\n", | |
| " self._km = None\n", | |
| " \n", | |
| " ports = self._get_next_ports(5)\n", | |
| "\n", | |
| " env = os.environ.copy()\n", | |
| " env['PYDEVD_DISABLE_FILE_VALIDATION'] = '1'\n", | |
| " env['PYDEVD_WARN_EVALUATION_TIMEOUT'] = '0'\n", | |
| " env['JUPYTER_PLATFORM_DIRS'] = '1'\n", | |
| " env['PYTHONWARNINGS'] = 'ignore'\n", | |
| " env['MPLBACKEND'] = 'Agg'\n", | |
| "\n", | |
| " self._km = KernelManager()\n", | |
| " self._km.shell_port = ports[0]\n", | |
| " self._km.iopub_port = ports[1]\n", | |
| " self._km.stdin_port = ports[2]\n", | |
| " self._km.hb_port = ports[3]\n", | |
| " self._km.control_port = ports[4]\n", | |
| "\n", | |
| " self._km.start_kernel(env=env, extra_arguments=['--Application.log_level=CRITICAL'])\n", | |
| "\n", | |
| " self._client = self._km.blocking_client()\n", | |
| " self._client.start_channels()\n", | |
| " self._client.wait_for_ready(timeout=self._default_timeout)\n", | |
| " self._owns_kernel = True\n", | |
| "\n", | |
| " self.execute(\n", | |
| " 'import math\\n'\n", | |
| " 'import numpy\\n'\n", | |
| " 'import sympy\\n'\n", | |
| " 'import itertools\\n'\n", | |
| " 'import collections\\n'\n", | |
| " 'import mpmath\\n'\n", | |
| " 'mpmath.mp.dps = 64\\n'\n", | |
| " )\n", | |
| "\n", | |
| " def _format_error(self, traceback: list[str]) -> str:\n", | |
| "\n", | |
| " clean_lines = []\n", | |
| "\n", | |
| " for frame in traceback:\n", | |
| " clean_frame = re.sub(r'\\x1b\\[[0-9;]*m', '', frame)\n", | |
| "\n", | |
| " if 'File \"' in clean_frame and 'ipython-input' not in clean_frame:\n", | |
| " continue\n", | |
| "\n", | |
| " clean_lines.append(clean_frame)\n", | |
| "\n", | |
| " return ''.join(clean_lines)\n", | |
| "\n", | |
| " def execute(self, code: str, timeout: float | None = None) -> str:\n", | |
| "\n", | |
| " client = self._client\n", | |
| " effective_timeout = timeout or self._default_timeout\n", | |
| " \n", | |
| " msg_id = client.execute(\n", | |
| " code, \n", | |
| " store_history=True, \n", | |
| " allow_stdin=False, \n", | |
| " stop_on_error=False\n", | |
| " )\n", | |
| "\n", | |
| " stdout_parts = []\n", | |
| " stderr_parts = []\n", | |
| " \n", | |
| " start_time = time.time()\n", | |
| "\n", | |
| " while True:\n", | |
| " elapsed = time.time() - start_time\n", | |
| "\n", | |
| " if elapsed > effective_timeout:\n", | |
| " self._km.interrupt_kernel()\n", | |
| "\n", | |
| " return f'[ERROR] Execution timed out after {effective_timeout} seconds'\n", | |
| "\n", | |
| " try:\n", | |
| " msg = client.get_iopub_msg(timeout=1.0)\n", | |
| "\n", | |
| " except queue.Empty:\n", | |
| " continue\n", | |
| "\n", | |
| " if msg.get('parent_header', {}).get('msg_id') != msg_id:\n", | |
| " continue\n", | |
| "\n", | |
| " msg_type = msg.get('msg_type')\n", | |
| " content = msg.get('content', {})\n", | |
| "\n", | |
| " if msg_type == 'stream':\n", | |
| " text = content.get('text', '')\n", | |
| "\n", | |
| " if content.get('name') == 'stdout':\n", | |
| " stdout_parts.append(text)\n", | |
| "\n", | |
| " else:\n", | |
| " stderr_parts.append(text)\n", | |
| "\n", | |
| " elif msg_type == 'error':\n", | |
| " traceback_list = content.get('traceback', [])\n", | |
| "\n", | |
| " stderr_parts.append(self._format_error(traceback_list))\n", | |
| "\n", | |
| " elif msg_type in {'execute_result', 'display_data'}:\n", | |
| " data = content.get('data', {})\n", | |
| " text = data.get('text/plain')\n", | |
| "\n", | |
| " if text:\n", | |
| " stdout_parts.append(text if text.endswith('\\n') else f'{text}\\n')\n", | |
| "\n", | |
| " elif msg_type == 'status':\n", | |
| " if content.get('execution_state') == 'idle':\n", | |
| " break\n", | |
| "\n", | |
| " stdout = ''.join(stdout_parts)\n", | |
| " stderr = ''.join(stderr_parts)\n", | |
| "\n", | |
| " if stderr:\n", | |
| " return f'{stdout.rstrip()}\\n{stderr}' if stdout else stderr\n", | |
| "\n", | |
| " return stdout if stdout.strip() else '[WARN] No output. Use print() to see results.'\n", | |
| "\n", | |
| " def close(self):\n", | |
| "\n", | |
| " with contextlib.suppress(Exception):\n", | |
| " if self._client:\n", | |
| " self._client.stop_channels()\n", | |
| "\n", | |
| " if self._owns_kernel and self._km is not None:\n", | |
| " with contextlib.suppress(Exception):\n", | |
| " self._km.shutdown_kernel(now=True)\n", | |
| "\n", | |
| " with contextlib.suppress(Exception):\n", | |
| " self._km.cleanup_resources()\n", | |
| "\n", | |
| " def reset(self):\n", | |
| " \n", | |
| " self.execute(\n", | |
| " '%reset -f\\n'\n", | |
| " 'import math\\n'\n", | |
| " 'import numpy\\n'\n", | |
| " 'import sympy\\n'\n", | |
| " 'import itertools\\n'\n", | |
| " 'import collections\\n'\n", | |
| " 'import mpmath\\n'\n", | |
| " 'mpmath.mp.dps = 64\\n'\n", | |
| " )\n", | |
| "\n", | |
| " def __del__(self):\n", | |
| "\n", | |
| " self.close()\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "1e00f034", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2026-03-02T23:35:47.805452Z", | |
| "iopub.status.busy": "2026-03-02T23:35:47.805284Z", | |
| "iopub.status.idle": "2026-03-02T23:35:47.810904Z", | |
| "shell.execute_reply": "2026-03-02T23:35:47.810552Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.011156, | |
| "end_time": "2026-03-02T23:35:47.811662", | |
| "exception": false, | |
| "start_time": "2026-03-02T23:35:47.800506", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "class AIMO3Tool:\n", | |
| "\n", | |
| " def __init__(self, local_jupyter_timeout: float, tool_prompt: str, sandbox=None):\n", | |
| "\n", | |
| " self._local_jupyter_timeout = local_jupyter_timeout\n", | |
| " self._tool_prompt = tool_prompt\n", | |
| " self._jupyter_session = sandbox\n", | |
| " \n", | |
| " self._owns_session = sandbox is None\n", | |
| " \n", | |
| " self._execution_lock = threading.Lock()\n", | |
| " self._init_lock = threading.Lock()\n", | |
| "\n", | |
| " def _ensure_session(self):\n", | |
| "\n", | |
| " if self._jupyter_session is None:\n", | |
| " with self._init_lock:\n", | |
| " if self._jupyter_session is None:\n", | |
| " self._jupyter_session = AIMO3Sandbox(timeout=self._local_jupyter_timeout)\n", | |
| "\n", | |
| " def _ensure_last_print(self, code: str) -> str:\n", | |
| "\n", | |
| " lines = code.strip().split('\\n')\n", | |
| "\n", | |
| " if not lines:\n", | |
| " return code\n", | |
| "\n", | |
| " last_line = lines[-1].strip()\n", | |
| "\n", | |
| " if 'print' in last_line or 'import' in last_line:\n", | |
| " return code\n", | |
| "\n", | |
| " if not last_line:\n", | |
| " return code\n", | |
| "\n", | |
| " if last_line.startswith('#'):\n", | |
| " return code\n", | |
| "\n", | |
| " lines[-1] = 'print(' + last_line + ')'\n", | |
| "\n", | |
| " return '\\n'.join(lines)\n", | |
| "\n", | |
| " @property\n", | |
| " def instruction(self) -> str:\n", | |
| "\n", | |
| " return self._tool_prompt\n", | |
| "\n", | |
| " @property\n", | |
| " def tool_config(self) -> ToolNamespaceConfig:\n", | |
| "\n", | |
| " return ToolNamespaceConfig(\n", | |
| " name='python', \n", | |
| " description=self.instruction, \n", | |
| " tools=[]\n", | |
| " )\n", | |
| "\n", | |
| " def _make_response(self, output: str, channel: str | None = None) -> Message:\n", | |
| "\n", | |
| " content = TextContent(text=output)\n", | |
| " author = Author(role=Role.TOOL, name='python')\n", | |
| " message = Message(author=author, content=[content]).with_recipient('assistant')\n", | |
| "\n", | |
| " if channel:\n", | |
| " message = message.with_channel(channel)\n", | |
| "\n", | |
| " return message\n", | |
| "\n", | |
| " def process_sync_plus(self, message: Message) -> list[Message]:\n", | |
| "\n", | |
| " self._ensure_session()\n", | |
| " raw_script = message.content[0].text\n", | |
| " final_script = self._ensure_last_print(raw_script)\n", | |
| "\n", | |
| " with self._execution_lock:\n", | |
| " try:\n", | |
| " output = self._jupyter_session.execute(final_script)\n", | |
| "\n", | |
| " except TimeoutError as exc:\n", | |
| " output = f'[ERROR] {exc}'\n", | |
| "\n", | |
| " return [self._make_response(output, channel=message.channel)]\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "d9440d22", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2026-03-02T23:35:47.820790Z", | |
| "iopub.status.busy": "2026-03-02T23:35:47.820648Z", | |
| "iopub.status.idle": "2026-03-02T23:35:47.844675Z", | |
| "shell.execute_reply": "2026-03-02T23:35:47.844288Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.029855, | |
| "end_time": "2026-03-02T23:35:47.845598", | |
| "exception": false, | |
| "start_time": "2026-03-02T23:35:47.815743", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "class AIMO3Solver:\n", | |
| "\n", | |
| " def __init__(self, cfg, port: int = 8000):\n", | |
| " \n", | |
| " self.cfg = cfg\n", | |
| " self.port = port\n", | |
| " self.base_url = f'http://0.0.0.0:{port}/v1'\n", | |
| " self.api_key = 'sk-local'\n", | |
| " self.template = AIMO3Template()\n", | |
| " self.encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)\n", | |
| " self.stop_token_ids = self.encoding.stop_tokens_for_assistant_actions()\n", | |
| " \n", | |
| " self._preload_model_weights()\n", | |
| " \n", | |
| " self.server_process = self._start_server()\n", | |
| " \n", | |
| " self.client = OpenAI(\n", | |
| " base_url=self.base_url, \n", | |
| " api_key=self.api_key, \n", | |
| " timeout=self.cfg.session_timeout\n", | |
| " )\n", | |
| " \n", | |
| " self._wait_for_server()\n", | |
| " self._initialize_kernels()\n", | |
| " \n", | |
| " self.notebook_start_time = time.time()\n", | |
| " self.problems_remaining = 50\n", | |
| " \n", | |
| " def _preload_model_weights(self) -> None:\n", | |
| " \n", | |
| " print(f'Loading model weights from {self.cfg.model_path} into OS Page Cache...')\n", | |
| " start_time = time.time()\n", | |
| " \n", | |
| " files_to_load = []\n", | |
| " total_size = 0\n", | |
| " \n", | |
| " for root, _, files in os.walk(self.cfg.model_path):\n", | |
| " for file_name in files:\n", | |
| " file_path = os.path.join(root, file_name)\n", | |
| " \n", | |
| " if os.path.isfile(file_path):\n", | |
| " files_to_load.append(file_path)\n", | |
| " total_size += os.path.getsize(file_path)\n", | |
| " \n", | |
| " def _read_file(path: str) -> None:\n", | |
| " \n", | |
| " with open(path, 'rb') as file_object:\n", | |
| " while file_object.read(1024 * 1024 * 1024):\n", | |
| " pass\n", | |
| " \n", | |
| " with ThreadPoolExecutor(max_workers=self.cfg.workers) as executor:\n", | |
| " list(executor.map(_read_file, files_to_load))\n", | |
| " \n", | |
| " elapsed = time.time() - start_time\n", | |
| " print(f'Processed {len(files_to_load)} files ({total_size / 1e9:.2f} GB) in {elapsed:.2f} seconds.\\n')\n", | |
| " \n", | |
| "    def _start_server(self) -> subprocess.Popen:\n", | |
| "        '''Launch ``vllm.entrypoints.openai.api_server`` as a separate process.\n", | |
| "\n", | |
| "        The server's stdout and stderr both go to ``vllm_server.log`` (opened in\n", | |
| "        write mode, so it is truncated on every launch). The open handle is kept\n", | |
| "        on ``self.log_file``.\n", | |
| "\n", | |
| "        Returns:\n", | |
| "            The ``subprocess.Popen`` handle of the spawned server.\n", | |
| "        '''\n", | |
| "    \n", | |
| "        cfg = self.cfg\n", | |
| "    \n", | |
| "        # Options that carry a value, in command-line order.\n", | |
| "        valued_options = (\n", | |
| "            ('--seed', str(cfg.seed)),\n", | |
| "            ('--model', cfg.model_path),\n", | |
| "            ('--served-model-name', cfg.served_model_name),\n", | |
| "            ('--tensor-parallel-size', '1'),\n", | |
| "            ('--max-num-seqs', str(cfg.batch_size)),\n", | |
| "            ('--gpu-memory-utilization', str(cfg.gpu_memory_utilization)),\n", | |
| "            ('--host', '0.0.0.0'),\n", | |
| "            ('--port', str(self.port)),\n", | |
| "            ('--dtype', cfg.dtype),\n", | |
| "            ('--kv-cache-dtype', cfg.kv_cache_dtype),\n", | |
| "            ('--max-model-len', str(cfg.context_tokens)),\n", | |
| "            ('--stream-interval', str(cfg.stream_interval)),\n", | |
| "        )\n", | |
| "    \n", | |
| "        # Switches that take no value.\n", | |
| "        switches = ('--async-scheduling', '--disable-log-stats', '--enable-prefix-caching')\n", | |
| "    \n", | |
| "        command = [sys.executable, '-m', 'vllm.entrypoints.openai.api_server']\n", | |
| "    \n", | |
| "        for option, value in valued_options:\n", | |
| "            command += [option, value]\n", | |
| "    \n", | |
| "        command.extend(switches)\n", | |
| "    \n", | |
| "        self.log_file = open('vllm_server.log', 'w')\n", | |
| "    \n", | |
| "        # start_new_session=True puts the server in its own session, apart from the notebook's.\n", | |
| "        server = subprocess.Popen(\n", | |
| "            command,\n", | |
| "            stdout=self.log_file,\n", | |
| "            stderr=subprocess.STDOUT,\n", | |
| "            start_new_session=True\n", | |
| "        )\n", | |
| "    \n", | |
| "        return server\n", | |
| "    \n", | |
| " \n", | |
| "    def _wait_for_server(self):\n", | |
| "        '''Block until the server answers a model-list request.\n", | |
| "\n", | |
| "        Polls at most ``cfg.server_timeout`` times, sleeping one second after each\n", | |
| "        failed request.\n", | |
| "\n", | |
| "        Raises:\n", | |
| "            RuntimeError: If the server process has exited (the message carries the\n", | |
| "                full contents of ``vllm_server.log``), or if every poll fails.\n", | |
| "        '''\n", | |
| "    \n", | |
| "        print('Waiting for vLLM server...')\n", | |
| "        began = time.time()\n", | |
| "    \n", | |
| "        for _poll in range(self.cfg.server_timeout):\n", | |
| "            exit_code = self.server_process.poll()\n", | |
| "    \n", | |
| "            if exit_code is not None:\n", | |
| "                # Push buffered server output to disk before reading the log back.\n", | |
| "                self.log_file.flush()\n", | |
| "    \n", | |
| "                with open('vllm_server.log', 'r') as handle:\n", | |
| "                    server_logs = handle.read()\n", | |
| "    \n", | |
| "                raise RuntimeError(f'Server died with code {exit_code}. Full logs:\\n{server_logs}\\n')\n", | |
| "    \n", | |
| "            try:\n", | |
| "                self.client.models.list()\n", | |
| "                print(f'Server is ready (took {time.time() - began:.2f} seconds).\\n')\n", | |
| "    \n", | |
| "            except Exception:\n", | |
| "                # Not reachable yet: pause, then poll again.\n", | |
| "                time.sleep(1)\n", | |
| "    \n", | |
| "            else:\n", | |
| "                return\n", | |
| "    \n", | |
| "        raise RuntimeError('Server failed to start (timeout).\\n')\n", | |
| "    \n", | |
| " \n", | |
| "    def _initialize_kernels(self) -> None:\n", | |
| "        '''Create ``cfg.workers`` sandboxes in parallel and pool them on ``self.sandbox_pool``.'''\n", | |
| "    \n", | |
| "        worker_count = self.cfg.workers\n", | |
| "    \n", | |
| "        print(f'Initializing {worker_count} persistent Jupyter kernels...')\n", | |
| "        began = time.time()\n", | |
| "    \n", | |
| "        self.sandbox_pool = queue.Queue()\n", | |
| "    \n", | |
| "        with ThreadPoolExecutor(max_workers=worker_count) as pool:\n", | |
| "            pending = [\n", | |
| "                pool.submit(AIMO3Sandbox, timeout=self.cfg.jupyter_timeout)\n", | |
| "                for _ in range(worker_count)\n", | |
| "            ]\n", | |
| "    \n", | |
| "            # Pool each sandbox as soon as its construction finishes.\n", | |
| "            for finished in as_completed(pending):\n", | |
| "                self.sandbox_pool.put(finished.result())\n", | |
| "    \n", | |
| "        print(f'Kernels initialized in {time.time() - began:.2f} seconds.\\n')\n", | |
| "    \n", | |
| " \n", | |
| "    def _scan_for_answer(self, text: str) -> int | None:\n", | |
| "        '''Extract an integer answer in the range 0..99999 from model output.\n", | |
| "\n", | |
| "        Two patterns are tried in order: a LaTeX boxed expression, then the phrase\n", | |
| "        'final answer is N' (case-insensitive). For each pattern only the last\n", | |
| "        occurrence is considered and commas are stripped before conversion. If that\n", | |
| "        occurrence is not a valid in-range integer, the next pattern is tried.\n", | |
| "\n", | |
| "        Returns:\n", | |
| "            The parsed answer, or ``None`` when neither pattern yields one.\n", | |
| "        '''\n", | |
| "    \n", | |
| "        answer_patterns = (\n", | |
| "            (r'\\\\boxed\\s*\\{\\s*([0-9,]+)\\s*\\}', 0),\n", | |
| "            (r'final\\s+answer\\s+is\\s*([0-9,]+)', re.IGNORECASE),\n", | |
| "        )\n", | |
| "    \n", | |
| "        for pattern, flags in answer_patterns:\n", | |
| "            found = re.findall(pattern, text, flags)\n", | |
| "    \n", | |
| "            if not found:\n", | |
| "                continue\n", | |
| "    \n", | |
| "            digits = found[-1].replace(',', '')\n", | |
| "    \n", | |
| "            try:\n", | |
| "                candidate = int(digits)\n", | |
| "    \n", | |
| "            except ValueError:\n", | |
| "                # e.g. the capture consisted of commas only\n", | |
| "                continue\n", | |
| "    \n", | |
| "            if 0 <= candidate <= 99999:\n", | |
| "                return candidate\n", | |
| "    \n", | |
| "        return None\n", | |
| "    \n", | |
| " \n", | |
| "    def _compute_mean_entropy(self, logprobs_buffer: list) -> float:\n", | |
| "        '''Average the per-entry entropy (in bits) over the buffered top-logprob dicts.\n", | |
| "\n", | |
| "        Each usable entry is a non-empty dict whose values are log-probabilities;\n", | |
| "        anything else is skipped. An entry's entropy is computed only over the\n", | |
| "        candidates present in that dict.\n", | |
| "\n", | |
| "        Returns:\n", | |
| "            The mean entropy, or ``float('inf')`` when there is no usable entry.\n", | |
| "        '''\n", | |
| "    \n", | |
| "        entropy_sum = 0.0\n", | |
| "        counted = 0\n", | |
| "    \n", | |
| "        for entry in logprobs_buffer or ():\n", | |
| "    \n", | |
| "            if not (isinstance(entry, dict) and entry):\n", | |
| "                continue\n", | |
| "    \n", | |
| "            bits = 0.0\n", | |
| "    \n", | |
| "            for log_prob in entry.values():\n", | |
| "                probability = math.exp(log_prob)\n", | |
| "    \n", | |
| "                # exp() can underflow to exactly 0.0, where log2 is undefined.\n", | |
| "                if probability > 0:\n", | |
| "                    bits -= probability * math.log2(probability)\n", | |
| "    \n", | |
| "            entropy_sum += bits\n", | |
| "            counted += 1\n", | |
| "    \n", | |
| "        return entropy_sum / counted if counted else float('inf')\n", | |
| "    \n", | |
| " \n", | |
| "    def _process_attempt(\n", | |
| "        self,\n", | |
| "        problem: str,\n", | |
| "        system_prompt: str,\n", | |
| "        attempt_index: int,\n", | |
| "        stop_event: threading.Event,\n", | |
| "        deadline: float\n", | |
| "    ) -> dict:\n", | |
| "        '''Run one sampling attempt, executing the model's Python tool calls between turns.\n", | |
| "\n", | |
| "        The attempt borrows a sandbox from ``self.sandbox_pool`` and streams completions\n", | |
| "        for up to ``cfg.turns`` turns. It ends when an answer is found, the remaining\n", | |
| "        context drops below ``cfg.buffer_tokens``, ``stop_event`` is set, or ``deadline``\n", | |
| "        passes. The sandbox is always handed back to the pool.\n", | |
| "\n", | |
| "        Args:\n", | |
| "            problem: Problem text passed to the chat template.\n", | |
| "            system_prompt: System prompt passed to the chat template.\n", | |
| "            attempt_index: Zero-based attempt number; also offsets the sampling seed.\n", | |
| "            stop_event: Set by the caller to abandon the attempt.\n", | |
| "            deadline: ``time.time()`` value after which the attempt gives up.\n", | |
| "\n", | |
| "        Returns:\n", | |
| "            A dict with keys ``Attempt`` (one-based), ``Response Length`` (tokens\n", | |
| "            received), ``Python Calls``, ``Python Errors`` (tool responses that look\n", | |
| "            like errors, plus one if the attempt itself raised), ``Entropy`` and\n", | |
| "            ``Answer`` (``None`` when no answer was extracted).\n", | |
| "        '''\n", | |
| "    \n", | |
| "        if stop_event.is_set() or time.time() > deadline:\n", | |
| "            return {\n", | |
| "                'Attempt': attempt_index + 1,\n", | |
| "                'Answer': None,\n", | |
| "                'Python Calls': 0,\n", | |
| "                'Python Errors': 0,\n", | |
| "                'Response Length': 0,\n", | |
| "                'Entropy': float('inf')\n", | |
| "            }\n", | |
| "    \n", | |
| "        local_tool = None\n", | |
| "        sandbox = None\n", | |
| "        python_calls = 0\n", | |
| "        python_errors = 0\n", | |
| "        total_tokens = 0\n", | |
| "        final_answer = None\n", | |
| "    \n", | |
| "        logprobs_buffer = []\n", | |
| "    \n", | |
| "        # Each attempt samples with its own seed derived from the base seed.\n", | |
| "        attempt_seed = int(math.pow(self.cfg.seed + attempt_index, 2))\n", | |
| "    \n", | |
| "        try:\n", | |
| "            sandbox = self.sandbox_pool.get(timeout=self.cfg.sandbox_timeout)\n", | |
| "    \n", | |
| "            local_tool = AIMO3Tool(\n", | |
| "                local_jupyter_timeout=self.cfg.jupyter_timeout,\n", | |
| "                tool_prompt=self.cfg.tool_prompt,\n", | |
| "                sandbox=sandbox\n", | |
| "            )\n", | |
| "    \n", | |
| "            encoding = self.encoding\n", | |
| "            messages = self.template.apply_chat_template(\n", | |
| "                system_prompt,\n", | |
| "                problem,\n", | |
| "                local_tool.tool_config\n", | |
| "            )\n", | |
| "    \n", | |
| "            conversation = Conversation.from_messages(messages)\n", | |
| "    \n", | |
| "            for _ in range(self.cfg.turns):\n", | |
| "                if stop_event.is_set() or time.time() > deadline:\n", | |
| "                    break\n", | |
| "    \n", | |
| "                prompt_ids = encoding.render_conversation_for_completion(conversation, Role.ASSISTANT)\n", | |
| "                max_tokens = self.cfg.context_tokens - len(prompt_ids)\n", | |
| "    \n", | |
| "                # Stop once too little context is left for a useful continuation.\n", | |
| "                if max_tokens < self.cfg.buffer_tokens:\n", | |
| "                    break\n", | |
| "    \n", | |
| "                stream = self.client.completions.create(\n", | |
| "                    model=self.cfg.served_model_name,\n", | |
| "                    temperature=self.cfg.temperature,\n", | |
| "                    logprobs=self.cfg.top_logprobs,\n", | |
| "                    max_tokens=max_tokens,\n", | |
| "                    prompt=prompt_ids,\n", | |
| "                    seed=attempt_seed,\n", | |
| "                    stream=True,\n", | |
| "                    extra_body={\n", | |
| "                        'min_p': self.cfg.min_p,\n", | |
| "                        'stop_token_ids': self.stop_token_ids,\n", | |
| "                        'return_token_ids': True\n", | |
| "                    }\n", | |
| "                )\n", | |
| "    \n", | |
| "                token_buffer = []\n", | |
| "                text_chunks = []\n", | |
| "                interrupted = False\n", | |
| "    \n", | |
| "                try:\n", | |
| "                    for chunk in stream:\n", | |
| "                        if stop_event.is_set() or time.time() > deadline:\n", | |
| "                            interrupted = True\n", | |
| "                            break\n", | |
| "    \n", | |
| "                        new_tokens = chunk.choices[0].token_ids\n", | |
| "                        new_text = chunk.choices[0].text\n", | |
| "    \n", | |
| "                        if new_tokens:\n", | |
| "                            token_buffer.extend(new_tokens)\n", | |
| "                            total_tokens += len(new_tokens)\n", | |
| "                            text_chunks.append(new_text)\n", | |
| "    \n", | |
| "                            chunk_logprobs = chunk.choices[0].logprobs\n", | |
| "    \n", | |
| "                            if chunk_logprobs is not None:\n", | |
| "                                if chunk_logprobs.top_logprobs:\n", | |
| "                                    logprobs_buffer.extend(chunk_logprobs.top_logprobs)\n", | |
| "    \n", | |
| "                        # A closing brace may complete a boxed answer: rescan the recent text.\n", | |
| "                        if '}' in new_text:\n", | |
| "                            search_text = ''.join(text_chunks[-self.cfg.search_tokens:])\n", | |
| "                            answer = self._scan_for_answer(search_text)\n", | |
| "    \n", | |
| "                            if answer is not None:\n", | |
| "                                final_answer = answer\n", | |
| "                                break\n", | |
| "    \n", | |
| "                finally:\n", | |
| "                    stream.close()\n", | |
| "    \n", | |
| "                # An interrupted stream leaves a truncated token buffer: do not parse it\n", | |
| "                # or run a tool call from it after the stop signal or deadline.\n", | |
| "                if final_answer is not None or interrupted:\n", | |
| "                    break\n", | |
| "    \n", | |
| "                if not token_buffer:\n", | |
| "                    break\n", | |
| "    \n", | |
| "                new_messages = encoding.parse_messages_from_completion_tokens(token_buffer, Role.ASSISTANT)\n", | |
| "                conversation.messages.extend(new_messages)\n", | |
| "                last_message = new_messages[-1]\n", | |
| "    \n", | |
| "                if last_message.channel == 'final':\n", | |
| "                    answer_text = last_message.content[0].text\n", | |
| "                    final_answer = self._scan_for_answer(answer_text)\n", | |
| "                    break\n", | |
| "    \n", | |
| "                if last_message.recipient == 'python':\n", | |
| "                    python_calls += 1\n", | |
| "                    tool_responses = local_tool.process_sync_plus(last_message)\n", | |
| "    \n", | |
| "                    response_text = tool_responses[0].content[0].text\n", | |
| "    \n", | |
| "                    if response_text.startswith('[ERROR]') or 'Traceback' in response_text or 'Error:' in response_text:\n", | |
| "                        python_errors += 1\n", | |
| "    \n", | |
| "                    conversation.messages.extend(tool_responses)\n", | |
| "    \n", | |
| "        except Exception as exc:\n", | |
| "            # Any failure ends the attempt; it is counted and reported instead of vanishing.\n", | |
| "            python_errors += 1\n", | |
| "            print(f'Attempt {attempt_index + 1} failed: {type(exc).__name__}: {exc}')\n", | |
| "    \n", | |
| "        finally:\n", | |
| "            if sandbox is not None:\n", | |
| "                try:\n", | |
| "                    sandbox.reset()\n", | |
| "    \n", | |
| "                except Exception as exc:\n", | |
| "                    print(f'Sandbox reset failed for attempt {attempt_index + 1}: {exc}')\n", | |
| "    \n", | |
| "                finally:\n", | |
| "                    # Always hand the sandbox back, otherwise the pool shrinks for good.\n", | |
| "                    self.sandbox_pool.put(sandbox)\n", | |
| "    \n", | |
| "        mean_entropy = self._compute_mean_entropy(logprobs_buffer)\n", | |
| "    \n", | |
| "        return {\n", | |
| "            'Attempt': attempt_index + 1,\n", | |
| "            'Response Length': total_tokens,\n", | |
| "            'Python Calls': python_calls,\n", | |
| "            'Python Errors': python_errors,\n", | |
| "            'Entropy': mean_entropy,\n", | |
| "            'Answer': final_answer\n", | |
| "        }\n", | |
| "    \n", | |
| " \n", | |
| "    def _select_answer(self, detailed_results: list) -> int:\n", | |
| "        '''Pick the final answer by inverse-entropy weighted voting.\n", | |
| "\n", | |
| "        Every attempt with an answer adds ``1 / max(entropy, 1e-9)`` to that answer's\n", | |
| "        score. The per-answer vote table is displayed and the highest-scoring answer\n", | |
| "        is returned.\n", | |
| "\n", | |
| "        Args:\n", | |
| "            detailed_results: Attempt dicts carrying ``Answer`` and ``Entropy`` keys.\n", | |
| "\n", | |
| "        Returns:\n", | |
| "            The winning answer, or ``0`` when no attempt produced one.\n", | |
| "        '''\n", | |
| "\n", | |
| "        weight_by_answer = defaultdict(float)\n", | |
| "        votes_by_answer = defaultdict(int)\n", | |
| "\n", | |
| "        for attempt in detailed_results:\n", | |
| "            candidate = attempt['Answer']\n", | |
| "            attempt_entropy = attempt['Entropy']\n", | |
| "\n", | |
| "            if candidate is None:\n", | |
| "                continue\n", | |
| "\n", | |
| "            # Lower entropy gives a heavier vote; the floor avoids division by zero.\n", | |
| "            weight_by_answer[candidate] += 1.0 / max(attempt_entropy, 1e-9)\n", | |
| "            votes_by_answer[candidate] += 1\n", | |
| "\n", | |
| "        ranking = sorted(\n", | |
| "            (\n", | |
| "                {'answer': candidate, 'votes': votes_by_answer[candidate], 'score': weight}\n", | |
| "                for candidate, weight in weight_by_answer.items()\n", | |
| "            ),\n", | |
| "            key=lambda entry: entry['score'],\n", | |
| "            reverse=True\n", | |
| "        )\n", | |
| "\n", | |
| "        vote_table = pd.DataFrame(\n", | |
| "            [(entry['answer'], entry['votes'], entry['score']) for entry in ranking],\n", | |
| "            columns=['Answer', 'Votes', 'Score']\n", | |
| "        ).round({'Score': 3})\n", | |
| "\n", | |
| "        display(vote_table)\n", | |
| "\n", | |
| "        winner = ranking[0]['answer'] if ranking else 0\n", | |
| "        print(f'\\nFinal Answer: {winner}\\n')\n", | |
| "\n", | |
| "        return winner\n", | |
| "    \n", | |
| " \n", | |
| "    def solve_problem(self, problem: str) -> int:\n", | |
| "        '''Solve one problem with parallel attempts and return the selected answer.\n", | |
| "\n", | |
| "        The time budget is the notebook time left minus ``cfg.base_problem_timeout``\n", | |
| "        for each other remaining problem, clamped between ``cfg.base_problem_timeout``\n", | |
| "        and ``cfg.high_problem_timeout``. ``cfg.attempts`` attempts are submitted to a\n", | |
| "        thread pool; collection stops early once one answer has been returned\n", | |
| "        ``cfg.early_stop`` times.\n", | |
| "\n", | |
| "        Returns:\n", | |
| "            The answer chosen by ``_select_answer``, or ``0`` if no attempt answered.\n", | |
| "        '''\n", | |
| "    \n", | |
| "        print(f'\\nProblem: {problem}\\n')\n", | |
| "    \n", | |
| "        user_input = f'{problem} {self.cfg.preference_prompt}'\n", | |
| "    \n", | |
| "        time_left = self.cfg.notebook_limit - (time.time() - self.notebook_start_time)\n", | |
| "        reserved_time = max(0, self.problems_remaining - 1) * self.cfg.base_problem_timeout\n", | |
| "    \n", | |
| "        # Clamp: never above the high timeout, never below the base timeout.\n", | |
| "        budget = max(\n", | |
| "            min(time_left - reserved_time, self.cfg.high_problem_timeout),\n", | |
| "            self.cfg.base_problem_timeout\n", | |
| "        )\n", | |
| "    \n", | |
| "        deadline = time.time() + budget\n", | |
| "    \n", | |
| "        print(f'Budget: {budget:.2f} seconds | Deadline: {deadline:.2f}\\n')\n", | |
| "    \n", | |
| "        attempt_indices = list(range(self.cfg.attempts))\n", | |
| "    \n", | |
| "        detailed_results = []\n", | |
| "        valid_answers = []\n", | |
| "    \n", | |
| "        stop_event = threading.Event()\n", | |
| "    \n", | |
| "        executor = ThreadPoolExecutor(max_workers=self.cfg.workers)\n", | |
| "    \n", | |
| "        try:\n", | |
| "            futures = [\n", | |
| "                executor.submit(\n", | |
| "                    self._process_attempt,\n", | |
| "                    user_input,\n", | |
| "                    self.cfg.system_prompt,\n", | |
| "                    attempt_index,\n", | |
| "                    stop_event,\n", | |
| "                    deadline\n", | |
| "                )\n", | |
| "                for attempt_index in attempt_indices\n", | |
| "            ]\n", | |
| "    \n", | |
| "            for future in as_completed(futures):\n", | |
| "                try:\n", | |
| "                    outcome = future.result()\n", | |
| "                    detailed_results.append(outcome)\n", | |
| "    \n", | |
| "                    if outcome['Answer'] is not None:\n", | |
| "                        valid_answers.append(outcome['Answer'])\n", | |
| "    \n", | |
| "                    leader = Counter(valid_answers).most_common(1)\n", | |
| "                    consensus = bool(leader) and leader[0][1] >= self.cfg.early_stop\n", | |
| "    \n", | |
| "                except Exception as exc:\n", | |
| "                    print(f'Future failed: {exc}')\n", | |
| "                    continue\n", | |
| "    \n", | |
| "                if consensus:\n", | |
| "                    # Enough agreement: signal running attempts and drop queued ones.\n", | |
| "                    stop_event.set()\n", | |
| "    \n", | |
| "                    for pending in futures:\n", | |
| "                        pending.cancel()\n", | |
| "    \n", | |
| "                    break\n", | |
| "    \n", | |
| "        finally:\n", | |
| "            stop_event.set()\n", | |
| "            executor.shutdown(wait=True, cancel_futures=True)\n", | |
| "    \n", | |
| "            self.problems_remaining = max(0, self.problems_remaining - 1)\n", | |
| "    \n", | |
| "        if detailed_results:\n", | |
| "            results_dataframe = pd.DataFrame(detailed_results)\n", | |
| "            results_dataframe['Entropy'] = results_dataframe['Entropy'].round(3)\n", | |
| "            results_dataframe['Answer'] = results_dataframe['Answer'].astype('Int64')\n", | |
| "    \n", | |
| "            display(results_dataframe)\n", | |
| "    \n", | |
| "        if not valid_answers:\n", | |
| "            print('\\nResult: 0\\n')\n", | |
| "    \n", | |
| "            return 0\n", | |
| "    \n", | |
| "        return self._select_answer(detailed_results)\n", | |
| "    \n", | |
| " \n", | |
| "    def __del__(self):\n", | |
| "        '''Best-effort teardown: stop the server, close its log, close pooled sandboxes.\n", | |
| "\n", | |
| "        The server is asked to terminate and given a bounded grace period; if it has\n", | |
| "        not exited by then it is killed, so the finalizer cannot block forever. Each\n", | |
| "        stage is isolated so a failure in one does not skip the others.\n", | |
| "        '''\n", | |
| "    \n", | |
| "        grace_seconds = 30\n", | |
| "    \n", | |
| "        server = getattr(self, 'server_process', None)\n", | |
| "    \n", | |
| "        if server is not None:\n", | |
| "            try:\n", | |
| "                server.terminate()\n", | |
| "    \n", | |
| "                try:\n", | |
| "                    server.wait(timeout=grace_seconds)\n", | |
| "    \n", | |
| "                except subprocess.TimeoutExpired:\n", | |
| "                    # Terminate was not enough: force the process down and reap it.\n", | |
| "                    server.kill()\n", | |
| "                    server.wait()\n", | |
| "    \n", | |
| "            except Exception:\n", | |
| "                # Finalizers must not raise; carry on with the remaining cleanup.\n", | |
| "                pass\n", | |
| "    \n", | |
| "        log_file = getattr(self, 'log_file', None)\n", | |
| "    \n", | |
| "        if log_file is not None:\n", | |
| "            try:\n", | |
| "                log_file.close()\n", | |
| "    \n", | |
| "            except Exception:\n", | |
| "                pass\n", | |
| "    \n", | |
| "        if hasattr(self, 'sandbox_pool'):\n", | |
| "            while not self.sandbox_pool.empty():\n", | |
| "                try:\n", | |
| "                    sb = self.sandbox_pool.get_nowait()\n", | |
| "                    sb.close()\n", | |
| "    \n", | |
| "                except Exception:\n", | |
| "                    pass\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "e4b5d5c4", | |
| "metadata": { | |
| "_kg_hide-output": true, | |
| "execution": { | |
| "iopub.execute_input": "2026-03-02T23:35:47.855262Z", | |
| "iopub.status.busy": "2026-03-02T23:35:47.854844Z", | |
| "iopub.status.idle": "2026-03-02T23:39:40.006315Z", | |
| "shell.execute_reply": "2026-03-02T23:39:40.005829Z" | |
| }, | |
| "papermill": { | |
| "duration": 232.15726, | |
| "end_time": "2026-03-02T23:39:40.007229", | |
| "exception": false, | |
| "start_time": "2026-03-02T23:35:47.849969", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Module-level solver instance; predict() calls solve_problem() on it.\n", | |
| "solver = AIMO3Solver(CFG)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "afc60312", | |
| "metadata": { | |
| "execution": { | |
| "iopub.execute_input": "2026-03-02T23:39:40.017673Z", | |
| "iopub.status.busy": "2026-03-02T23:39:40.017228Z", | |
| "iopub.status.idle": "2026-03-02T23:39:40.020526Z", | |
| "shell.execute_reply": "2026-03-02T23:39:40.020119Z" | |
| }, | |
| "papermill": { | |
| "duration": 0.009011, | |
| "end_time": "2026-03-02T23:39:40.021239", | |
| "exception": false, | |
| "start_time": "2026-03-02T23:39:40.012228", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def predict(id_: pl.DataFrame, question: pl.DataFrame, answer: Optional[pl.DataFrame] = None) -> pl.DataFrame:\n", | |
| "    '''Solve one problem with the global ``solver`` and return its answer frame.\n", | |
| "\n", | |
| "    Args:\n", | |
| "        id_: Carries the problem id, read with ``.item(0)``.\n", | |
| "        question: Carries the problem text, read with ``.item(0)``.\n", | |
| "        answer: Not used by this function.\n", | |
| "\n", | |
| "    Returns:\n", | |
| "        A frame with ``id`` and ``answer`` columns.\n", | |
| "    '''\n", | |
| "    \n", | |
| "    id_value = id_.item(0)\n", | |
| "    question_text = question.item(0)\n", | |
| "    \n", | |
| "    # The collector is paused while solving. It is restored in ``finally`` so that an\n", | |
| "    # exception from the solver cannot leave it disabled for the rest of the process.\n", | |
| "    gc.disable()\n", | |
| "    \n", | |
| "    try:\n", | |
| "        final_answer = solver.solve_problem(question_text)\n", | |
| "    \n", | |
| "    finally:\n", | |
| "        gc.enable()\n", | |
| "        gc.collect()\n", | |
| "    \n", | |
| "    return pl.DataFrame({'id': id_value, 'answer': final_answer})\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "ced14a16", | |
| "metadata": { | |
| "_kg_hide-output": true, | |
| "execution": { | |
| "iopub.execute_input": "2026-03-02T23:39:40.031483Z", | |
| "iopub.status.busy": "2026-03-02T23:39:40.031316Z", | |
| "iopub.status.idle": "2026-03-02T23:40:01.076426Z", | |
| "shell.execute_reply": "2026-03-02T23:40:01.075934Z" | |
| }, | |
| "papermill": { | |
| "duration": 21.051383, | |
| "end_time": "2026-03-02T23:40:01.077321", | |
| "exception": false, | |
| "start_time": "2026-03-02T23:39:40.025938", | |
| "status": "completed" | |
| }, | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Wrap predict() in the AIMO 3 inference server.\n", | |
| "inference_server = kaggle_evaluation.aimo_3_inference_server.AIMO3InferenceServer(predict)\n", | |
| "\n", | |
| "# Any non-empty KAGGLE_IS_COMPETITION_RERUN value selects the competition path.\n", | |
| "if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):\n", | |
| "    inference_server.serve()\n", | |
| "    \n", | |
| "else:\n", | |
| "    # Outside a competition rerun: run the local gateway against TEST_CSV.\n", | |
| "    inference_server.run_local_gateway(\n", | |
| "        (str(TEST_CSV),)\n", | |
| "    )\n" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.12.12" | |
| }, | |
| "papermill": { | |
| "default_parameters": {}, | |
| "duration": 538.657343, | |
| "end_time": "2026-03-02T23:40:02.902800", | |
| "environment_variables": {}, | |
| "exception": null, | |
| "input_path": "__notebook__.ipynb", | |
| "output_path": "__notebook__.ipynb", | |
| "parameters": {}, | |
| "start_time": "2026-03-02T23:31:04.245457", | |
| "version": "2.6.0" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } | |
Xet Storage Details
- Size:
- 60.4 kB
- Xet hash:
- bf3ce15ae9abdaf6a55838e462d9877ce30d5603d6af7cb69994fffefcc47cc2
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.