HarpreetK committed
Commit 45cbd2e · verified · 1 parent: 6a76c42

Upload Final_Submission_gpt-oss-score21.ipynb


This was the final submission made to the AIMO competition. However, a higher-scoring notebook was not submitted because its supporting library, a modified version of NeMo-Skills, was accidentally deleted.

Final_Submission_gpt-oss-score21.ipynb ADDED
@@ -0,0 +1 @@
+ {"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.12.12"},"kaggle":{"accelerator":"none","dataSources":[{"sourceType":"competition","sourceId":118448,"databundleVersionId":14559231},{"sourceType":"datasetVersion","sourceId":15713265,"datasetId":10066925,"databundleVersionId":16653414},{"sourceType":"datasetVersion","sourceId":14846606,"datasetId":9495812,"databundleVersionId":15706375},{"sourceType":"modelInstanceVersion","sourceId":827437,"databundleVersionId":16606906,"modelInstanceId":629147,"modelId":641049},{"sourceType":"kernelVersion","sourceId":303511002},{"sourceType":"kernelVersion","sourceId":303518560}],"dockerImageVersionId":31329,"isInternetEnabled":false,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"\n# Setup The Environment","metadata":{"editable":true,"slideshow":{"slide_type":""},"tags":[]}},{"cell_type":"code","source":"# Track Overall Time\nimport time\nglobal_deadline = time.perf_counter() + 5*3600 \nglobal_remaining = global_deadline - time.perf_counter()\ncutoff_duration = global_remaining - 350\ndef get_global_remaining():\n return max(0, global_deadline - time.perf_counter())","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T17:56:46.433668Z","iopub.execute_input":"2026-04-13T17:56:46.434244Z","iopub.status.idle":"2026-04-13T17:56:46.443246Z","shell.execute_reply.started":"2026-04-13T17:56:46.434225Z","shell.execute_reply":"2026-04-13T17:56:46.442794Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"import os, sys\noriginal_pythonpath = os.environ.get(\"PYTHONPATH\", \"\")\npath1 = '/kaggle/input/datasets/hpkaur34/gpt-nemo/Gpt-oss'\npath2 = '/kaggle/usr/lib/notebooks/hpkaur34/install_utility_nemo_run/'\nnew_paths = f\"{path1}:{path2}\"\nmerged_pythonpath = f\"{new_paths}:{original_pythonpath}\" if original_pythonpath else new_path\nos.environ[\"PYTHONPATH\"] = merged_pythonpath\nsys.path.append('/kaggle/input/datasets/hpkaur34/gpt-nemo/Gpt-oss')\nsys.path.append('/kaggle/usr/lib/notebooks/hpkaur34/install_utility_nemo_run/')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T17:56:46.443975Z","iopub.execute_input":"2026-04-13T17:56:46.444112Z","iopub.status.idle":"2026-04-13T17:56:46.458196Z","shell.execute_reply.started":"2026-04-13T17:56:46.444098Z","shell.execute_reply":"2026-04-13T17:56:46.457796Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"import subprocess\ndef set_env(input_archive, temp_dir):\n if not os.path.exists(temp_dir):\n os.makedirs(temp_dir, exist_ok=True)\n subprocess.run(['tar', '-xzf', input_archive, '-C', temp_dir], check=True)\n subprocess.run([\n sys.executable, \n '-m', \n 'pip', \n 'install', \n '--no-index', \n '--find-links', \n f'{temp_dir}/wheels',\n 'paramiko',\n 'math_verify', \n 'litellm',\n 'flashinfer-python', \n 'vllm==0.11.2', \n 'openai_harmony',\n ], check=False)\n \ntry:\n set_env(\n input_archive='/kaggle/usr/lib/notebooks/hpkaur34/aimo_utility_copy/wheels.tar.gz', \n temp_dir='/kaggle/tmp/setup'\n )\nexcept Exception as e:\n print(f\"⚠️ set_env failed: {e}\")\n print(\"Continuing 
execution...\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T17:56:46.458726Z","iopub.execute_input":"2026-04-13T17:56:46.458860Z","iopub.status.idle":"2026-04-13T18:00:21.404043Z","shell.execute_reply.started":"2026-04-13T17:56:46.458846Z","shell.execute_reply":"2026-04-13T18:00:21.403601Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"import os\nos.environ[\"CUDA_LAUNCH_BLOCKING\"] = \"1\"\nimport torch","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:00:21.405015Z","iopub.execute_input":"2026-04-13T18:00:21.405165Z","iopub.status.idle":"2026-04-13T18:00:25.893062Z","shell.execute_reply.started":"2026-04-13T18:00:21.405150Z","shell.execute_reply":"2026-04-13T18:00:25.892597Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"\"\"\"\nimport logging\nlogging.basicConfig(level=logging.DEBUG)\n\"\"\"","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:00:25.893681Z","iopub.execute_input":"2026-04-13T18:00:25.893899Z","iopub.status.idle":"2026-04-13T18:00:25.897662Z","shell.execute_reply.started":"2026-04-13T18:00:25.893881Z","shell.execute_reply":"2026-04-13T18:00:25.897296Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"import asyncio\nimport torch\nimport subprocess\nimport warnings\nimport glob\nimport kaggle_evaluation.aimo_3_inference_server\nimport pandas as pd\nimport traceback\nimport nest_asyncio\nimport httpx\nimport re\nimport time\nimport copy\nimport json\nimport requests\nimport pandas as pd\nimport polars as pl\nfrom collections import Counter\nfrom typing import List\nimport secrets\nimport json\npd.set_option('display.max_colwidth', None)\nwarnings.filterwarnings(\"ignore\", category=SyntaxWarning)\nnest_asyncio.apply()\nos.environ[\"TORCH_COMPILE_DISABLE\"] = \"1\"\nos.environ[\"TORCHDYNAMO_DISABLE\"] = \"1\"\nos.environ['TRANSFORMERS_NO_FLAX'] = '1'\nos.environ['CUDA_VISIBLE_DEVICES'] = '0'\nos.environ['TOKENIZERS_PARALLELISM'] = 'false'\nos.environ['TRITON_PTXAS_PATH'] = '/usr/local/cuda/bin/ptxas'\nos.environ['TIKTOKEN_RS_CACHE_DIR']= \"/kaggle/input/datasets/hpkaur34/harmony-encoding\"\nos.environ[\"TORCH_CUDA_ARCH_LIST\"] = '9.0'\nos.environ[\"PYTORCH_CUDA_ALLOC_CONF\"]=\"expandable_segments:True\"\n#os.environ[\"VLLM_USE_FLASHINFER_SAMPLER\"]= \"1\"\nfrom collections import Counter, defaultdict","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:00:25.898248Z","iopub.execute_input":"2026-04-13T18:00:25.898386Z","iopub.status.idle":"2026-04-13T18:00:30.046482Z","shell.execute_reply.started":"2026-04-13T18:00:25.898371Z","shell.execute_reply":"2026-04-13T18:00:30.046030Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# This will change in kaggle\nos.environ[\"TORCHINDUCTOR_CACHE_DIR\"] = \"torch_cache\"\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:00:30.047106Z","iopub.execute_input":"2026-04-13T18:00:30.047380Z","iopub.status.idle":"2026-04-13T18:00:30.049700Z","shell.execute_reply.started":"2026-04-13T18:00:30.047363Z","shell.execute_reply":"2026-04-13T18:00:30.049350Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"from nemo_skills.code_execution.sandbox import get_sandbox\nfrom nemo_skills.inference.model import get_code_execution_model\nfrom nemo_skills.prompt.utils import get_prompt\nfrom nemo_skills.inference.model import 
get_model","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:00:30.050229Z","iopub.execute_input":"2026-04-13T18:00:30.050352Z","iopub.status.idle":"2026-04-13T18:01:14.130302Z","shell.execute_reply.started":"2026-04-13T18:00:30.050340Z","shell.execute_reply":"2026-04-13T18:01:14.129864Z"}},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"# Configuration Parameters","metadata":{}},{"cell_type":"code","source":"host = \"127.0.0.1\"\nport = 5000\ntp_size = 1\nmax_public = 10\nmax_tokens = 38000\nmax_input_tokens = 2050\ntokens_to_generate = 35950 - 10\nmax_batch_size = 8\ntimeout_seconds = 300\nglobal_buffer = 350\nfinish_at_last_n = 2\nmax_code_output_characters = 1100\ncode_execution_timeout = 10\nmax_code_executions = 125\ng_score = 0\ng_count = 0\nprompt_score = Counter()\nsampling_params = {\n \"tokens_to_generate\": tokens_to_generate,\n \"temperature\": 1, # 0.2,\n \"top_p\": 1,\n}\n\nthoughts = [\"\"] * 50\nthoughts = thoughts[:max_batch_size]\ni = 0","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:14.130955Z","iopub.execute_input":"2026-04-13T18:01:14.131299Z","iopub.status.idle":"2026-04-13T18:01:14.135037Z","shell.execute_reply.started":"2026-04-13T18:01:14.131280Z","shell.execute_reply":"2026-04-13T18:01:14.134629Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"model_path = \"/kaggle/input/models/hpkaur34/gpt-oss-120b/transformers/default/1\"","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:14.136641Z","iopub.execute_input":"2026-04-13T18:01:14.136811Z","iopub.status.idle":"2026-04-13T18:01:14.159163Z","shell.execute_reply.started":"2026-04-13T18:01:14.136793Z","shell.execute_reply":"2026-04-13T18:01:14.158751Z"}},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"# Start Server - Load Model & Sandbox","metadata":{}},{"cell_type":"code","source":"server_started = False\ndef load_model():\n cmd = [\n \"python\",\n \"-m\",\n \"nemo_skills.inference.server.serve_vllm\",\n f\"--model={model_path}\",\n \"--port=5000\",\n \"--num_gpus=1\",\n \"--max-model-len=38000\", \n \"--max-num-batched-tokens=16384\",\n \"--max-num-seqs=11\", \n \"--max-cudagraph-capture-size=2048\",\n \"--gpu-memory-utilization=0.95\",\n \"--kv-cache-dtype=fp8_e4m3\",\n \"--stream-interval=200\",\n \"--enable-prefix-caching\",\n \"--uvicorn-log-level debug\",\n \"--enable-log-requests\",\n \"--enable-log-outputs\",\n \"--async-scheduling\", \n ]\n\n log_file = open(\"vllm.log\", \"w\")\n vllm_server = subprocess.Popen(\n cmd,\n stdout=log_file,\n stderr=log_file,\n text=True,\n bufsize=1 # line-buffered\n )\n return vllm_server","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:14.159748Z","iopub.execute_input":"2026-04-13T18:01:14.159899Z","iopub.status.idle":"2026-04-13T18:01:14.170928Z","shell.execute_reply.started":"2026-04-13T18:01:14.159881Z","shell.execute_reply":"2026-04-13T18:01:14.170557Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"vllm_server=load_model()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:14.171466Z","iopub.execute_input":"2026-04-13T18:01:14.171616Z","iopub.status.idle":"2026-04-13T18:01:14.186612Z","shell.execute_reply.started":"2026-04-13T18:01:14.171596Z","shell.execute_reply":"2026-04-13T18:01:14.186249Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"def wait_for_server(url=f\"http://{host}:{port}\", timeout=1400):\n start = 
time.perf_counter()\n while True:\n try:\n r = requests.get(f\"{url}/docs\")\n if r.status_code == 200:\n print(\"✅ Server is ready\",time.perf_counter()-start)\n return True\n except Exception:\n pass\n\n if time.perf_counter() - start > timeout:\n raise TimeoutError(\"Server did not start in time\")\n\n time.sleep(1)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:14.187117Z","iopub.execute_input":"2026-04-13T18:01:14.187303Z","iopub.status.idle":"2026-04-13T18:01:14.195421Z","shell.execute_reply.started":"2026-04-13T18:01:14.187285Z","shell.execute_reply":"2026-04-13T18:01:14.195027Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"def sandbox_server():\n log_file = open(\"sandbox.log\", \"w\")\n sandbox_process = subprocess.Popen(\n [\"python\", \"-m\", \"nemo_skills.code_execution.local_sandbox.local_sandbox_server\"],\n stdout=log_file,\n stderr=log_file,\n text=True,\n bufsize=1)\n\n time.sleep(3)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:14.195964Z","iopub.execute_input":"2026-04-13T18:01:14.196105Z","iopub.status.idle":"2026-04-13T18:01:14.207285Z","shell.execute_reply.started":"2026-04-13T18:01:14.196088Z","shell.execute_reply":"2026-04-13T18:01:14.206802Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"time.sleep(2)\nsandbox_server()\nsandbox = get_sandbox() # localhost by default","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:14.207817Z","iopub.execute_input":"2026-04-13T18:01:14.207971Z","iopub.status.idle":"2026-04-13T18:01:19.262087Z","shell.execute_reply.started":"2026-04-13T18:01:14.207953Z","shell.execute_reply":"2026-04-13T18:01:19.261631Z"}},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"# Prompt Types and Updating Prompt","metadata":{}},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"default_prompt = (\n 'You are an elite mathematical problem solver with expertise at the International '\n 'Mathematical Olympiad (IMO) level. Your goal is to find the correct answer through '\n 'rigorous mathematical reasoning.\\n\\n'\n \n '# Problem-Solving Approach:\\n'\n '1. UNDERSTAND: Carefully read and rephrase the problem in your own words. '\n 'Identify what is given, what needs to be found, and any constraints.\\n'\n '2. EXPLORE: Consider multiple solution strategies. Think about relevant theorems, '\n 'techniques, patterns, or analogous problems. Don\\'t commit to one approach immediately.\\n'\n '3. PLAN: Select the most promising approach and outline key steps before executing.\\n'\n '4. EXECUTE: Work through your solution methodically. Show all reasoning steps clearly.\\n'\n '5. VERIFY: Check your answer by substituting back, testing edge cases, or using '\n 'alternative methods. 
Ensure logical consistency throughout.\\n\\n'\n \n '# Mathematical Reasoning Principles:\\n'\n '- Break complex problems into smaller, manageable sub-problems\\n'\n '- Look for patterns, symmetries, and special cases that provide insight\\n'\n '- Use concrete examples to build intuition before generalizing\\n'\n '- Consider extreme cases and boundary conditions\\n'\n '- If stuck, try working backwards from the desired result\\n'\n '- Be willing to restart with a different approach if needed\\n\\n'\n \n '# Verification Requirements:\\n'\n '- Cross-check arithmetic and algebraic manipulations\\n'\n '- Verify that your solution satisfies all problem constraints\\n'\n '- Test your answer with simple cases or special values when possible\\n'\n '- Ensure dimensional consistency and reasonableness of the result\\n\\n'\n \n \"#RESPONSE FORMAT:\\n\\n\"\n \"The final answer must be a non-negative integer.\\n. Instead of the \\\\boxed{} format use json format. Follow the instructions for the format-\"\n ' \"Answer\": <non-negative integer>,\"Confidence\": <number between 0 and 1>'\n \"Do not output any additional reasoning after this JSON.\\n\"\n )\n ","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:19.262726Z","iopub.execute_input":"2026-04-13T18:01:19.262870Z","iopub.status.idle":"2026-04-13T18:01:19.266048Z","shell.execute_reply.started":"2026-04-13T18:01:19.262851Z","shell.execute_reply":"2026-04-13T18:01:19.265674Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"answerext_prompt = (\n \"You are an answer extraction system for competition mathematics problems \"\n \"(olympiad, Putnam, HMMT, AMC/AIME style and similar).\\n\"\n \"You will be given a PROBLEM and a MODEL_RESPONSE. The response may be \"\n \"incomplete, truncated mid-reasoning, or cut off before a formal conclusion.\\n\\n\"\n \"YOUR JOB:\\n\"\n \"1. Read the problem and identify the SOUGHT QUANTITY — this could be a \"\n \"numerical value, an expression, a set of solutions, a function, a \"\n \"characterization, a bound, a count, an extremal quantity, a geometric \"\n \"measure, or a closed-form answer that the problem asks to find, \"\n \"determine, or compute.\\n\"\n \"2. Extract the model's best answer to that sought quantity from the \"\n \"response, even if:\\n\"\n \" - It is not explicitly labeled with 'the answer is' or 'therefore'\\n\"\n \" - It appears mid-sentence or mid-calculation\\n\"\n \" - The response was truncated before a formal conclusion\\n\"\n \" - It is stated with hedging language like 'seems to be,' 'so we get,' \"\n \"or 'this gives'\\n\\n\"\n \"RULES FOR IDENTIFYING THE ANSWER:\\n\"\n \"- The model's reasoning often explores multiple cases, subcases, or \"\n \"candidate values. Distinguish between:\\n\"\n \" (a) INTERMEDIATE SUB-RESULTS: values computed within a single case \"\n \"or step (e.g., 'LCM 60,' 'sum = 14,' 'this gives 42') that feed \"\n \"into the broader argument but do not directly answer the problem.\\n\"\n \" (b) THE CANDIDATE ANSWER: the value, expression, or characterization \"\n \"of the sought quantity that the model is building toward or \"\n \"accumulating evidence for across its reasoning.\\n\"\n \" Extract (b), not (a).\\n\"\n \"- In optimization or extremal problems, the model may test many \"\n \"configurations and compare them against a leading candidate value. 
\"\n \"If the model repeatedly checks whether alternatives can 'beat,' \"\n \"'exceed,' or 'improve upon' a particular value, and none do, \"\n \"treat that value as the candidate answer — even if the response \"\n \"ends before a formal conclusion.\\n\"\n \"- If the model arrives at the same value through multiple approaches \"\n \"or repeatedly returns to it as the best result, that is strong \"\n \"signal it is the intended answer.\\n\"\n \"- If the response is truncated but a candidate answer is visible \"\n \"from the reasoning so far, extract it. A truncated response with \"\n \"a clear leading candidate is better than no answer.\\n\"\n \"- If the problem asks to 'compute,' 'find,' or 'determine' a specific \"\n \"quantity, look for the last/best concrete value of THAT SPECIFIC \"\n \"quantity — not intermediate quantities used along the way.\\n\"\n \"- If the answer appears in LaTeX formatting (e.g., \\\\boxed{140}, \"\n \"$\\\\frac{7}{3}$, or similar), extract the value inside the formatting.\\n\\n\"\n \"RESPONSE FORMAT:\\n\"\n \"Your ONLY task is to output a single JSON object — no preamble, no explanation, no mathematical calculations.\\n\"\n \"The final answer must be a non-negative integer. Instead of the \\\\boxed{{}} format use json format. Follow the instructions for the format-\"\n ' {{\"Answer\": <non-negative integer>,\"Confidence\": <number between 0 and 1>}}'\n \"Do not output any additional reasoning after this JSON.\\n\"\n )","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:19.266607Z","iopub.execute_input":"2026-04-13T18:01:19.266782Z","iopub.status.idle":"2026-04-13T18:01:19.285215Z","shell.execute_reply.started":"2026-04-13T18:01:19.266757Z","shell.execute_reply":"2026-04-13T18:01:19.284859Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# Below will change\nsystem_message='{system_prompt}'\nprompt_template = get_prompt(prompt_config='gpt-oss/math',system_message=system_message,tokenizer=model_path,code_tags=\"gpt-oss\")\nchat_template_kwargs = {\n \"builtin_tools\": [\"python\"],\n \"reasoning_effort\":\"high\"\n \n}","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:19.285748Z","iopub.execute_input":"2026-04-13T18:01:19.285894Z","iopub.status.idle":"2026-04-13T18:01:20.567214Z","shell.execute_reply.started":"2026-04-13T18:01:19.285875Z","shell.execute_reply":"2026-04-13T18:01:20.566784Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"def safe_concat(a, b,function_name):\n if a is None or b is None:\n raise ValueError(f\"Cannot concatenate: a={a}, b={b}, Error Raised from function {function_name}\")\n return a + b","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:20.567806Z","iopub.execute_input":"2026-04-13T18:01:20.567949Z","iopub.status.idle":"2026-04-13T18:01:20.570675Z","shell.execute_reply.started":"2026-04-13T18:01:20.567931Z","shell.execute_reply":"2026-04-13T18:01:20.570272Z"}},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"# Data Extraction & Early Stopping","metadata":{}},{"cell_type":"code","source":"class Result:\n def __init__(self):\n self.early_stop_flag = False\n def best_voted_answer(self):\n return self.best_answer\n \n def majority_voting(self, answer_list):\n count = defaultdict(float)\n # Keep raw list separate; filter into valid_answers\n self.answer_list = answer_list\n self.valid_answers = [x[\"Answer\"] for x in self.answer_list if x[\"Answer\"] != -1]\n print(\"Answer_list after 
popping -1\", self.valid_answers, \"%%%%\")\n\n # BUG FIX: set fallback when all answers are invalid\n if len(self.valid_answers) == 0:\n self.best_answer = None\n self.best_count = 0\n self.second_count = 0\n self.sorted_answers = []\n return\n\n for a in self.valid_answers:\n count[a] += 1\n self.sorted_answers = sorted(count.items(), key=lambda x: x[1], reverse=True)\n\n self.best_answer, self.best_count = self.sorted_answers[0]\n self.second_count = self.sorted_answers[1][1] if len(self.sorted_answers) > 1 else 0\n\n if (\n self.best_count == 1\n and self.best_answer == 0\n and len(self.sorted_answers) > 1\n and self.sorted_answers[1] is not None\n ):\n\n self.best_answer, self.best_count = self.sorted_answers[1] \n \n\n def early_stop(self, answer_list, num_done):\n print(\"Num_done is\",num_done)\n self.num_done = num_done\n self.majority_voting(answer_list)\n n_valid = len(self.valid_answers)\n best = self.best_count\n gap = self.best_count - self.second_count\n print(f\"Num done: {self.num_done}, Valid answers: {n_valid}, \"\n f\"Best count: {best}, Second count: {self.second_count}\")\n\n if n_valid == 0:\n return False\n\n if best >= 4 and gap >= 2:\n self.early_stop_flag = True\n print(f\">>> EARLY STOP at {self.num_done} completions | \"\n f\"best={self.best_answer} (count={best}, gap={gap})\")\n\n return self.early_stop_flag \n \n def get_best_answer(self,answer_list, num_done, flag):\n if not flag:\n self.majority_voting(answer_list)\n else:\n self.early_stop(answer_list, num_done)\n return self.best_voted_answer(), self.early_stop_flag\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:20.571223Z","iopub.execute_input":"2026-04-13T18:01:20.571357Z","iopub.status.idle":"2026-04-13T18:01:20.583953Z","shell.execute_reply.started":"2026-04-13T18:01:20.571341Z","shell.execute_reply":"2026-04-13T18:01:20.583557Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"import re, requests\n\nclass Answer:\n def __init__(self):\n self.best_answer = None\n self.input_message = \"\"\n self.best_count = 0\n self.second_count = 0\n self.answer_list = [] # ← was None, init as empty list\n self.early_stop_flag = False\n self.sorted_answers = []\n self.valid_answers = [] # ← filtered list (no -1s), kept separate\n self.sampling_param = {\n \"tokens_to_generate\": 7000,\n \"temperature\": 0.9, # 0.2,\n \"top_p\": 0.95,\n }\n self.timeout = httpx.Timeout(\n connect=60.0,\n read=300.0,\n write=60.0,\n pool=120.0,\n )\n\n def clean_messages(self, text):\n cleaned = re.sub(r'<\\|[^|]*\\|>', '', text)\n return cleaned.strip()\n\n \n async def extract_answer(self, question, model_output):\n answer = -1\n confidence = -0.1\n seed = secrets.randbits(32)\n input_message = self.clean_messages(model_output)\n rid = secrets.token_hex(8)\n message = prompt_template.fill(\n input_dict={\n \"problem\": safe_concat(question,input_message,\"extract_answer\"),\n \"system_prompt\": answerext_prompt,\n },\n chat_template_kwargs = chat_template_kwargs,\n format_as_string=True\n )\n print(prompt_template)\n print(\"textd was called\")\n try:\n data, completion_tokens = await server_obj.generate_response(\n prompt=message,\n random_seed=seed,\n stream=True,\n calling_function = \"extract_answer\",\n extra_body={\"request_id\": rid},\n timeout = self.timeout,\n **self.sampling_param,\n )\n \n if data is not None and isinstance(data, dict):\n return data\n else:\n return {\"Answer\":-1, \"Confidence\":-0.1}\n \n except Exception as e:\n print(f\"[extract_answer failed] 
{type(e).__name__}: {e}\")\n return {\"Answer\":answer,\"Confidence\": confidence}\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:20.584510Z","iopub.execute_input":"2026-04-13T18:01:20.584662Z","iopub.status.idle":"2026-04-13T18:01:20.600319Z","shell.execute_reply.started":"2026-04-13T18:01:20.584644Z","shell.execute_reply":"2026-04-13T18:01:20.599926Z"}},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"# Inference","metadata":{}},{"cell_type":"code","source":"# Below will change in kaggle\n#Instantiate Server Object\nserver_obj = get_code_execution_model(server_type = 'vllm',\n model=model_path,\n base_url=\"http://127.0.0.1:5000/v1\",\n api_key='EMPTY',\n sandbox=sandbox,\n code_execution={\n 'max_code_output_characters': max_code_output_characters,\n 'code_execution_timeout': code_execution_timeout,\n 'max_code_executions': max_code_executions,\n }) \n\nasync def abort_request(request_ids: str | list[str]):\n \"\"\"Sequential best-effort server-side abort.\n Uses short timeouts so a slow/down server doesn't block.\n Silently ignores failures.\n \"\"\"\n if isinstance(request_ids, str):\n request_ids = [request_ids]\n\n timeout = httpx.Timeout(connect=1.0, read=2.0, write=1.0, pool=1.0)\n\n async with httpx.AsyncClient(timeout=timeout) as client:\n for rid in request_ids:\n try:\n await client.delete(f\"http://{host}:{port}/v1/requests/{rid}\")\n except Exception:\n # optionally log instead of silent pass\n pass\n await asyncio.sleep(0.05) # cooperative yield","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:20.600935Z","iopub.execute_input":"2026-04-13T18:01:20.601080Z","iopub.status.idle":"2026-04-13T18:01:21.408853Z","shell.execute_reply.started":"2026-04-13T18:01:20.601062Z","shell.execute_reply":"2026-04-13T18:01:21.408370Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"class ClientClass:\n def __init__(self, prompt):\n global sampling_params\n self.thresh_hold = 4 # minimum completions before checking early stop\n self.system_prompt = prompt\n self.answer = {}\n self.randomseed_list = []\n self.num_done = 0\n self.sampling_param = copy.deepcopy(sampling_params)\n self.question = \"\"\n self.finished_generations = []\n self.final_answer = None\n self.early_stop_flag = False\n self.flattened_prompt_list = []\n self.list_of_questions = []\n self.answer_list = []\n self.request_ids = [] # per-task IDs for server-side abort\n self.tasks = []\n self.timeout = httpx.Timeout(\n connect=30.0,\n read= 500.0 ,\n write=30.0,\n pool=120.0,\n )\n self.answerobj = Answer()\n \n async def send_request_to_server(self):\n print(\"Request sent\")\n self.request_ids = [secrets.token_hex(8) for _ in self.list_of_questions]\n self.randomseed_list = [k for k in range(len(self.list_of_questions))]\n for prompt, seed, rid in zip(self.list_of_questions, self.randomseed_list, self.request_ids): \n task = asyncio.create_task(\n server_obj.generate_async(\n prompt=prompt,\n random_seed=seed,\n timeout=self.timeout,\n remove_stop_phrases=False,\n stream = True, \n extra_body={\"request_id\": rid},\n **prompt_template.get_code_execution_args(),\n **self.sampling_param,\n )\n )\n self.tasks.append(task)\n \n try:\n processed = set()\n for completed in asyncio.as_completed(self.tasks):\n try:\n result = await completed\n print(\"Total number of generated tokens\", result[\"total_num_generated_tokens\"])\n self.num_done += 1\n processed.add(completed) # this adds the task to processed\n 
self.finished_generations.append(result[\"generation\"])\n if result[\"answer\"] is not None:\n self.answer = json.loads(result[\"answer\"])\n print(\"The answer and confidence after json parsing\", self.answer)\n yield self.answer\n else:\n self.answer = await self.answerobj.extract_answer(self.question, result[\"generation\"]) \n print(\"The answer and confidence after interaction with 2nd model\",self.answer) \n yield self.answer\n except GeneratorExit:\n return \n except Exception as e:\n traceback.print_exc()\n error_type = type(e).__name__\n print(f\"[ERROR] {error_type}\")\n traceback.print_exc()\n self.answer = {\n \"Answer\": -1,\n \"Confidence\": -0.1,\n } \n yield self.answer\n\n finally: \n #fallback in the Pipeline timeout handler. Timout\n for t in self.tasks:\n if t.done() and t not in processed:\n try:\n if not t.cancelled() and t.exception() is None:\n self.res = t.result()\n \n elif t.exception() is not None:\n # optional: handle failed tasks\n pass\n except Exception:\n pass\n elif not t.done():\n t.cancel() \n asyncio.create_task(abort_request(self.request_ids))\n \n # Fire server-side abort independently — survives parent cancellation\n\n def flatten_prompt_list(self):\n global max_batch_size\n self.flattened_prompt_list = [\n self.system_prompt\n # for system_prompt in self.prompts_list\n for _ in range(max_batch_size)\n ]\n\n def generate_question_copies(self, question):\n self.question = question\n self.list_of_questions = [\n prompt_template.fill(\n input_dict={\n \"problem\": question,\n \"system_prompt\": system_prompt,\n },\n chat_template_kwargs = chat_template_kwargs,\n format_as_string=True\n )\n for system_prompt in self.flattened_prompt_list\n ]\n \n \n async def predict_for_question(self, question):\n self.flatten_prompt_list()\n self.generate_question_copies(question)\n\n gen = self.send_request_to_server()\n\n try:\n async for answer in gen:\n yield answer\n\n except Exception as e:\n print(\"Error in predict_for_question:\", e)\n raise\n\n finally:\n try:\n await gen.aclose()\n except Exception:\n pass","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:21.409542Z","iopub.execute_input":"2026-04-13T18:01:21.409704Z","iopub.status.idle":"2026-04-13T18:01:21.420849Z","shell.execute_reply.started":"2026-04-13T18:01:21.409683Z","shell.execute_reply":"2026-04-13T18:01:21.420460Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"import math\n\nclass BufferBorrower:\n \"\"\"\n Dynamic buffer-time borrowing strategy for inference.\n\n Borrows from buffer time based on task difficulty and step-back\n token usage, using a sigmoid curve for smooth allocation.\n\n Parameters\n ----------\n max_difficulty : int or float\n The upper bound of the difficulty scale (e.g., 5 or 1.0).\n alpha : float\n Weight for the difficulty signal (default 0.6).\n beta : float\n Weight for the step-back token signal (default 0.4).\n b_max : float\n Maximum fraction of buffer that can be borrowed (default 0.7).\n k : float\n Steepness of the sigmoid transition (default 6).\n threshold : float\n Midpoint of the sigmoid curve (default 0.4).\n \"\"\"\n\n def __init__(\n self,\n b_max: float = 0.85,\n k: float = 6.0,\n threshold: float = 0.4,\n total_questions: int = 50,\n total_available_time: int = 15720,\n ):\n \n self.b_max = b_max\n self.k = k\n self.threshold = threshold\n self.total_questions = total_questions\n self.total_available_time = total_available_time\n\n def compute_time_pressure(\n self,\n remaining_time: float,\n 
questions_completed: int,\n global_buffer: float = 0.0,\n ) -> float:\n remaining_q = max(1, self.total_questions - questions_completed)\n if remaining_time <= 0:\n return 1.5\n ideal_pace = self.total_available_time / self.total_questions\n available_pace = remaining_time / remaining_q\n pressure = ideal_pace / available_pace\n return max(0.3, min(1.5, pressure))\n\n def allocate_time(\n self,\n remaining_time: float,\n questions_completed: int,\n global_buffer: float = 0.0,\n allowed_time : float = 320,\n ) -> dict:\n \"\"\"\n Allocate effective inference and remaining buffer time.\n\n Parameters\n ----------\n allowed_time : float\n Base inference time budget.\n global_buffer : float\n global buffer time budget.\n difficulty : float\n Task difficulty score.\n stepback_tokens : int\n Tokens used in step-back phase.\n stepback_budget : int\n Total step-back token budget.\n\n Returns\n -------\n dict\n Keys: effective_inference, remaining_buffer, borrowed,\n borrow_fraction.\n \"\"\"\n pressure = self.compute_time_pressure(\n remaining_time,\n questions_completed,\n global_buffer\n )\n borrow_fraction = 1/pressure\n max_borrowable = 130\n print(\"borrow fraction\", borrow_fraction)\n borrowed = min(pressure * global_buffer, max_borrowable)\n \n\n return {\n \"effective_inference\": allowed_time + borrowed,\n \"global_buffer\": global_buffer - borrowed,\n \"borrowed\": borrowed,\n \"borrow_fraction\": borrow_fraction,\n }\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:21.421407Z","iopub.execute_input":"2026-04-13T18:01:21.421557Z","iopub.status.idle":"2026-04-13T18:01:21.436513Z","shell.execute_reply.started":"2026-04-13T18:01:21.421539Z","shell.execute_reply":"2026-04-13T18:01:21.436126Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"class TimeBudget:\n def __init__(self, total_seconds):\n self.start = time.perf_counter()\n self.deadline = self.start + total_seconds\n \n @property\n def remaining(self):\n return max(0, self.deadline - time.perf_counter())\n\n @property\n def elapsed(self):\n return time.perf_counter() - self.start\n\n @property\n def expired(self):\n return self.remaining <= 0\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:21.437045Z","iopub.execute_input":"2026-04-13T18:01:21.437188Z","iopub.status.idle":"2026-04-13T18:01:21.449985Z","shell.execute_reply.started":"2026-04-13T18:01:21.437167Z","shell.execute_reply":"2026-04-13T18:01:21.449607Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"class Pipeline:\n def __init__(self):\n self.budget_seconds = 0 \n self.k = 1\n self.budget_seconds = 0\n async def get_prediction(self, problem_text):\n global global_buffer, i, borrower, max_batch_size,last_30, sampling_param\n budgetobj = None\n timeout = 60\n # Timeout at this level - see if needs to be implemented \n thresh_hold = 3\n num_done = 0\n max_generation_count = self.k*max_batch_size\n answer_list = []\n finalanswerobj = Result()\n print(\"Pipeline step 1\")\n deadline = 0\n allowed_time = 320\n self.budget_seconds = allowed_time\n if global_buffer> 0:\n result = borrower.allocate_time(\n remaining_time = get_global_remaining(),\n questions_completed = i,\n allowed_time = allowed_time,\n global_buffer = global_buffer\n ) \n\n self.budget_seconds = result[\"effective_inference\"] \n global_buffer = result[\"global_buffer\"] \n print(f'borrowed={result[\"borrowed\"]:.0f}') \n print(f\"Budget: base={allowed_time:.0f}s \"\n f\"= {self.budget_seconds:.0f}s (global 
remaining: {get_global_remaining():.0f}s)\")\n budgetobj = TimeBudget(self.budget_seconds)\n \n clientobj = ClientClass(default_prompt)\n deadline = max(deadline, budgetobj.remaining) \n operation_start_time = time.perf_counter()\n print(\"Deadline is\", deadline)\n gen = clientobj.predict_for_question(problem_text)\n try:\n async with asyncio.timeout(deadline):\n async for answer in gen:\n answer_list.append(answer) \n print(\"Answer list on timeout is:-\")\n print(answer_list) \n num_done = len(answer_list)\n if num_done >= thresh_hold and num_done < max_generation_count:\n prediction, early_stop_flag = finalanswerobj.get_best_answer(answer_list, num_done, True)\n if early_stop_flag:\n return prediction\n\n elif num_done == max_generation_count:\n prediction, _ = finalanswerobj.get_best_answer(answer_list, num_done, False)\n return prediction\n else:\n continue\n except (TimeoutError, asyncio.TimeoutError):\n traceback.print_exc()\n prediction, _ = finalanswerobj.get_best_answer(answer_list, num_done, False)\n return prediction\n\n except Exception as e:\n traceback.print_exc()\n print(f\"UNEXPECTED ERROR: {type(e).__name__} {e}\")\n if answer_list:\n prediction, _ = finalanswerobj.get_best_answer(answer_list, num_done, False)\n return prediction\n return None\n\n finally:\n await gen.aclose() \n print(\"Operation duration\", time.perf_counter()-operation_start_time)\n if budgetobj.elapsed > self.budget_seconds:\n global_buffer -= (budgetobj.elapsed - self.budget_seconds)\n else:\n global_buffer += (self.budget_seconds - budgetobj.elapsed)\n\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:21.450491Z","iopub.execute_input":"2026-04-13T18:01:21.450620Z","iopub.status.idle":"2026-04-13T18:01:21.462068Z","shell.execute_reply.started":"2026-04-13T18:01:21.450600Z","shell.execute_reply":"2026-04-13T18:01:21.461682Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"def predict(id_: pl.Series, problem: pl.Series) -> pl.DataFrame | pd.DataFrame:\n \"\"\"Make a prediction.\"\"\"\n global server_started, i\n start_pred_time = time.perf_counter()\n pipelineobj = Pipeline()\n if server_started is False:\n server_started = wait_for_server()\n\n id_ = id_.item(0)\n problem_text: str = problem.item(0)\n\n # BUG FIX: compare duration to duration (was comparing duration to absolute timestamp)\n if get_global_remaining() < 30:\n return pl.DataFrame({\"id\": id_, \"answer\": 29443})\n loop = asyncio.get_event_loop()\n prediction = loop.run_until_complete(pipelineobj.get_prediction(problem_text))\n\n # If prediction is still None after everything, use fallback\n if prediction is None:\n prediction = 29443\n\n i = i + 1\n \n print(\"Returned dataframe is \", pl.DataFrame({\"id\": id_, \"answer\": prediction}))\n return pl.DataFrame({\"id\": id_, \"answer\": prediction})\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:21.462620Z","iopub.execute_input":"2026-04-13T18:01:21.462775Z","iopub.status.idle":"2026-04-13T18:01:21.476368Z","shell.execute_reply.started":"2026-04-13T18:01:21.462754Z","shell.execute_reply":"2026-04-13T18:01:21.475981Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"#Change the path of the csv file\ninference_server = kaggle_evaluation.aimo_3_inference_server.AIMO3InferenceServer(\n predict\n)\nborrower = BufferBorrower(total_questions = 50, total_available_time = get_global_remaining())\n\nif os.getenv('KAGGLE_IS_COMPETITION_RERUN'):\n inference_server.serve()\n \nelse:\n 
inference_server.run_local_gateway(\n ('/kaggle/input/competitions/ai-mathematical-olympiad-progress-prize-3/test.csv',)\n )\n \n ","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-13T18:01:21.477997Z","iopub.execute_input":"2026-04-13T18:01:21.478163Z","execution_failed":"2026-04-13T18:04:10.507Z"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null}]}
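
The notebook's core answer-selection step is majority voting over the per-sample extractions, with an early-stop rule: stop generating once the leading answer holds at least 4 votes and leads the runner-up by at least 2. Below is a minimal standalone sketch mirroring the `Result` class logic; the function name `vote` and its signature are illustrative, not the notebook's API.

```python
from collections import Counter

def vote(answers, min_votes=4, min_gap=2):
    """Majority-vote over extracted answers; -1 is the notebook's failure sentinel.

    Returns (best_answer, should_stop): stop early once the leader has
    >= min_votes and leads the runner-up by >= min_gap.
    """
    valid = [a for a in answers if a != -1]
    if not valid:
        return None, False
    counts = Counter(valid).most_common()
    best, best_n = counts[0]
    second_n = counts[1][1] if len(counts) > 1 else 0
    # Like the notebook, distrust a lone 0: fall back to the runner-up
    # when 0 "won" with a single vote.
    if best == 0 and best_n == 1 and len(counts) > 1:
        best, best_n = counts[1]
    return best, (best_n >= min_votes and best_n - second_n >= min_gap)

# 42 holds 4 votes against 1 for 17, so the early stop fires.
print(vote([42, 42, -1, 17, 42, 42]))  # -> (42, True)
```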
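Per-question time budgets come from the `BufferBorrower` cell: it compares the planned pace against the pace the remaining clock actually allows, clamps that pressure to [0.3, 1.5], and borrows up to 130 s from a shared buffer when pressure is high. A condensed sketch of the same pacing arithmetic follows, with constants copied from the notebook; the helper name `allocate` is illustrative.

```python
def allocate(remaining_time, questions_done, global_buffer,
             allowed_time=320.0, total_questions=50,
             total_available_time=15720.0, max_borrowable=130.0):
    """Borrow extra seconds from the shared buffer when behind pace."""
    remaining_q = max(1, total_questions - questions_done)
    if remaining_time <= 0:
        pressure = 1.5  # out of time: maximum pressure
    else:
        ideal_pace = total_available_time / total_questions  # planned s/question
        available_pace = remaining_time / remaining_q        # affordable s/question
        pressure = max(0.3, min(1.5, ideal_pace / available_pace))
    borrowed = min(pressure * global_buffer, max_borrowable)
    return allowed_time + borrowed, global_buffer - borrowed

# Halfway through and exactly on pace: pressure is 1.0, so borrowing
# hits the 130 s cap while the buffer still covers it.
budget, buffer_left = allocate(remaining_time=7860, questions_done=25, global_buffer=350)
print(round(budget), round(buffer_left))  # 450 220
```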
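Both system prompts ask the model to end with a `{"Answer": <int>, "Confidence": <float>}` JSON object instead of `\boxed{}`; the pipeline parses that object and, failing that, sends the generation to a second extraction pass. The sketch below covers only the local parse-with-fallback step; the trailing-object regex scan is an assumption for illustration, not the notebook's exact code (which calls `json.loads` on a pre-extracted field).

```python
import json
import re

FALLBACK = {"Answer": -1, "Confidence": -0.1}  # sentinel used throughout the notebook

def parse_answer(raw):
    """Pull the last {"Answer": ..., "Confidence": ...} object out of a generation."""
    if not raw:
        return dict(FALLBACK)
    # Scan for the last JSON-looking object; models sometimes emit text after it.
    for candidate in reversed(re.findall(r"\{[^{}]*\}", raw)):
        try:
            obj = json.loads(candidate)
            if "Answer" in obj:
                obj["Answer"] = int(obj["Answer"])
                return obj
        except (ValueError, TypeError):
            continue
    return dict(FALLBACK)

print(parse_answer('... so the count is 140. {"Answer": 140, "Confidence": 0.9}'))
# -> {'Answer': 140, 'Confidence': 0.9}
```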