| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| serve_vllm_on_kaggle = True |
| run_all_questions_on_kaggle = False |
|
|
| |
| save_communication_enabled = True |
| maybe_collaborate_enabled = True |
| replication_count_for_commit_runs = 10 |
| model_path = "/kaggle/input/models/huikang/gpt-oss-120b-aimo3/transformers/160a/9" |
|
|
| |
| import os |
| import time |
|
|
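| # Overall wall-clock budget: 4 h 58 min, presumably leaving a small buffer under the competition's five-hour limit. |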
| start_time = time.time() |
| total_available_time = (4 * 60 + 58) * 60 |
| final_cutoff_time = start_time + total_available_time |
|
|
|
|
| def is_on_kaggle_commit() -> bool: |
| return os.getenv("KAGGLE_KERNEL_RUN_TYPE") == "Batch" and not bool( |
| os.getenv("KAGGLE_IS_COMPETITION_RERUN") |
| ) |
|
|
|
|
| def is_on_kaggle_interactive() -> bool: |
| return os.getenv("KAGGLE_KERNEL_RUN_TYPE") == "Interactive" and not bool( |
| os.getenv("KAGGLE_IS_COMPETITION_RERUN") |
| ) |
|
|
|
|
| def is_on_kaggle() -> bool: |
| return bool(os.getenv("KAGGLE_KERNEL_RUN_TYPE")) or bool( |
| os.getenv("KAGGLE_IS_COMPETITION_RERUN") |
| ) |
|
|
|
|
| REMOTE_VLLM_URL = "NOT_AVAILABLE" |
| if is_on_kaggle() and serve_vllm_on_kaggle: |
| |
| pass |
| else: |
| |
| from kaggle_secrets import UserSecretsClient |
|
|
| secrets = UserSecretsClient() |
| REMOTE_VLLM_URL = secrets.get_secret("REMOTE_VLLM_URL") |
|
|
| if is_on_kaggle(): |
| |
| save_communication_enabled = False |
|
|
|
|
| |
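| # Suppress debugger frozen-module warnings and disable ANSI color codes in subprocess/tool output. |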
| os.environ["PYDEVD_DISABLE_FILE_VALIDATION"] = "1" |
| os.environ["NO_COLOR"] = "1" |
|
|
| |
| |
| print(f"{is_on_kaggle()=}") |
| print(f"{is_on_kaggle_interactive()=}") |
| print(f"{is_on_kaggle_commit()=}") |
| print(f"{serve_vllm_on_kaggle=}") |
| print(f"{run_all_questions_on_kaggle=}") |
| print(f"{REMOTE_VLLM_URL[::-1][:13][::-1]=}") |
|
|
| |
| |
|
|
| |
| import subprocess |
|
|
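| # Uninstall heavyweight packages that are not needed here, likely to free disk space and avoid dependency clashes with vLLM. |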
| if is_on_kaggle(): |
| subprocess.run( |
| [ |
| "pip", |
| "uninstall", |
| "--yes", |
| "tensorflow", |
| "matplotlib", |
| "keras", |
| "scikit-learn", |
| ] |
| ) |
|
|
| |
| import os |
| import subprocess |
|
|
|
|
| def cache_model( |
| path, exts=(".bin", ".pt", ".safetensors"), num_workers=None, chunk_mb=256 |
| ): |
| """ |
| Pre-read model weight files into the OS page cache to speed up later loads. |
| |
| Args: |
| path : Directory containing model files, or a single file path. |
| exts : File extensions treated as model weight files. |
| num_workers : Number of threads (default = min(CPU cores, 8)). |
| chunk_mb : Size of each read chunk in MB. |
| Returns: |
| Total bytes read (int). |
| """ |
| import os |
| import multiprocessing |
| import time |
| from concurrent.futures import ThreadPoolExecutor, as_completed |
|
|
| def warmup_file(fpath): |
| """Sequentially read an entire file in chunks.""" |
| chunk_size = chunk_mb * 1024 * 1024 |
| total = 0 |
| with open(fpath, "rb") as f: |
| while True: |
| data = f.read(chunk_size) |
| if not data: |
| break |
| total += len(data) |
| return fpath, total |
|
|
| |
| if os.path.isdir(path): |
| files = [ |
| os.path.join(root, name) |
| for root, _, names in os.walk(path) |
| for name in names |
| if name.endswith(exts) |
| ] |
| files.sort() |
| else: |
| files = [path] |
|
|
| if not files: |
| raise ValueError(f"No model files found under: {path}") |
|
|
| |
| if num_workers is None: |
| try: |
| num_workers = min(multiprocessing.cpu_count(), 8) |
| except Exception: |
| num_workers = 4 |
|
|
| print(f"[cache_model] {len(files)} file(s), {num_workers} worker(s)") |
|
|
| t0 = time.time() |
| total_bytes = 0 |
|
|
| |
| with ThreadPoolExecutor(max_workers=num_workers) as pool: |
| futures = {pool.submit(warmup_file, f): f for f in files} |
| for i, fut in enumerate(as_completed(futures), 1): |
| fpath, n = fut.result() |
| total_bytes += n |
| print(f"[{i}/{len(files)}] cached {os.path.basename(fpath)}") |
|
|
| elapsed = time.time() - t0 |
| gb = total_bytes / 1024**3 |
| speed = gb / elapsed if elapsed > 0 else 0 |
| print(f"[cache_model] total read ≈ {gb:.2f} GB") |
| print(f"[cache_model] elapsed {elapsed:.2f} s, ~{speed:.2f} GB/s") |
|
|
| return total_bytes |
|
|
|
|
| if is_on_kaggle(): |
| cache_model( |
| model_path, |
| num_workers=16, |
| chunk_mb=1024, |
| ) |
|
|
| |
| import numpy as np |
| import torch |
|
|
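| # Per-question cutoff schedule: 50 timestamps descending from the final cutoff toward one third of the budget (the earliest linspace point is dropped). |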
| cutoff_times: list[float] = [ |
| float(x) |
| for x in np.linspace( |
| final_cutoff_time, start_time + total_available_time / 3, 50 + 1 |
| ) |
| ] |
| cutoff_times.pop() |
|
|
|
|
| def reallocate_time(cutoff_times: list[float]) -> None: |
| """Reallocate cutoff_times in-place with equal intervals, except last interval could be larger.""" |
| n = len(cutoff_times) |
| if n <= 1: |
| return |
|
|
| |
| |
| |
| |
| |
| interval = 750 |
| cutoff_times[-1] = min(cutoff_times[-1], time.time() + interval) |
|
|
|
|
| from datetime import datetime |
|
|
| |
| os.makedirs("runs", exist_ok=True) |
| RUN_DIR = f"runs/{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}" |
| SOLUTIONS_DIR = f"{RUN_DIR}/solutions" |
| FINDINGS_DIR = f"{RUN_DIR}/findings" |
| COMMUNICATIONS_DIR = f"{RUN_DIR}/communications" |
|
|
| os.makedirs(SOLUTIONS_DIR, exist_ok=True) |
| os.makedirs(FINDINGS_DIR, exist_ok=True) |
| os.makedirs(COMMUNICATIONS_DIR, exist_ok=True) |
|
|
| |
| import pandas as pd |
|
|
| STATS_CSV_PATH = "stats.csv" |
| STATS_COLUMNS = [ |
| "question_id", |
| "final_answer", |
| "time_taken", |
| "time_available", |
| "active_solvers", |
| "answers", |
| "proposals", |
| "answer_history", |
| "proposal_history", |
| "submission_history", |
| "total_tokens", |
| "backtrack_counts", |
| "main_tokens", |
| "total_main_tokens", |
| "support_tokens", |
| "total_support_tokens", |
| "tool_use_counts", |
| "num_findings", |
| "num_acceptances", |
| "num_rejections", |
| ] |
|
|
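| # Write a header-only stats CSV up front; save_stats() below appends one row per question without re-writing the header. |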
| if __name__ == "__main__": |
| pd.DataFrame(columns=pd.Index(STATS_COLUMNS)).to_csv(STATS_CSV_PATH, index=False) |
|
|
|
|
| def save_stats( |
| question_id: str, |
| final_answer: int, |
| time_taken: float, |
| time_available: float, |
| active_solvers: set[int], |
| answers: dict[int, int], |
| proposals: dict[int, int], |
| answer_history: dict[int, list[int | None]], |
| proposal_history: dict[int, list[int | None]], |
| submission_history: dict[int, list[int]], |
| backtrack_counts: dict[int, int], |
| main_tokens: dict[int, int], |
| support_tokens: dict[int, int], |
| tool_use_counts: dict[int, int], |
| num_findings: int, |
| num_acceptances: int, |
| num_rejections: int, |
| ) -> None: |
| """Append stats for a question to the CSV file.""" |
| row_data = { |
| "question_id": question_id, |
| "final_answer": final_answer, |
| "time_taken": round(time_taken, 1), |
| "time_available": round(time_available, 1), |
| "active_solvers": str(sorted(active_solvers)), |
| "answers": str(answers), |
| "proposals": str(proposals), |
| "answer_history": str(dict(answer_history)), |
| "proposal_history": str(dict(proposal_history)), |
| "submission_history": str(dict(submission_history)), |
| "total_tokens": sum(main_tokens.values()) + sum(support_tokens.values()), |
| "backtrack_counts": str(dict(backtrack_counts)), |
| "main_tokens": str(dict(main_tokens)), |
| "total_main_tokens": sum(main_tokens.values()), |
| "support_tokens": str(dict(support_tokens)), |
| "total_support_tokens": sum(support_tokens.values()), |
| "tool_use_counts": str(dict(tool_use_counts)), |
| "num_findings": num_findings, |
| "num_acceptances": num_acceptances, |
| "num_rejections": num_rejections, |
| } |
| assert list(row_data.keys()) == STATS_COLUMNS, ( |
| f"Column mismatch: {list(row_data.keys())} != {STATS_COLUMNS}" |
| ) |
| row = pd.DataFrame([row_data]) |
| row.to_csv(STATS_CSV_PATH, mode="a", header=False, index=False) |
|
|
|
|
| if is_on_kaggle(): |
| if serve_vllm_on_kaggle: |
| assert torch.cuda.is_available() |
| assert torch.cuda.device_count() == 1 |
| else: |
| |
| import urllib.request |
| from urllib.error import URLError |
|
|
| try: |
| urllib.request.urlopen("https://modal.com", timeout=5) |
| print("Internet access confirmed") |
| except (URLError, TimeoutError) as e: |
| raise RuntimeError( |
| "Internet access required when serve_vllm_on_kaggle=False" |
| ) from e |
|
|
| |
| assert not torch.cuda.is_available() |
| assert torch.cuda.device_count() == 0 |
|
|
| |
| |
|
|
| |
| if is_on_kaggle(): |
| subprocess.run(["ls", "/kaggle/usr/lib/pip_install_aimo3_1/tiktoken_encodings"]) |
|
|
| |
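| # Create (or truncate) a-vllm.log up front so later cells that tail the log always have a file to read. |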
| with open("a-vllm.log", "w") as f: |
| f.write("") |
|
|
| |
| import subprocess |
|
|
| num_generations = 8 |
| max_model_len = 131072 |
|
|
|
|
| def start_vllm_server() -> subprocess.Popen[bytes]: |
| """Start vLLM server in the background""" |
| os.environ["TRANSFORMERS_NO_TF"] = "1" |
| os.environ["TRANSFORMERS_NO_FLAX"] = "1" |
| os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN" |
| os.environ["VLLM_FLASH_ATTN_VERSION"] = "3" |
| os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda/bin/ptxas" |
| os.environ["CUDA_VISIBLE_DEVICES"] = "0" |
| |
| os.environ["TIKTOKEN_ENCODINGS_BASE"] = ( |
| "/kaggle/usr/lib/pip_install_aimo3_1/tiktoken_encodings" |
| ) |
|
|
| command: list[str] = [ |
| "python", |
| "-m", |
| "vllm.entrypoints.openai.api_server", |
| "--model", |
| model_path, |
| "--served-model-name", |
| "vllm-model", |
| "--tensor-parallel-size", |
| "1", |
| "--max-num-seqs", |
| f"{num_generations + 4}", |
| "--gpu-memory-utilization", |
| "0.96", |
| "--host", |
| "0.0.0.0", |
| "--port", |
| "8000", |
| "--dtype", |
| "auto", |
| "--async-scheduling", |
| "--max-num-batched-tokens", |
| "2048", |
| "--max-model-len", |
| f"{max_model_len}", |
| ] |
|
|
| |
| with open("/kaggle/working/a-vllm.log", "w") as logfile: |
| process: subprocess.Popen[bytes] = subprocess.Popen( |
| command, stdout=logfile, stderr=subprocess.STDOUT, start_new_session=True |
| ) |
|
|
| print("Logs: /kaggle/working/a-vllm.log") |
| return process |
|
|
|
|
| |
| if is_on_kaggle() and serve_vllm_on_kaggle: |
| vllm_process: subprocess.Popen[bytes] = start_vllm_server() |
|
|
| |
| import os |
|
|
| from openai import OpenAI, Stream |
| from openai.types import Completion |
|
|
| |
| if is_on_kaggle() and serve_vllm_on_kaggle: |
| os.environ["OPENAI_API_BASE"] = "http://127.0.0.1:8000/v1" |
| else: |
| os.environ["OPENAI_API_BASE"] = REMOTE_VLLM_URL |
| if is_on_kaggle(): |
| |
| os.environ["TIKTOKEN_ENCODINGS_BASE"] = ( |
| "/kaggle/usr/lib/pip_install_aimo3_1/tiktoken_encodings" |
| ) |
| os.environ["OPENAI_API_KEY"] = "sk-local" |
|
|
| client: OpenAI = OpenAI( |
| base_url=os.environ["OPENAI_API_BASE"], api_key=os.environ["OPENAI_API_KEY"] |
| ) |
|
|
| |
| import time |
|
|
|
|
| def await_client(printing: bool = False) -> None: |
| for _ in range(15 * 60): |
| time.sleep(1) |
| try: |
| model_list = client.models.list() |
| if printing: |
| print(model_list) |
| except NameError: |
| raise |
| except Exception: |
| continue |
| break |
| else: |
| raise TimeoutError("vLLM server did not become ready within 15 minutes") |
|
|
|
|
| if is_on_kaggle(): |
| |
| await_client() |
|
|
| |
| |
|
|
| |
| class LocalJupyterSession: |
| """Stateful helper that proxies execution through a local Jupyter kernel. |
| Extracted from gpt_oss.tools.python_docker.docker_tool. |
| Each instance creates its own ZMQ context and is intended to be used from a single thread. |
| """ |
|
|
| def __init__(self, timeout: float = 8.0) -> None: |
| import zmq |
| from jupyter_client.blocking.client import BlockingKernelClient |
| from jupyter_client.manager import KernelManager |
|
|
| self._default_timeout = timeout |
| |
| self._zmq_context = zmq.Context() |
| self._km = KernelManager(context=self._zmq_context) |
| |
| |
| self._km.start_kernel(extra_arguments=["--HistoryManager.enabled=False"]) |
| self._client: BlockingKernelClient = self._km.blocking_client() |
| self._client.start_channels() |
| self._client.wait_for_ready(timeout=None) |
| |
| |
| |
| self._client.execute("%colors NoColor", store_history=False) |
| self._client.execute("%xmode Plain", store_history=False) |
| |
| self._client.execute("import sys; sys.tracebacklimit = 10", store_history=False) |
| |
| self._pending_msg_id: str | None = None |
| |
| self._kernel_may_be_stuck: bool = False |
|
|
| def _drain_pending_output(self) -> str: |
| """Drain output from a previous timed-out execution. Interrupts if still running. |
| |
| If the kernel doesn't respond to interrupt (e.g., stuck in C code like BLAS), |
| restarts the kernel to ensure the next execution can proceed. |
| """ |
| |
| |
| |
| if self._kernel_may_be_stuck: |
| self._kernel_may_be_stuck = False |
| |
| self._restart_kernel() |
| return "[Previous execution killed - kernel restarted due to unresponsive C code]\n" |
|
|
| if self._pending_msg_id is None: |
| return "" |
|
|
| msg_id = self._pending_msg_id |
| self._pending_msg_id = None |
| client = self._client |
|
|
| stdout_parts: list[str] = [] |
| stderr_parts: list[str] = [] |
| execution_finished = False |
|
|
| |
| while True: |
| try: |
| msg = client.get_iopub_msg(timeout=0.1) |
| except queue.Empty: |
| break |
|
|
| parent_header = msg["parent_header"] |
| if ( |
| not parent_header |
| or "msg_id" not in parent_header |
| or parent_header["msg_id"] != msg_id |
| ): |
| continue |
|
|
| msg_type = msg["msg_type"] |
| content = msg["content"] |
|
|
| if msg_type == "stream": |
| text = content["text"] |
| if content["name"] == "stdout": |
| stdout_parts.append(text) |
| else: |
| stderr_parts.append(text) |
| elif msg_type == "error": |
| traceback_data = content["traceback"] |
| stderr_parts.append("\n".join(traceback_data)) |
| elif msg_type in {"execute_result", "display_data"}: |
| data = content["data"] |
| if "text/plain" in data: |
| text = data["text/plain"] |
| stdout_parts.append(text if text.endswith("\n") else f"{text}\n") |
| elif msg_type == "status" and content["execution_state"] == "idle": |
| execution_finished = True |
| break |
|
|
| |
| if not execution_finished: |
| self._km.interrupt_kernel() |
| |
| interrupt_timeout = 2.0 |
| while True: |
| try: |
| msg = client.get_iopub_msg(timeout=interrupt_timeout) |
| except queue.Empty: |
| |
| |
| self._restart_kernel() |
| return "[Previous execution killed - kernel restarted due to unresponsive C code]\n" |
| parent_header = msg["parent_header"] |
| if ( |
| not parent_header |
| or "msg_id" not in parent_header |
| or parent_header["msg_id"] != msg_id |
| ): |
| continue |
| msg_type = msg["msg_type"] |
| content = msg["content"] |
| if msg_type == "stream": |
| text = content["text"] |
| if content["name"] == "stdout": |
| stdout_parts.append(text) |
| else: |
| stderr_parts.append(text) |
| elif msg_type == "error": |
| traceback_data = content["traceback"] |
| stderr_parts.append("\n".join(traceback_data)) |
| elif msg_type == "status" and content["execution_state"] == "idle": |
| break |
|
|
| |
| while True: |
| try: |
| reply = client.get_shell_msg(timeout=0.1) |
| parent_header = reply["parent_header"] |
| if ( |
| parent_header |
| and "msg_id" in parent_header |
| and parent_header["msg_id"] == msg_id |
| ): |
| break |
| except queue.Empty: |
| break |
|
|
| |
| output = "".join(stdout_parts) |
| if stderr_parts: |
| output = ( |
| f"{output.rstrip()}\n{''.join(stderr_parts)}" |
| if output |
| else "".join(stderr_parts) |
| ) |
|
|
| if output.strip(): |
| end_marker = ( |
| "[End previous output]" |
| if execution_finished |
| else "[End previous output - interrupted]" |
| ) |
| return f"[Previous execution output]\n{output.rstrip()}\n{end_marker}\n" |
| return "" |
|
|
| def _restart_kernel(self) -> None: |
| """Restart the kernel, preserving the session but losing state.""" |
| import contextlib |
|
|
| from jupyter_client.blocking.client import BlockingKernelClient |
|
|
| print("[LocalJupyterSession] Restarting kernel due to unresponsive state") |
|
|
| |
| self._kernel_may_be_stuck = False |
| self._pending_msg_id = None |
|
|
| |
| with contextlib.suppress(Exception): |
| self._client.stop_channels() |
|
|
| |
| self._km.restart_kernel(now=True) |
|
|
| |
| self._client: BlockingKernelClient = self._km.blocking_client() |
| self._client.start_channels() |
| self._client.wait_for_ready(timeout=None) |
| |
| self._client.execute("%colors NoColor", store_history=False) |
| self._client.execute("%xmode Plain", store_history=False) |
| self._client.execute("import sys; sys.tracebacklimit = 10", store_history=False) |
|
|
| def execute(self, code: str, continue_executing_on_timeout: bool = False) -> str: |
| """Execute code in the kernel, returning combined stdout/stderr output.""" |
| |
| pending_output = self._drain_pending_output() |
|
|
| client = self._client |
| effective_timeout = self._default_timeout |
| msg_id = client.execute( |
| code, store_history=True, allow_stdin=False, stop_on_error=False |
| ) |
|
|
| stdout_parts: list[str] = [] |
| stderr_parts: list[str] = [] |
|
|
| while True: |
| try: |
| msg = client.get_iopub_msg(timeout=effective_timeout) |
| except queue.Empty: |
| if continue_executing_on_timeout: |
| |
| self._pending_msg_id = msg_id |
| error_msg = "[TIMEOUT] Execution still running. Will drain remaining output on next call." |
| else: |
| |
| self._km.interrupt_kernel() |
| |
| interrupt_responded = False |
| interrupt_timeout = 2.0 |
| while True: |
| try: |
| int_msg = client.get_iopub_msg(timeout=interrupt_timeout) |
| except queue.Empty: |
| |
| break |
| parent_header = int_msg["parent_header"] |
| if ( |
| not parent_header |
| or "msg_id" not in parent_header |
| or parent_header["msg_id"] != msg_id |
| ): |
| continue |
| int_msg_type = int_msg["msg_type"] |
| int_content = int_msg["content"] |
| if int_msg_type == "error": |
| |
| traceback_data = int_content["traceback"] |
| stderr_parts.append("\n".join(traceback_data)) |
| interrupt_responded = True |
| elif ( |
| int_msg_type == "status" |
| and int_content["execution_state"] == "idle" |
| ): |
| interrupt_responded = True |
| break |
| if not interrupt_responded: |
| |
| self._kernel_may_be_stuck = True |
| error_msg = "[TIMEOUT] Execution interrupted." |
| |
| partial_output = "".join(stdout_parts) |
| if stderr_parts: |
| partial_output = ( |
| f"{partial_output.rstrip()}\n{''.join(stderr_parts)}" |
| if partial_output |
| else "".join(stderr_parts) |
| ) |
| result = f"{partial_output.rstrip()}\n{error_msg}".lstrip() |
| return f"{pending_output}{result}" if pending_output else result |
|
|
| parent_header = msg["parent_header"] |
| if ( |
| not parent_header |
| or "msg_id" not in parent_header |
| or parent_header["msg_id"] != msg_id |
| ): |
| continue |
|
|
| msg_type = msg["msg_type"] |
| content = msg["content"] |
|
|
| if msg_type == "stream": |
| text = content["text"] |
| if content["name"] == "stdout": |
| stdout_parts.append(text) |
| else: |
| stderr_parts.append(text) |
| elif msg_type == "error": |
| traceback_data = content["traceback"] |
| stderr_parts.append("\n".join(traceback_data)) |
| elif msg_type in {"execute_result", "display_data"}: |
| data = content["data"] |
| if "text/plain" in data: |
| text = data["text/plain"] |
| stdout_parts.append(text if text.endswith("\n") else f"{text}\n") |
| elif msg_type == "status" and content["execution_state"] == "idle": |
| break |
|
|
| |
| while True: |
| try: |
| reply = client.get_shell_msg(timeout=effective_timeout) |
| except queue.Empty: |
| if continue_executing_on_timeout: |
| |
| self._pending_msg_id = msg_id |
| error_msg = "[TIMEOUT] Execution still running. Will drain remaining output on next call." |
| else: |
| |
| self._km.interrupt_kernel() |
| self._kernel_may_be_stuck = True |
| error_msg = "[TIMEOUT] Execution interrupted." |
| partial_output = "".join(stdout_parts) |
| if stderr_parts: |
| partial_output = ( |
| f"{partial_output.rstrip()}\n{''.join(stderr_parts)}" |
| if partial_output |
| else "".join(stderr_parts) |
| ) |
| result = f"{partial_output.rstrip()}\n{error_msg}".lstrip() |
| return f"{pending_output}{result}" if pending_output else result |
|
|
| reply_parent = reply["parent_header"] |
| if ( |
| reply_parent |
| and "msg_id" in reply_parent |
| and reply_parent["msg_id"] == msg_id |
| ): |
| break |
|
|
| stdout = "".join(stdout_parts) |
| stderr = "".join(stderr_parts) |
|
|
| if stderr: |
| if stdout: |
| stdout = f"{stdout.rstrip()}\n{stderr}" |
| else: |
| stdout = stderr |
|
|
| if not stdout.strip(): |
| stdout = "[WARN] No output available. Use print() to output anything to stdout to receive the output" |
|
|
| return f"{pending_output}{stdout}" if pending_output else stdout |
|
|
| def close(self) -> None: |
| import contextlib |
|
|
| |
| with contextlib.suppress(Exception): |
| self._client.stop_channels() |
|
|
| |
| with contextlib.suppress(Exception): |
| self._km.shutdown_kernel(now=True) |
|
|
| |
| with contextlib.suppress(Exception): |
| self._km.cleanup_resources() |
|
|
| |
| with contextlib.suppress(Exception): |
| self._zmq_context.destroy(linger=0) |
|
|
| def __del__(self) -> None: |
| |
| import sys |
|
|
| if sys.meta_path is not None: |
| self.close() |
|
|
|
|
| def execute_python_code(session: LocalJupyterSession, script: str) -> str: |
| """Execute Python code in a stateful Jupyter session.""" |
| return session.execute(script) |
|
|
| |
| |
|
|
| |
| |
| from openai_harmony import ( |
| Conversation, |
| DeveloperContent, |
| HarmonyEncodingName, |
| Message, |
| ReasoningEffort, |
| Role, |
| StreamableParser, |
| SystemContent, |
| load_harmony_encoding, |
| ) |
|
|
| harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) |
| stop_token_ids: list[int] = list(harmony_encoding.stop_tokens_for_assistant_actions()) |
|
|
| |
| |
| |
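| # Harmony special-token IDs: 200002 is <|return|> and 200007 is <|end|> in the gpt-oss tokenizer. |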
| RETURN_TOKEN_ID = 200002 |
| END_TOKEN_ID = 200007 |
|
|
| |
| |
| import queue |
|
|
| from openai_harmony import Author, TextContent, ToolNamespaceConfig |
|
|
| |
| PYTHON_TOOL_INSTRUCTION = """ |
| Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning. |
| When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution. Internet access for this session is disabled. |
| """.strip() |
|
|
|
|
| python_tool_config = ToolNamespaceConfig( |
| name="python", description=PYTHON_TOOL_INSTRUCTION, tools=[] |
| ) |
|
|
|
|
| def make_python_tool_response(output: str, channel: str | None = None) -> Message: |
| """Create a tool response message for the Python tool.""" |
| content = TextContent(text=output) |
| author = Author(role=Role.TOOL, name="python") |
| message = Message(author=author, content=[content]).with_recipient("assistant") |
| if channel: |
| message = message.with_channel(channel) |
| return message |
|
|
|
|
| def build_prompt_token_ids( |
| system_content: str, |
| user_content: str, |
| reasoning_effort: ReasoningEffort, |
| enable_python_tool: bool = False, |
| ) -> list[int]: |
| """Convert system and user content to token IDs using harmony format.""" |
| system_content_obj = SystemContent.new().with_reasoning_effort(reasoning_effort) |
| if enable_python_tool: |
| |
| system_content_obj = system_content_obj.with_tools(python_tool_config) |
| system_message = Message.from_role_and_content(Role.SYSTEM, system_content_obj) |
| developer_message = Message.from_role_and_content( |
| Role.DEVELOPER, DeveloperContent.new().with_instructions(system_content) |
| ) |
| user_message = Message.from_role_and_content(Role.USER, user_content) |
| convo = Conversation.from_messages( |
| [system_message, developer_message, user_message] |
| ) |
| return list( |
| harmony_encoding.render_conversation_for_completion(convo, Role.ASSISTANT) |
| ) |
|
|
|
|
| import random |
| import string |
|
|
|
|
| def save_communication( |
| content: str, prefix: str = "", enabled: bool = save_communication_enabled |
| ) -> str: |
| """Save content to communications/ with a random 6 alphanumeric hash.""" |
| if not enabled: |
| return "" |
| hash_str = "".join(random.choices(string.ascii_lowercase + string.digits, k=6)) |
| filename = f"{COMMUNICATIONS_DIR}/{prefix}{hash_str}.txt" |
| with open(filename, "w") as f: |
| f.write(content) |
| return filename |
|
|
|
|
| def append_user_turn_token_ids( |
| all_token_ids: list[int], user_content: str |
| ) -> list[int]: |
| """Append a new user turn to the token IDs.""" |
| new_user_message = Message.from_role_and_content(Role.USER, user_content) |
| user_tokens = list( |
| harmony_encoding.render_conversation_for_completion( |
| Conversation.from_messages([new_user_message]), Role.ASSISTANT |
| ) |
| ) |
| return all_token_ids + user_tokens |
|
|
|
|
| import time |
|
|
|
|
| def append_tool_response_token_ids( |
| all_token_ids: list[int], tool_response: Message |
| ) -> list[int]: |
| """Append a tool response to the token IDs.""" |
| tool_tokens = list( |
| harmony_encoding.render_conversation_for_completion( |
| Conversation.from_messages([tool_response]), Role.ASSISTANT |
| ) |
| ) |
| return all_token_ids + tool_tokens |
|
|
| |
| import os |
| import time |
|
|
| import requests |
| from cachetools import TTLCache, cached |
|
|
|
|
| @cached(cache=TTLCache(maxsize=51, ttl=5)) |
| def get_gpu_kv_cache_usage(question_id: str | None = None) -> float: |
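| # Scrape the vLLM /metrics endpoint and return KV-cache usage as a percentage (-1 if unavailable). |
| # The question_id argument is unused in the body; it appears to exist only to key the TTL cache per question. |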
| |
| |
| try: |
| base_url = os.environ["OPENAI_API_BASE"] |
| |
| metrics_url = base_url.replace("/v1", "/metrics") |
| resp = requests.get(metrics_url, timeout=5) |
| for line in resp.text.split("\n"): |
| |
| if line.startswith("vllm:kv_cache_usage_perc"): |
| value = float(line.split()[-1]) |
| return value * 100 |
| except (requests.RequestException, ValueError, IndexError): |
| pass |
| return -1 |
|
|
|
|
| question_id_to_latest_termination_time: dict[str, float] = {} |
|
|
|
|
| def maybe_terminate_solver_for_gpu_usage( |
| question_id: str, |
| gpu_usage_thresholds: tuple[float, ...] = ( |
| 1.00, |
| 1.00, |
| 1.00, |
| 0.90, |
| 0.80, |
| 0.80, |
| 0.80, |
| 0.80, |
| 0.80, |
| 0.70, |
| 0.60, |
| ), |
| ) -> None: |
| """Terminate excess solvers based on GPU KV cache usage. |
| |
| Args: |
| question_id: The question being solved. |
| gpu_usage_thresholds: Descending list of thresholds; the index of the first |
| threshold exceeded is the number of solvers allowed to stay active. |
| E.g. [0.95, 0.85, 0.75, 0.65, 0.60, 0.55, 0.50, 0.45] means: |
| > 0.95 -> 0 solvers |
| > 0.85 -> 1 solver |
| > ... |
| > 0.45 -> 7 solvers |
| """ |
| allowed_active_solvers = num_generations * 2 |
|
|
| gpu_kv_cache_usage = get_gpu_kv_cache_usage(question_id) / 100 |
| for allowed_active_solvers, threshold in enumerate(gpu_usage_thresholds): |
| if gpu_kv_cache_usage > threshold: |
| break |
|
|
| active_solvers = question_id_to_active_solver_indexes[question_id] |
| if len(active_solvers) <= allowed_active_solvers: |
| return |
|
|
| if question_id_to_latest_termination_time[question_id] + 25 > time.time(): |
| print(f"Skip terminating a solver at {100 * gpu_kv_cache_usage:.1f}") |
| return |
|
|
| print(f"Terminating a solver at {100 * gpu_kv_cache_usage:.1f}") |
| solver_index_to_answer = question_id_to_solver_index_to_answer[question_id] |
| print(f"{solver_index_to_answer=}") |
| solver_index_to_proposal = question_id_to_solver_index_to_proposal[question_id] |
| print(f"{solver_index_to_proposal=}") |
| active_solver_indexes = question_id_to_active_solver_indexes[question_id] |
| print(f"{active_solver_indexes=}") |
|
|
| solver_index_to_tool_use_count = question_id_to_solver_index_to_tool_use_count[ |
| question_id |
| ] |
|
|
| |
| |
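| # Ascending sort: the first solver is one that has not proposed, holds no answer or proposal, and has the fewest tool calls; that solver is terminated first. |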
| sorted_solvers = sorted( |
| active_solvers, |
| key=lambda solver_index: ( |
| solver_index in question_id_to_proposed_solver_indexes[question_id], |
| solver_index in solver_index_to_answer |
| or solver_index in solver_index_to_proposal, |
| solver_index in solver_index_to_answer, |
| solver_index_to_tool_use_count[solver_index], |
| ), |
| ) |
|
|
| for active_solver_index in sorted_solvers: |
| active_solvers.discard(active_solver_index) |
| print( |
| f"Terminated Solver {active_solver_index}, Solvers {active_solvers} remaining" |
| ) |
| vote_answer(question_id) |
| |
| question_id_to_latest_termination_time[question_id] = time.time() |
| return |
|
|
| |
| if is_on_kaggle_interactive(): |
| test_prompt_ids = build_prompt_token_ids( |
| system_content="Reply your answer in \\boxed{}", |
| user_content="How many r are there in strawberry?", |
| reasoning_effort=ReasoningEffort.HIGH, |
| ) |
| resp: Completion = client.completions.create( |
| model="vllm-model", |
| prompt=test_prompt_ids, |
| max_tokens=1024, |
| temperature=1.0, |
| extra_body=dict( |
| min_p=0.02, stop_token_ids=stop_token_ids, return_token_ids=True |
| ), |
| ) |
| save_communication(harmony_encoding.decode(test_prompt_ids), prefix="test-") |
|
|
| print("Token IDs:", resp.choices[0].token_ids) |
|
|
| print(resp.choices[0].text) |
|
|
| |
| |
|
|
| |
| def extract_boxed_text(text: str) -> str: |
| """Extract text inside \\boxed{} from LaTeX-formatted text""" |
| import re |
|
|
| pattern: str = r"oxed{(.*?)}" |
| matches: list[str] = re.findall(pattern, text) |
| if not matches: |
| return "" |
| for match in matches[::-1]: |
| if match != "": |
| return match |
| return "" |
|
|
|
|
| def is_valid_answer_string(text: str) -> bool: |
| try: |
| if int(text) == float(text): |
| if 0 <= int(text) <= 99_999: |
| |
| return True |
| except Exception: |
| pass |
| return False |
|
|
|
|
| def save_solver_trace( |
| question_id: str, |
| solver_index: int, |
| tool_call_count: int, |
| all_token_ids: list[int], |
| backtrack_count: int | None = None, |
| ) -> None: |
| detokenized_text = harmony_encoding.decode(all_token_ids) |
| boxed_text = extract_boxed_text(detokenized_text) |
| answer_suffix = "NA" |
| if is_valid_answer_string(boxed_text): |
| answer_suffix = boxed_text |
| total_tokens = len(all_token_ids) |
| backtrack_part = ( |
| f"-backtrack-{backtrack_count}" if backtrack_count is not None else "" |
| ) |
| base_path = f"{SOLUTIONS_DIR}/{question_id}/{solver_index:01d}-{total_tokens:05d}-{tool_call_count:02d}{backtrack_part}-{answer_suffix}" |
| with open(f"{base_path}-tokens.txt", "w") as f: |
| for token_id in all_token_ids: |
| f.write(f"{token_id}\n") |
| with open(f"{base_path}-text.txt", "w") as f: |
| f.write(detokenized_text) |
|
|
| |
| from collections import Counter |
|
|
| completed_question_ids: set[str] = set() |
| question_id_to_solver_index_to_answer: dict[str, dict[int, int]] = {} |
| question_id_to_solver_index_to_answer_history: dict[ |
| str, dict[int, list[int | None]] |
| ] = {} |
| question_id_to_solver_index_to_proposal_history: dict[ |
| str, dict[int, list[int | None]] |
| ] = {} |
| question_id_to_solver_index_to_submission_history: dict[str, dict[int, list[int]]] = {} |
| question_id_to_solver_index_to_proposal: dict[str, dict[int, int]] = {} |
| question_id_to_active_solver_indexes: dict[str, set[int]] = {} |
| question_id_to_proposed_solver_indexes: dict[ |
| str, set[int] |
| ] = {} |
| question_id_to_solver_index_to_backtrack_count: dict[str, dict[int, int]] = {} |
| question_id_to_solver_index_to_main_token_length: dict[str, dict[int, int]] = {} |
| question_id_to_solver_index_to_support_token_length: dict[str, dict[int, int]] = {} |
| question_id_to_solver_index_to_tool_use_count: dict[str, dict[int, int]] = {} |
|
|
|
|
| def get_vote_string(question_id: str, reader_solver_index: int) -> str: |
| solver_index_to_answer = question_id_to_solver_index_to_answer[question_id] |
| print(f"{solver_index_to_answer=}") |
| solver_index_to_proposal = question_id_to_solver_index_to_proposal[question_id] |
| print(f"{solver_index_to_proposal=}") |
| active_solver_indexes = question_id_to_active_solver_indexes[question_id] |
| print(f"{active_solver_indexes=}") |
| lines = ["These are the currently submitted answers:"] |
| for solver_index in sorted(active_solver_indexes): |
| solver_annotation = "" |
| if solver_index == reader_solver_index: |
| solver_annotation = " (you)" |
| if ( |
| solver_index in solver_index_to_answer |
| and solver_index in solver_index_to_proposal |
| ): |
| answer = solver_index_to_answer[solver_index] |
| proposal = solver_index_to_proposal[solver_index] |
| if answer == proposal: |
| lines.append( |
| f"Solver {solver_index}{solver_annotation} is submitting {answer}" |
| ) |
| else: |
| lines.append( |
| f"Solver {solver_index}{solver_annotation} is submitting {answer}, but considering {proposal}" |
| ) |
| elif solver_index in solver_index_to_proposal: |
| proposal = solver_index_to_proposal[solver_index] |
| lines.append( |
| f"Solver {solver_index}{solver_annotation} is considering {proposal}" |
| ) |
| else: |
| lines.append( |
| f"Solver {solver_index}{solver_annotation} has not submitted an answer." |
| ) |
| return "\n".join(lines) |
|
|
|
|
| def vote_answer(question_id: str, force_answer: bool = False) -> int | None: |
| """Vote for the best answer, with the side effect of adding to completed_question_ids""" |
| solver_index_to_answer = question_id_to_solver_index_to_answer[question_id] |
| print(f"{solver_index_to_answer=}") |
| solver_index_to_proposal = question_id_to_solver_index_to_proposal[question_id] |
| print(f"{solver_index_to_proposal=}") |
| active_solver_indexes = question_id_to_active_solver_indexes[question_id] |
| print(f"{active_solver_indexes=}") |
| solver_index_to_backtrack_count = question_id_to_solver_index_to_backtrack_count[ |
| question_id |
| ] |
| print(f"{solver_index_to_backtrack_count=}") |
|
|
| answer = vote_answer_inner( |
| solver_index_to_answer, |
| solver_index_to_proposal, |
| active_solver_indexes, |
| solver_index_to_backtrack_count, |
| force_answer, |
| ) |
| if answer is not None: |
| |
| |
| completed_question_ids.add(question_id) |
| return answer |
|
|
|
|
| def vote_answer_inner( |
| solver_index_to_answer: dict[int, int], |
| solver_index_to_proposal: dict[int, int], |
| active_solver_indexes: set[int], |
| solver_index_to_backtrack_count: dict[int, int], |
| force_answer: bool, |
| ) -> int | None: |
| """Vote for the best answer""" |
|
|
| if len(active_solver_indexes) == 0: |
| force_answer = True |
|
|
| answers_from_active_solver_indexes = [ |
| solver_index_to_answer[solver_index] |
| for solver_index in active_solver_indexes |
| if solver_index in solver_index_to_answer |
| and solver_index in solver_index_to_proposal |
| and solver_index_to_answer[solver_index] |
| == solver_index_to_proposal[solver_index] |
| ] |
| proposal_from_active_solver_indexes = [ |
| proposal |
| for solver_index, proposal in solver_index_to_proposal.items() |
| if solver_index in active_solver_indexes |
| ] |
|
|
| |
| proposal_to_backtrack_count: dict[int, int] = defaultdict(int) |
| for solver_index in active_solver_indexes: |
| if (proposal := solver_index_to_proposal.get(solver_index)) is not None:  # 0 is a valid proposal |
| proposal_to_backtrack_count[proposal] += ( |
| solver_index_to_backtrack_count.get(solver_index, 0) |
| ) |
|
|
| |
| active_proposal_counter = Counter(proposal_from_active_solver_indexes) |
| answer_counter = Counter(answers_from_active_solver_indexes) |
|
|
| ranked_proposals = sorted( |
| active_proposal_counter, |
| key=lambda proposal: ( |
| active_proposal_counter[proposal], |
| answer_counter[proposal], |
| proposal_to_backtrack_count[proposal], |
| proposal, |
| ), |
| ) |
|
|
| best_answer = ranked_proposals[-1] if ranked_proposals else 12453 |
|
|
| if force_answer: |
| print(f"force_answer, Current GPU usage {get_gpu_kv_cache_usage()}") |
| return best_answer |
|
|
| def check_threshold( |
| answer_threshold: tuple[int | None, int | None, int | None], |
| proposal_threshold: tuple[int | None, int | None, int | None], |
| ) -> bool: |
| """ |
| Check if voting thresholds are met. |
| Threshold tuple: (match, mismatch, unsubmitted), None means don't care. |
| match: at least this many matches (>=) |
| mismatch: at most this many mismatches (<=) |
| unsubmitted: at most this many unsubmitted (<=) |
| """ |
| answer_match, answer_mismatch, answer_unsubmitted = answer_threshold |
| proposal_match, proposal_mismatch, proposal_unsubmitted = proposal_threshold |
|
|
| |
| answer_match_count = answers_from_active_solver_indexes.count(best_answer) |
| answer_mismatch_count = ( |
| len(answers_from_active_solver_indexes) - answer_match_count |
| ) |
| answer_unsubmitted_count = len(active_solver_indexes) - len( |
| answers_from_active_solver_indexes |
| ) |
|
|
| |
| if answer_match is not None: |
| if answer_match_count < answer_match: |
| return False |
| |
| |
| |
| if answer_mismatch is not None: |
| if answer_mismatch_count > answer_mismatch: |
| return False |
| if answer_unsubmitted is not None: |
| if ( |
| answer_mismatch_count + answer_unsubmitted_count |
| > answer_mismatch + answer_unsubmitted |
| ): |
| return False |
|
|
| |
| proposal_match_count = proposal_from_active_solver_indexes.count(best_answer) |
| proposal_mismatch_count = ( |
| len(proposal_from_active_solver_indexes) - proposal_match_count |
| ) |
| proposal_unsubmitted_count = len(active_solver_indexes) - len( |
| proposal_from_active_solver_indexes |
| ) |
|
|
| |
| if proposal_match is not None: |
| if proposal_match_count < proposal_match: |
| return False |
| |
| |
| |
| if proposal_mismatch is not None: |
| if proposal_mismatch_count > proposal_mismatch: |
| return False |
| if proposal_unsubmitted is not None: |
| if ( |
| proposal_mismatch_count + proposal_unsubmitted_count |
| > proposal_mismatch + proposal_unsubmitted |
| ): |
| return False |
|
|
| def fmt(t: tuple[int | None, int | None, int | None]) -> str: |
| return "/".join("?" if x is None else str(x) for x in t) |
|
|
| print( |
| f"Vote for {best_answer} passed with answer {fmt(answer_threshold)}, proposal {fmt(proposal_threshold)}" |
| ) |
| return True |
|
|
| |
| if check_threshold((None, 0, 0), (None, None, None)): |
| return best_answer |
|
|
| if check_threshold((None, None, None), (None, 0, 0)): |
| return best_answer |
|
|
| if best_answer <= 3: |
| |
| return None |
|
|
| |
| if len(cutoff_times) <= 1: |
| return None |
|
|
| if check_threshold((0, 0, None), (2, 0, None)): |
| return best_answer |
|
|
| if check_threshold((0, 0, None), (3, 0, None)): |
| return best_answer |
|
|
| if check_threshold((0, 1, None), (4, 1, None)): |
| return best_answer |
| |
| if check_threshold((0, 0, None), (2, 1, 0)): |
| return best_answer |
|
|
| if check_threshold((0, 0, None), (1, 0, 2)): |
| return best_answer |
|
|
| |
| return None |
|
|
| |
| |
|
|
| |
| |
|
|
| from dataclasses import dataclass |
|
|
|
|
| @dataclass |
| class Finding: |
| statement: str |
| question_id: str |
| author_solver_index: int |
| author_solver_token_count: int |
| proposal: int | None |
| accepted_solver_indexes_and_reviews: dict[int, str] |
| rejected_solver_indexes_and_reviews: dict[int, str] |
|
|
| @property |
| def reviewed_solver_indexes(self) -> set[int]: |
| return set(self.accepted_solver_indexes_and_reviews.keys()) | set( |
| self.rejected_solver_indexes_and_reviews.keys() |
| ) |
|
|
|
|
| question_id_to_findings: dict[str, list[Finding]] = {"": []} |
|
|
| question_id_to_solver_index_to_proposals_read: dict[str, dict[int, set[int]]] = {} |
| question_id_to_solver_index_to_findings_read: dict[str, dict[int, set[int]]] = {} |
|
|
|
|
| def get_finding_texts_for_solver( |
| question_id: str, |
| solver_index: int, |
| skip_update_read_state: bool = False, |
| skip_none: bool = True, |
| ) -> list[str]: |
| """Get findings from other solvers that this solver hasn't seen yet.""" |
| findings = question_id_to_findings[question_id] |
| proposals_read = question_id_to_solver_index_to_proposals_read[question_id][ |
| solver_index |
| ] |
| findings_read = question_id_to_solver_index_to_findings_read[question_id][ |
| solver_index |
| ] |
| current_solver_proposal = question_id_to_solver_index_to_proposal[question_id].get( |
| solver_index |
| ) |
| if current_solver_proposal is None: |
| return [] |
|
|
| solver_index_to_finding_texts: dict[int, str] = {} |
| solver_seen_in_session = set() |
| for finding_idx in range(len(findings) - 1, -1, -1): |
| finding = findings[finding_idx] |
| if finding.author_solver_index == solver_index: |
| continue |
| if finding.author_solver_index in solver_seen_in_session: |
| continue |
| |
| if finding_idx in findings_read: |
| continue |
| if skip_none and finding.proposal is None: |
| continue |
| already_read_proposal = finding.proposal in proposals_read |
| if already_read_proposal: |
| finding.rejected_solver_indexes_and_reviews[solver_index] = ( |
| "Already read proposal" |
| ) |
| continue |
| if finding.proposal == current_solver_proposal: |
| finding.rejected_solver_indexes_and_reviews[solver_index] = ( |
| "Same proposal as current solver" |
| ) |
| continue |
| else: |
| finding.accepted_solver_indexes_and_reviews[solver_index] = ( |
| "Proposals differ for now" |
| ) |
| finding_string = ( |
| f"Another solver has provided this solution:\n\n{finding.statement}" |
| ) |
| solver_index_to_finding_texts[finding.author_solver_index] = finding_string |
| solver_seen_in_session.add(finding.author_solver_index) |
| if not skip_update_read_state: |
| findings_read.add(finding_idx) |
| if finding.proposal is not None: |
| proposals_read.add(finding.proposal) |
|
|
| finding_texts = list(solver_index_to_finding_texts.values()) |
| if finding_texts: |
| return finding_texts |
| if skip_none is True: |
| return get_finding_texts_for_solver( |
| question_id, solver_index, skip_update_read_state, skip_none=False |
| ) |
| return finding_texts |
|
|
|
|
| def get_finding_text( |
| all_token_ids: list[int], |
| question_id: str, |
| solver_index: int, |
| ) -> str: |
| """ |
| Use a separate LLM call to summarize useful findings from the current conversation. |
| Takes a copy of all_token_ids (the full conversation context) and appends a summarization request. |
| Returns the finding text, or an empty string if no finding was extracted. |
| Terminates early if the question is solved. |
| """ |
|
|
| finding_prompt = """ |
| Summarize your current approach in LESS THAN 250 characters, for other solvers to replicate your solution. |
| |
| Include: |
| - key insights |
| - key intermediate results |
| - key pitfalls |
| |
| Reminder |
| - Your job NOW is to SUMMARIZE. |
| - The limit is 250 characters. Keep your response very, very concise. Only include the important information. |
| """.strip() |
|
|
| finding_prefix = "Insights:" |
|
|
| |
| all_token_ids = append_user_turn_token_ids(all_token_ids, finding_prompt) |
| finding_input_tokens = all_token_ids + harmony_encoding.encode( |
| "<|channel|>analysis<|message|>My job here is to summarize. I must follow the character limit.<|end|>" |
| f"<|start|>assistant<|channel|>final<|message|>{finding_prefix}", |
| allowed_special="all", |
| ) |
|
|
| |
| finding_stream: Stream[Completion] = client.completions.create( |
| model="vllm-model", |
| prompt=finding_input_tokens, |
| max_tokens=1024, |
| temperature=0, |
| stream=True, |
| extra_body=dict(stop_token_ids=stop_token_ids, return_token_ids=True), |
| ) |
| save_communication( |
| harmony_encoding.decode(finding_input_tokens), prefix="findings-" |
| ) |
| finding_text = finding_prefix |
| for chunk in finding_stream: |
| question_id_to_solver_index_to_support_token_length[question_id][ |
| solver_index |
| ] += 1 |
| if chunk.choices[0].text: |
| finding_text += chunk.choices[0].text |
| if question_id in completed_question_ids: |
| break |
| if chunk.choices[0].finish_reason: |
| break |
| finding_stream.close() |
|
|
| |
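| # Rewrite "\boxed{" in the shared summary, presumably so a boxed value from another solver is not later mistaken for a final answer by extract_boxed_text. |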
| finding_text = finding_text.replace("\\boxed{", "{answer = ") |
|
|
| if not finding_text: |
| print("Warning: empty finding_text") |
| return "" |
|
|
| return finding_text |
|
|
|
|
| def get_proposal( |
| all_token_ids: list[int], |
| question_id: str, |
| solver_index: int, |
| ) -> int | None: |
| """ |
| Use a separate LLM call to return the current proposal. |
| """ |
|
|
| |
|
|
| proposal_prompt = """ |
| Have you computed a final answer? If you have computed a final answer, return that integer answer in \\boxed{}. |
| If you have not computed a final answer, return \\boxed{None} instead. |
| """.strip() |
|
|
| |
| all_token_ids = append_user_turn_token_ids(all_token_ids, proposal_prompt) |
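| # Prime the assistant's final channel with the literal prefix "\boxed{" so the model only has to complete the integer (or "None"). |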
| proposal_text_prefix = "\\boxed{" |
| proposal_input_tokens = all_token_ids + harmony_encoding.encode( |
| f"<|channel|>final<|message|>{proposal_text_prefix}", allowed_special="all" |
| ) |
|
|
| |
| proposal_resp = client.completions.create( |
| model="vllm-model", |
| prompt=proposal_input_tokens, |
| max_tokens=10, |
| temperature=0, |
| extra_body=dict(stop_token_ids=stop_token_ids, return_token_ids=True), |
| ) |
| question_id_to_solver_index_to_support_token_length[question_id][solver_index] += ( |
| len(getattr(proposal_resp.choices[0], "token_ids", [])) |
| ) |
| save_communication( |
| harmony_encoding.decode(proposal_input_tokens), prefix="proposal-" |
| ) |
| proposal_text = proposal_text_prefix + proposal_resp.choices[0].text |
| boxed_text = extract_boxed_text(proposal_text) |
| if is_valid_answer_string(boxed_text): |
| return int(boxed_text) |
| return None |
|
|
|
|
| def get_answer_confidence( |
| all_token_ids: list[int], |
| answer: int, |
| question_id: str, |
| solver_index: int, |
| enabled: bool = maybe_collaborate_enabled, |
| ) -> bool: |
| """ |
| Use a separate LLM call to get the confidence of the currently proposed answer. |
| """ |
| if not enabled: |
| return True |
|
|
| confidence_prompt = f""" |
| Did you confirm that {answer} is wrong? |
| Start your reply with exactly either `I have confirmed that {answer} is wrong` or `I have yet to confirm that {answer} is wrong`. |
| """.strip() |
|
|
| |
| all_token_ids = all_token_ids + harmony_encoding.encode( |
| "<|end|>", allowed_special="all" |
| ) |
|
|
| |
| all_token_ids = append_user_turn_token_ids(all_token_ids, confidence_prompt) |
| assistant_prefix = "I have" |
| confidence_input_tokens = all_token_ids + harmony_encoding.encode( |
| f"<|channel|>final<|message|>{assistant_prefix}", allowed_special="all" |
| ) |
|
|
| |
| confidence_resp = client.completions.create( |
| model="vllm-model", |
| prompt=confidence_input_tokens, |
| max_tokens=32, |
| temperature=0, |
| extra_body=dict(stop_token_ids=stop_token_ids, return_token_ids=True), |
| ) |
| question_id_to_solver_index_to_support_token_length[question_id][solver_index] += ( |
| len(getattr(confidence_resp.choices[0], "token_ids", [])) |
| ) |
| confidence_text = assistant_prefix + confidence_resp.choices[0].text.replace( |
| "\n", " " |
| ) |
| save_communication( |
| harmony_encoding.decode(confidence_input_tokens), prefix="confidence-" |
| ) |
| if f"I have confirmed that {answer} is wrong" in confidence_text: |
| print(f"get_answer_confidence False {answer} {confidence_text}") |
| return False |
| print(f"get_answer_confidence True {answer} {confidence_text}") |
| return True |
|
|
|
|
| def contains_sublist(big: list, small: list): |
| return any( |
| big[i : i + len(small)] == small for i in range(len(big) - len(small) + 1) |
| ) |
|
|
|
|
| |
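| # Appears to encode "<|start|>assistant<|channel|>final<|message|>"; used below as the boundary when backtracking whole assistant turns. |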
| ASSISTANT_STARTER: list[int] = [200006, 173781, 200005, 17196, 200008] |
|
|
|
|
| def backtrack_all_tokens(all_token_ids: list[int], backtrack_count: int) -> list[int]: |
| all_token_ids = all_token_ids.copy() |
| for _ in range(backtrack_count): |
| tokens_popped = 0 |
| if all_token_ids and contains_sublist(all_token_ids[:-1], ASSISTANT_STARTER): |
| tokens_popped += 1 |
| all_token_ids.pop() |
| while ( |
| len(all_token_ids) >= len(ASSISTANT_STARTER) |
| and all_token_ids[-len(ASSISTANT_STARTER) :] != ASSISTANT_STARTER |
| ): |
| all_token_ids.pop() |
| tokens_popped += 1 |
| all_token_ids.pop() |
| print(f"Backtracking, {tokens_popped=}") |
| return all_token_ids |
|
|
|
|
| def delete_answer( |
| all_token_ids: list[int], |
| question_id: str, |
| solver_index: int, |
| ) -> None: |
| |
| current_answer = question_id_to_solver_index_to_answer[question_id].get( |
| solver_index |
| ) |
| if current_answer is not None: |
| if ( |
| question_id_to_solver_index_to_answer_history[question_id][solver_index][-1] |
| is None |
| ): |
| del question_id_to_solver_index_to_answer[question_id][solver_index] |
| print(f"Solver {solver_index} deleted current answer {current_answer}") |
|
|
| current_proposal = question_id_to_solver_index_to_proposal[question_id].get( |
| solver_index |
| ) |
| if ( |
| current_proposal is not None |
| and len(question_id_to_active_solver_indexes[question_id]) >= 4 |
| ): |
| |
| del question_id_to_solver_index_to_proposal[question_id][solver_index] |
| print( |
| f"Solver {solver_index} deleted current proposal {current_proposal}" |
| ) |
| else: |
| question_id_to_solver_index_to_answer_history[question_id][ |
| solver_index |
| ].append(None) |
| print( |
| f"Solver {solver_index} is planning to delete current answer {current_answer}" |
| ) |
| maybe_terminate_solver_for_gpu_usage(question_id) |
| vote_answer(question_id) |
| maybe_create_finding(all_token_ids, question_id, solver_index) |
| else: |
| print( |
| f"Solver {solver_index} is attempting to delete an answer that no longer exists" |
| ) |
|
|
|
|
| def update_proposal( |
| all_token_ids: list[int], |
| question_id: str, |
| solver_index: int, |
| ) -> None: |
| current_answer = question_id_to_solver_index_to_answer[question_id].get( |
| solver_index |
| ) |
| if current_answer is not None: |
| answer_still_confident = get_answer_confidence( |
| all_token_ids.copy(), current_answer, question_id, solver_index |
| ) |
| if not answer_still_confident: |
| delete_answer(all_token_ids, question_id, solver_index) |
| else: |
| if ( |
| question_id_to_solver_index_to_answer_history[question_id][ |
| solver_index |
| ][-1] |
| is None |
| ): |
| print( |
| f"Solver {solver_index} is not planning to delete current answer {current_answer}" |
| ) |
| question_id_to_solver_index_to_answer_history[question_id][ |
| solver_index |
| ].pop() |
|
|
| |
| new_proposal = get_proposal(all_token_ids.copy(), question_id, solver_index) |
| if solver_index not in question_id_to_solver_index_to_proposal[question_id]: |
| if new_proposal is None: |
| |
| pass |
| else: |
| |
| question_id_to_solver_index_to_proposal[question_id][solver_index] = ( |
| new_proposal |
| ) |
| question_id_to_solver_index_to_proposal_history[question_id][ |
| solver_index |
| ].append(new_proposal) |
| print( |
| f"Solver {solver_index} is proposing its first proposal {new_proposal} at token {len(all_token_ids)}" |
| ) |
| maybe_terminate_solver_for_gpu_usage(question_id) |
| vote_answer(question_id) |
| maybe_create_finding(all_token_ids, question_id, solver_index) |
| else: |
| current_proposal = question_id_to_solver_index_to_proposal[question_id][ |
| solver_index |
| ] |
|
|
| if current_proposal != new_proposal: |
| if new_proposal is None: |
| if ( |
| len(question_id_to_active_solver_indexes[question_id]) <= 5 |
| and cutoff_times[-1] - time.time() >= 120 |
| ): |
| print(f"Proposal deletion blocked for Solver {solver_index}") |
| else: |
| print(f"Proposal deletion allowed for Solver {solver_index}") |
| del question_id_to_solver_index_to_proposal[question_id][ |
| solver_index |
| ] |
| else: |
| question_id_to_solver_index_to_proposal[question_id][solver_index] = ( |
| new_proposal |
| ) |
| question_id_to_solver_index_to_proposal_history[question_id][ |
| solver_index |
| ].append(new_proposal) |
| print( |
| f"Solver {solver_index} is proposing proposal {new_proposal} at token {len(all_token_ids)} to overturn proposal {current_proposal}" |
| ) |
| maybe_terminate_solver_for_gpu_usage(question_id) |
| vote_answer(question_id) |
| maybe_create_finding(all_token_ids, question_id, solver_index) |
|
|
| for _ in range(10): |
| if question_id in completed_question_ids: |
| break |
| time.sleep(1) |
| question_id_to_proposing_answer[question_id].discard( |
| solver_index |
| ) |
|
|
|
|
| def maybe_update_proposal( |
| token_ids: list[int], |
| question_id: str, |
| solver_index: int, |
| enabled: bool = maybe_collaborate_enabled, |
| ) -> None: |
| """Update proposed answer if not already working on it.""" |
| if not enabled: |
| return |
| if solver_index in question_id_to_proposing_answer[question_id]: |
| return |
| question_id_to_proposing_answer[question_id].add(solver_index) |
| threading.Thread( |
| target=update_proposal, |
| args=(token_ids.copy(), question_id, solver_index), |
| ).start() |
|
|
|
|
| def create_finding( |
| all_token_ids: list[int], question_id: str, solver_index: int |
| ) -> None: |
| """Create a finding from the current solver and share it.""" |
| all_token_ids = all_token_ids.copy() |
| finding_text = get_finding_text(all_token_ids.copy(), question_id, solver_index) |
| author_proposal = question_id_to_solver_index_to_proposal[question_id].get( |
| solver_index |
| ) |
| finding = Finding( |
| statement=finding_text, |
| question_id=question_id, |
| author_solver_index=solver_index, |
| author_solver_token_count=len(all_token_ids), |
| proposal=author_proposal, |
| accepted_solver_indexes_and_reviews={}, |
| rejected_solver_indexes_and_reviews={}, |
| ) |
| question_id_to_findings[question_id].append(finding) |
| if len(finding_text) >= 250: |
| truncated_finding_text = ( |
| finding_text.replace("\n", " ")[:100] |
| + "..." |
| + finding_text.replace("\n", " ")[::-1][:100][::-1] |
| ) |
| else: |
| truncated_finding_text = finding_text.replace("\n", " ") |
| print( |
| f"Solver {solver_index:01d} shared Finding {len(question_id_to_findings[question_id]) - 1}: {len(finding_text)=} {truncated_finding_text}" |
| ) |
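| # Hold this solver's "creating finding" flag for up to 90 s (or until the question completes) so it does not immediately start another summary. |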
| for _ in range(90): |
| if question_id in completed_question_ids: |
| break |
| time.sleep(1) |
| question_id_to_creating_finding[question_id].discard(solver_index) |
|
|
|
|
| import threading |
| from collections import defaultdict |
|
|
| |
| question_id_to_creating_finding: defaultdict[str, set[int]] = defaultdict(set) |
| question_id_to_proposing_answer: defaultdict[str, set[int]] = defaultdict(set) |
|
|
|
|
| def maybe_create_finding( |
| token_ids: list[int], |
| question_id: str, |
| solver_index: int, |
| enabled: bool = maybe_collaborate_enabled, |
| ) -> None: |
| """Create finding if not already working on it.""" |
| if not enabled: |
| return |
| if solver_index in question_id_to_creating_finding[question_id]: |
| return |
| question_id_to_creating_finding[question_id].add(solver_index) |
| threading.Thread( |
| target=create_finding, args=(token_ids.copy(), question_id, solver_index) |
| ).start() |
|
|
|
|
| def save_findings(question_id: str, final_answer: int, time_taken: float) -> None: |
| """Save all findings for a question to findings/ directory as JSON.""" |
| import json |
|
|
| findings = question_id_to_findings[question_id] |
| data = { |
| "question_id": question_id, |
| "final_answer": final_answer, |
| "time_taken": round(time_taken, 1), |
| "findings": [ |
| { |
| "finding_idx": finding_idx, |
| "solver_index": finding.author_solver_index, |
| "finding": finding.statement, |
| "accepted_solver_indexes": list( |
| finding.accepted_solver_indexes_and_reviews.keys() |
| ), |
| "rejected_solver_indexes": list( |
| finding.rejected_solver_indexes_and_reviews.keys() |
| ), |
| "author_solver_token_count": finding.author_solver_token_count, |
| } |
| for finding_idx, finding in enumerate(findings) |
| ], |
| } |
| with open(f"{FINDINGS_DIR}/{question_id}.json", "w") as f: |
| json.dump(data, f, indent=2) |
|
|
| |
| |
|
|
| |
| def solve_question( |
| question_text: str, |
| question_id: str = "", |
| solver_index: int = 0, |
| ) -> str: |
| await_client() |
| print(f"Client connected for Solver {solver_index}") |
| if question_id in completed_question_ids: |
| return "" |
| if time.time() >= cutoff_times[-1]: |
| return "" |
|
|
| |
| question_id_to_solver_index_to_proposals_read[question_id][solver_index] = set() |
| question_id_to_solver_index_to_findings_read[question_id][solver_index] = set() |
|
|
| system_content = """\ |
| You will solve the problem and return the final answer in \\boxed{}. |
| The answer is expected to be an integer between 0 and 99999, inclusive. |
| |
| You may be provided solutions from other solvers. |
| - Reminder: if you find something wrong in your intermediate step, be sure to recompute all the variables from the intermediate step to the answer.""" |
|
|
| |
| |
| jupyter_session = LocalJupyterSession(timeout=10.0) |
| execute_python_code(jupyter_session, "import sympy as sp") |
| print(f"Solver {solver_index} created jupyter_session") |
|
|
| |
| try: |
| |
| all_token_ids: list[int] = build_prompt_token_ids( |
| system_content=system_content, |
| user_content=question_text, |
| reasoning_effort=ReasoningEffort.HIGH, |
| enable_python_tool=True, |
| ) |
|
|
| generated_token_count = 0 |
| generated_token_count_current_iteration = 0 |
| tool_call_count = 0 |
|
|
| for iteration in range(1024): |
| |
| |
|
|
| |
| |
| stream_parser = StreamableParser(harmony_encoding, role=Role.ASSISTANT) |
|
|
| while True: |
| |
| |
| exit_solve_question = False |
| continue_tool_use_loop = False |
| resample_completion = False |
|
|
| |
| if len(all_token_ids) >= max_model_len - 16384: |
| print(f"Terminating Solver {solver_index} by length") |
| exit_solve_question = True |
| break |
|
|
| |
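| # Stream tokens from the vLLM OpenAI-compatible completions endpoint; min_p, stop_token_ids and return_token_ids are vLLM-specific options passed through extra_body. |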
| stream: Stream[Completion] = client.completions.create( |
| model="vllm-model", |
| prompt=all_token_ids, |
| max_tokens=max_model_len - len(all_token_ids) - 8192, |
| temperature=1.0, |
| stream=True, |
| extra_body=dict( |
| min_p=0.02, |
| stop_token_ids=stop_token_ids, |
| return_token_ids=True, |
| ), |
| ) |
| |
| |
| |
|
|
| number_of_paragraphs_after_proposal_match = -1 |
|
|
| for chunk in stream: |
| generated_token_count += 1 |
| generated_token_count_current_iteration += 1 |
| |
| chunk_token_ids = getattr(chunk.choices[0], "token_ids", None) |
| if chunk_token_ids: |
| |
| |
| |
| question_id_to_solver_index_to_main_token_length[question_id][ |
| solver_index |
| ] += 1 |
| for token_id in chunk_token_ids: |
| |
| all_token_ids.append(token_id) |
| stream_parser.process(token_id) |
|
|
| |
| finish_reason = chunk.choices[0].finish_reason |
| if finish_reason: |
| break |
|
|
| if ( |
| solver_index |
| not in question_id_to_active_solver_indexes[question_id] |
| ): |
| print( |
| f"Final generation for Solver {solver_index} before finally terminating" |
| ) |
| maybe_create_finding( |
| all_token_ids + [END_TOKEN_ID], question_id, solver_index |
| ) |
| |
| exit_solve_question = True |
| break |
| if question_id in completed_question_ids: |
| |
| exit_solve_question = True |
| break |
| if time.time() >= cutoff_times[-1]: |
| |
| completed_question_ids.add(question_id) |
| exit_solve_question = True |
| break |
|
|
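| # The last two token ids are assumed to decode to "final" + <|message|> (a final-channel header in the Harmony format): seed tokens assumed to spell "\boxed{" and resample so the answer is emitted boxed. |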
| if all_token_ids[-2:] == [17196, 200008]: |
| |
| |
| answer_token_ids_to_force = [59, 172278, 90] |
| for answer_token_id_to_force in answer_token_ids_to_force: |
| all_token_ids.append(answer_token_id_to_force) |
| stream_parser.process(answer_token_id_to_force) |
| resample_completion = True |
|
|
| if resample_completion: |
| break |
|
|
| |
| chunk_text = chunk.choices[0].text |
|
|
| if ( |
| "\n\n" in chunk_text |
| and stream_parser.current_recipient is None |
| and generated_token_count_current_iteration >= 1024 |
| |
| and number_of_paragraphs_after_proposal_match == 3 |
| |
| ): |
| maybe_create_finding(all_token_ids, question_id, solver_index) |
|
|
| if ( |
| solver_index |
| in question_id_to_proposed_solver_indexes[question_id] |
| |
| |
| |
| ): |
| maybe_update_proposal( |
| all_token_ids, |
| question_id, |
| solver_index, |
| ) |
| |
| number_of_paragraphs_after_proposal_match = -1 |
|
|
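| # Paragraph-counting heuristic: once a chunk mentions "answer"/"final" (or, after a proposal exists, an error-signal word such as "mistake"), count subsequent paragraph breaks; the finding above is shared on the third break after the match. |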
| if "\n\n" in chunk_text: |
| if number_of_paragraphs_after_proposal_match >= 0: |
| number_of_paragraphs_after_proposal_match += 1 |
|
|
| for matching_text in [ |
| "answer", |
| "final", |
| ]: |
| if matching_text in chunk_text.lower(): |
| number_of_paragraphs_after_proposal_match = 0 |
|
|
| if ( |
| solver_index |
| in question_id_to_solver_index_to_proposal[question_id] |
| ): |
| for matching_text in [ |
| "oops", |
| "mistake", |
| "missed", |
| "missing", |
| "wrong", |
| "correct", |
| ]: |
| if matching_text in chunk_text.lower(): |
| number_of_paragraphs_after_proposal_match = 0 |
|
|
| |
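| # On long generations (past 20k tokens), re-check GPU KV-cache usage roughly every 1000 tokens; maybe_terminate_solver_for_gpu_usage may deactivate a solver to relieve memory pressure. |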
| if len(all_token_ids) > 20_000 and len(all_token_ids) % 1000 == 0: |
| |
| |
| |
| |
| |
| maybe_terminate_solver_for_gpu_usage(question_id) |
|
|
| |
| |
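| # A "}" may close a \boxed{...}: decode the last 20 tokens and, if a valid integer answer is found, record it as this solver's proposal (and, outside the analysis channel, as its answer, triggering a vote). |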
| if chunk_text and "}" in chunk_text: |
| |
| |
| text_suffix = harmony_encoding.decode(all_token_ids[-20:]) |
| |
| boxed_text = extract_boxed_text(text_suffix) |
| if is_valid_answer_string(boxed_text): |
| answer = int(boxed_text) |
|
|
| |
| question_id_to_solver_index_to_proposal[question_id][ |
| solver_index |
| ] = answer |
| question_id_to_solver_index_to_proposal_history[ |
| question_id |
| ][solver_index].append(answer) |
|
|
| if stream_parser.current_channel != "analysis": |
| |
| |
| if ( |
| solver_index |
| not in question_id_to_proposed_solver_indexes[ |
| question_id |
| ] |
| ): |
| |
| print( |
| f"Solver {solver_index} will propose before answering." |
| ) |
| else: |
| |
| prev_answer = question_id_to_solver_index_to_answer[ |
| question_id |
| ].get(solver_index) |
| if prev_answer != answer: |
| question_id_to_solver_index_to_answer_history[ |
| question_id |
| ][solver_index].append(answer) |
| question_id_to_solver_index_to_answer[question_id][ |
| solver_index |
| ] = answer |
|
|
| question_id_to_proposed_solver_indexes[question_id].add( |
| solver_index |
| ) |
|
|
| vote_answer(question_id) |
|
|
| print( |
| f"Solver {solver_index} boxed {answer} in " |
| f"{stream_parser.current_channel} to {stream_parser.current_recipient}" |
| ) |
|
|
| if ( |
| stream_parser.current_channel == "analysis" |
| and stream_parser.current_recipient is None |
| and len( |
| get_finding_texts_for_solver( |
| question_id, |
| solver_index, |
| skip_update_read_state=True, |
| ) |
| ) |
| > 0 |
| |
| |
| |
| ): |
| print( |
| f"Forcing final and read finding for Solver {solver_index}" |
| ) |
| |
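| # The hard-coded ids below are assumed to spell <|end|><|start|>assistant<|channel|>final<|message|>\boxed{...}<|end|> in the Harmony vocabulary, force-closing this turn with a synthetic final message so pending findings can be injected on the next iteration. |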
| force_termination_token_ids = ( |
| [ |
| 200007, |
| 200006, |
| 173781, |
| 200005, |
| 17196, |
| 200008, |
| 59, |
| 172278, |
| 90, |
| ] |
| + harmony_encoding.encode(boxed_text) |
| + [92, 200007] |
| ) |
| for ( |
| force_termination_token_id |
| ) in force_termination_token_ids: |
| all_token_ids.append(force_termination_token_id) |
| break |
|
|
| |
| |
| |
|
|
| |
| |
| stream.close() |
|
|
| |
| |
| if all_token_ids and all_token_ids[-1] == RETURN_TOKEN_ID: |
| all_token_ids[-1] = END_TOKEN_ID |
|
|
| if exit_solve_question: |
| break |
|
|
| if resample_completion: |
| |
| continue |
|
|
| |
| |
| parsed_messages = stream_parser.messages |
| |
| stream_parser = StreamableParser(harmony_encoding, role=Role.ASSISTANT) |
|
|
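| # If the last parsed message targets the python tool, execute its code in this solver's Jupyter session (long output is truncated), append the tool response tokens, and loop for another completion. |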
| if parsed_messages: |
| last_message = parsed_messages[-1] |
| if ( |
| last_message.recipient is not None |
| and last_message.recipient.startswith("python") |
| ): |
| continue_tool_use_loop = True |
| tool_call_count += 1 |
| question_id_to_solver_index_to_tool_use_count[question_id][ |
| solver_index |
| ] += 1 |
| |
| python_code = "" |
| if last_message.content: |
| first_block = last_message.content[0] |
| if isinstance(first_block, TextContent): |
| python_code = first_block.text |
| if python_code: |
| print( |
| f"Solver {solver_index:01d} iteration {iteration:01d} tool {tool_call_count:02d} token {len(all_token_ids):05d}", |
| flush=True, |
| ) |
| |
| output = execute_python_code(jupyter_session, python_code) |
| if len(output) > 10_000: |
| output = output[:3000] + "(truncated)" + output[-3000:] |
|
|
| |
| tool_response = make_python_tool_response( |
| output, channel=last_message.channel |
| ) |
| |
| all_token_ids = append_tool_response_token_ids( |
| all_token_ids, tool_response |
| ) |
|
|
| if not continue_tool_use_loop: |
| break |
|
|
| if exit_solve_question: |
| break |
|
|
| |
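| # Anti-stall: if the solver's last two submissions are identical, drop both, save a trace, and roll the token context back (backtrack_count=2) before continuing. |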
| current_proposal = question_id_to_solver_index_to_proposal[question_id].get( |
| solver_index |
| ) |
| submissions = question_id_to_solver_index_to_submission_history[ |
| question_id |
| ][solver_index] |
| if current_proposal is not None: |
| submissions.append(current_proposal) |
| if len(submissions) >= 2 and len(set(submissions[-2:])) == 1: |
| submissions.pop() |
| submissions.pop() |
| print( |
| f"Backtracking Solver {solver_index} for repeating proposal {current_proposal}" |
| ) |
| |
| question_id_to_solver_index_to_backtrack_count[question_id][ |
| solver_index |
| ] += 1 |
| |
| if question_id: |
| save_solver_trace( |
| question_id, |
| solver_index, |
| tool_call_count, |
| all_token_ids, |
| backtrack_count=question_id_to_solver_index_to_backtrack_count[ |
| question_id |
| ][solver_index], |
| ) |
| all_token_ids = backtrack_all_tokens( |
| all_token_ids, backtrack_count=2 |
| ) |
| continue |
|
|
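| # Between iterations: share a finding, pull any unread peer findings, then append a follow-up user turn (vote tally plus scrutiny instructions) before generating again. |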
| maybe_create_finding(all_token_ids, question_id, solver_index) |
| |
| finding_texts = get_finding_texts_for_solver(question_id, solver_index) |
|
|
| boxed_text = extract_boxed_text(harmony_encoding.decode(all_token_ids)) |
| print( |
| f"Solver {solver_index:01d} iteration {iteration:01d} tool {tool_call_count:02d} token {len(all_token_ids):05d}" |
| ) |
| if not is_valid_answer_string(boxed_text): |
| print(f"Solver {solver_index} follow-up - ask boxed answer") |
| user_follow_up = ( |
| "Figure out the correct answer. " |
| "The answer is expected to be an integer between 0 and 99999, inclusive. " |
| "Place your final answer in \\boxed{}. " |
| "Do not give up. Do not guess the answer. Do not put a placeholder. " |
| "If you are uncertain, continue working on the problem. There is no time limit." |
| ) |
| else: |
| print(f"Solver {solver_index} follow-up - continue verifying") |
|
|
| follow_up_instructions = "" |
| if len(finding_texts) == 0: |
| follow_up_instructions = ( |
| "Scrutinize your solution." |
| + "\nIf you spot any critical mistake in your solution, work towards figuring out the correct answer." |
| + "\nPrioritize scrutinizing your solution so you can find any mistakes as soon as possible." |
| ) |
| else: |
| follow_up_instructions = ( |
| "\n\n".join(finding_texts) |
| + "\n\nScrutinize your solution, using other solutions as a reference." |
| + "\nIf you spot any critical mistake in your solution, work towards figuring out the correct answer." |
| + "\nPrioritize scrutinizing your solution so you can find any mistakes as soon as possible." |
| ) |
| |
| |
| user_follow_up = f"""\ |
| {get_vote_string(question_id, solver_index)} |
| |
| {follow_up_instructions}""" |
|
|
| |
| all_token_ids = append_user_turn_token_ids(all_token_ids, user_follow_up) |
| generated_token_count_current_iteration = 0 |
|
|
| detokenized_text = harmony_encoding.decode(all_token_ids) |
| boxed_text = extract_boxed_text(detokenized_text) |
|
|
| if question_id not in completed_question_ids: |
| |
| question_id_to_active_solver_indexes[question_id].discard(solver_index) |
| vote_answer(question_id) |
|
|
| if question_id: |
| |
| if is_valid_answer_string(boxed_text): |
| print( |
| f"Solver {solver_index:01d} token {len(all_token_ids):05d} submits {boxed_text}" |
| ) |
| save_solver_trace( |
| question_id, |
| solver_index, |
| tool_call_count, |
| all_token_ids, |
| ) |
|
|
| return boxed_text |
|
|
| finally: |
| |
| if jupyter_session is not None: |
| print( |
| f"Cleaning up Jupyter session for Solver {solver_index} in {question_id}" |
| ) |
| jupyter_session.close() |
|
|
| |
| if is_on_kaggle_interactive(): |
| solve_question("What is 1+1?") |
|
|
| |
| def solve(question_text: str, question_id: str = "") -> int: |
| print(f"processing {question_id}") |
| reallocate_time(cutoff_times) |
| time_available = cutoff_times[-1] - time.time() |
| print(f"time_available {time_available:.1f}s") |
| question_start_time = time.time() |
| os.makedirs(f"{SOLUTIONS_DIR}/{question_id}", exist_ok=True) |
| question_id_to_solver_index_to_answer[question_id] = {} |
| question_id_to_solver_index_to_answer_history[question_id] = defaultdict(list) |
| question_id_to_solver_index_to_proposal_history[question_id] = defaultdict(list) |
| question_id_to_solver_index_to_submission_history[question_id] = defaultdict(list) |
| question_id_to_solver_index_to_proposal[question_id] = {} |
| question_id_to_active_solver_indexes[question_id] = set(range(num_generations)) |
| question_id_to_proposed_solver_indexes[question_id] = set() |
| question_id_to_findings[question_id] = [] |
| question_id_to_solver_index_to_proposals_read[question_id] = {} |
| question_id_to_solver_index_to_findings_read[question_id] = {} |
| question_id_to_creating_finding[question_id] = set() |
| question_id_to_solver_index_to_backtrack_count[question_id] = defaultdict(int) |
| question_id_to_solver_index_to_main_token_length[question_id] = defaultdict(int) |
| question_id_to_solver_index_to_support_token_length[question_id] = defaultdict(int) |
| question_id_to_solver_index_to_tool_use_count[question_id] = defaultdict(int) |
| question_id_to_latest_termination_time[question_id] = time.time() |
| completed_question_ids.discard(question_id) |
|
|
| if question_id and time.time() > cutoff_times[-1]: |
| print("timeout did not solve") |
| return 12314 |
|
|
| get_gpu_kv_cache_usage(question_id) |
|
|
| |
| |
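| # Launch one solver thread per generation slot; the threads coordinate through the shared per-question dictionaries above. |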
| for solver_index in range(num_generations): |
| threading.Thread( |
| target=solve_question, |
| args=(question_text, question_id, solver_index), |
| ).start() |
|
|
| |
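| # Poll once per second until any solver marks the question complete; the for/else fires only on timeout. |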
| for _ in range(int(time_available) + 1): |
| if question_id in completed_question_ids: |
| break |
| time.sleep(1) |
| else: |
| print("Solve timeout - continuing to submission") |
|
|
| completed_question_ids.add(question_id) |
| final_answer = vote_answer(question_id, force_answer=True) |
| assert final_answer is not None |
| time_taken = time.time() - question_start_time |
| print(f"Submitting {final_answer} for {question_id} in {time_taken:.1f}s") |
| save_findings(question_id, final_answer, time_taken) |
| save_stats( |
| question_id=question_id, |
| final_answer=final_answer, |
| time_taken=time_taken, |
| time_available=time_available, |
| active_solvers=question_id_to_active_solver_indexes[question_id], |
| answers=question_id_to_solver_index_to_answer[question_id], |
| proposals=question_id_to_solver_index_to_proposal[question_id], |
| answer_history=dict(question_id_to_solver_index_to_answer_history[question_id]), |
| proposal_history=dict( |
| question_id_to_solver_index_to_proposal_history[question_id] |
| ), |
| submission_history=dict( |
| question_id_to_solver_index_to_submission_history[question_id] |
| ), |
| backtrack_counts=dict( |
| question_id_to_solver_index_to_backtrack_count[question_id] |
| ), |
| main_tokens=dict(question_id_to_solver_index_to_main_token_length[question_id]), |
| support_tokens=dict( |
| question_id_to_solver_index_to_support_token_length[question_id] |
| ), |
| tool_use_counts=dict( |
| question_id_to_solver_index_to_tool_use_count[question_id] |
| ), |
| num_findings=len(question_id_to_findings[question_id]), |
| num_acceptances=sum( |
| len(finding.accepted_solver_indexes_and_reviews) |
| for finding in question_id_to_findings[question_id] |
| ), |
| num_rejections=sum( |
| len(finding.rejected_solver_indexes_and_reviews) |
| for finding in question_id_to_findings[question_id] |
| ), |
| ) |
| return final_answer |
|
|
| |
| if is_on_kaggle_interactive(): |
| solve("What is 1+1?") |
|
|
|
|
| if not is_on_kaggle() and __name__ == "__main__": |
| |
| print("solving") |
|
|
| question_id = "dd7f5e" |
| question_text = """ |
| Let $\\mathcal{F}$ be the set of functions $\\alpha \\colon \\mathbb{Z}\\to \\mathbb{Z}$ for which there are only finitely many $n \\in \\mathbb{Z}$ such that $\\alpha(n) \\neq 0$. |
| |
| For two functions $\\alpha$ and $\\beta$ in $\\mathcal{F}$, define their product $\\alpha\\star\\beta$ to be $\\sum\\limits_{n\\in\\mathbb{Z}} \\alpha(n)\\cdot \\beta(n)$. Also, for $n\\in\\mathbb{Z}$, define a shift operator $S_n \\colon \\mathcal{F}\\to \\mathcal{F}$ by $S_n(\\alpha)(t)=\\alpha(t+n)$ for all $t \\in \\mathbb{Z}$. |
| |
| A function $\\alpha \\in \\mathcal{F}$ is called \\emph{shifty} if |
| \\begin{itemize} |
| \\item $\\alpha(m)=0$ for all integers $m<0$ and $m>8$ and |
| \\item There exists $\\beta \\in \\mathcal{F}$ and integers $k \\neq l$ such that for all $n \\in \\mathbb{Z}$ |
| \\begin{equation*} |
| S_n(\\alpha)\\star\\beta = |
| \\begin{cases} |
| 1 & n \\in \\{k,l\\} \\\\ |
| 0 & n \\not \\in \\{k,l\\} |
| \\end{cases} |
| \\; . |
| \\end{equation*} |
| \\end{itemize} |
| How many shifty functions are there in $\\mathcal{F}$? |
| """.strip() |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
| |
| |
|
|
| question_id = "86e8e5" |
| question_text = """ |
| Let $n \\geq 6$ be a positive integer. We call a positive integer $n$-Norwegian if it has three distinct positive divisors whose sum is equal to $n$. Let $f(n)$ denote the smallest $n$-Norwegian positive integer. Let $M=3^{2025!}$ and for a non-negative integer $c$ define |
| \\begin{equation*} |
| g(c)=\\frac{1}{2025!}\\left\\lfloor \\frac{2025! f(M+c)}{M}\\right\\rfloor. |
| \\end{equation*} |
| We can write |
| \\begin{equation*} |
| g(0)+g(4M)+g(1848374)+g(10162574)+g(265710644)+g(44636594)=\\frac{p}{q} |
| \\end{equation*} |
| where $p$ and $q$ are coprime positive integers. What is the remainder when $p+q$ is divided by $99991$? |
| """.strip() |
|
|
| |
| |
| |
|
|
| |
|
|
| |
| |
|
|
| |
|
|
| |
| |
|
|
| os.makedirs(f"{SOLUTIONS_DIR}/{question_id}", exist_ok=True) |
| solve(question_text, question_id) |
| exit() |
|
|
| |
| |
|
|
| |
| import os |
|
|
| import pandas as pd |
| import polars as pl |
|
|
| import kaggle_evaluation.aimo_3_inference_server |
|
|
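| # Write a local reference.csv copy with the answer column dropped for the local gateway run below. |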
| if is_on_kaggle(): |
| pd.read_csv( |
| "/kaggle/input/ai-mathematical-olympiad-progress-prize-3/reference.csv" |
| ).drop("answer", axis=1).to_csv("reference.csv", index=False) |
|
|
|
|
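| # When committing with run_all_questions_on_kaggle, replicate the reference problems (ids suffixed _0, _1, ..., capped at 5 copies) so each question is attempted several times through the local gateway. |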
| if is_on_kaggle(): |
| if run_all_questions_on_kaggle: |
| |
| replication_count_for_commit_runs = min(replication_count_for_commit_runs, 5) |
|
|
| df = pd.read_csv( |
| "/kaggle/input/ai-mathematical-olympiad-progress-prize-3/reference.csv" |
| ).drop("answer", axis=1) |
| dfs = [] |
| for replication_idx in range(replication_count_for_commit_runs): |
| df_copy = df.copy() |
| df_copy["id"] = df_copy["id"] + f"_{replication_idx}" |
| dfs.append(df_copy) |
| pd.concat(dfs, ignore_index=True).to_csv("reference.csv", index=False) |
|
|
|
|
| |
| |
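| # Called by the Kaggle inference server with one problem per invocation; must return a one-row DataFrame with "id" and "answer". |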
| def predict(id_: pl.Series, problem: pl.Series) -> pl.DataFrame | pd.DataFrame: |
| """Make a prediction.""" |
| |
| question_id: str = id_.item(0) |
| question_text: str = problem.item(0) |
|
|
| if not run_all_questions_on_kaggle: |
| if is_on_kaggle_commit(): |
| if serve_vllm_on_kaggle: |
| |
| if not ( |
| "Norwe" in question_text |
| or "Alice" in question_text |
| or "tournament" in question_text |
| or "KNK" in question_text |
| or "shifty" in question_text |
| ): |
| print("on kaggle commit serving vllm, skipping question") |
| |
| return pl.DataFrame({"id": id_, "answer": 12315}) |
| else: |
| |
| if not ( |
| "shifty" in question_text |
| |
| |
| ): |
| print("on kaggle commit remote vllm, skipping question") |
| |
| return pl.DataFrame({"id": id_, "answer": 12315}) |
|
|
| if not is_on_kaggle(): |
| |
| |
| if not ( |
| "shifty" in question_text |
| or "tournament" in question_text |
| or "KNK" in question_text |
| or "Norwe" in question_text |
| ): |
| print("not on kaggle, skipping question") |
| |
| return pl.DataFrame({"id": id_, "answer": 12315}) |
|
|
| |
| prediction = solve(question_text, question_id=question_id) |
| completed_question_ids.add(question_id) |
| cutoff_times.pop() |
| return pl.DataFrame({"id": id_, "answer": prediction}) |
|
|
|
|
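| # In the hidden competition rerun the inference server is served directly; otherwise the local gateway replays reference.csv through predict(). |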
| inference_server = kaggle_evaluation.aimo_3_inference_server.AIMO3InferenceServer( |
| predict |
| ) |
|
|
| print("Starting submission server") |
| if __name__ == "__main__": |
| if os.getenv("KAGGLE_IS_COMPETITION_RERUN"): |
| inference_server.serve() |
| else: |
| inference_server.run_local_gateway(("reference.csv",)) |
|
|
| |
|
|