bbkdevops's picture
download
raw
8.02 kB
"""ARC-AGI-3 public toolkit evaluation runner for TinyMind."""
from __future__ import annotations
from datetime import datetime, timezone
import hashlib
import json
import os
from pathlib import Path
from typing import Any, Iterable
# Game ids evaluated by run_arc_agi3_eval() when the caller passes no explicit
# `games` and does not request all available games.
DEFAULT_ARC_GAMES = ("ls20", "ft09", "vc33")
def _json_safe(value: Any) -> Any:
    """Recursively convert *value* into something ``json.dumps`` can serialize.

    Conversion rules, applied in order:

    * ``None``, ``str``, ``int``, ``float`` and ``bool`` are returned unchanged.
    * ``dict`` keys are stringified and values converted recursively.
    * ``list`` / ``tuple`` become a ``list`` of converted items.
    * Objects exposing a callable ``tolist()`` (``array.array``; presumably
      also numpy arrays/scalars) are expanded element by element rather than
      stringified, because the ``str()`` form of such containers can be lossy.
    * Objects exposing ``.value`` (enum members and similar wrappers) are
      replaced by their sanitized ``.value``.
    * Anything else falls back to ``str(value)``.

    Args:
        value: Arbitrary object to sanitize.

    Returns:
        A structure made only of ``None``, ``str``, ``int``, ``float``,
        ``bool``, ``list`` and ``dict`` (with ``str`` keys) for the cases above.
    """
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, dict):
        return {str(k): _json_safe(v) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [_json_safe(v) for v in value]

    to_list = getattr(value, "tolist", None)
    if callable(to_list):
        expanded = to_list()
        # Guard against an object whose tolist() returns itself.
        if expanded is not value:
            return _json_safe(expanded)
        return str(value)

    if hasattr(value, "value"):
        inner = value.value
        # The wrapped value may itself be a tuple/dict/object, so sanitize it
        # too; the identity check avoids infinite recursion on self-wrappers.
        if inner is not value:
            return _json_safe(inner)
    return str(value)
def _frame_hash(resp: Any) -> str:
    """Return a SHA-256 hex digest fingerprinting ``resp.frame``.

    A missing or falsy frame hashes as the empty byte string. Otherwise the
    frame is serialized as compact, key-sorted JSON. If anything in that
    process raises, the frame's ``repr`` is hashed instead (best effort).
    """
    frame = getattr(resp, "frame", None)
    try:
        if frame:
            sanitized = _json_safe(frame)
            text = json.dumps(sanitized, sort_keys=True, separators=(",", ":"))
            encoded = text.encode("utf-8")
        else:
            encoded = b""
    except Exception:
        # Fall back to repr so hashing never aborts the evaluation loop.
        encoded = repr(frame).encode("utf-8", errors="replace")
    digest = hashlib.sha256(encoded)
    return digest.hexdigest()
def _choose_action(action_space: list[Any], step: int) -> tuple[Any, dict[str, int] | None, str]:
    """Pick the next action for the deterministic baseline policy.

    Returns an ``(action, data, reason)`` triple: the chosen ``GameAction``,
    the click payload (``{"x": ..., "y": ...}`` for ``ACTION6``, otherwise
    ``None``) and a short label describing the choice.

    Policy:
      * empty action space -> ``ACTION1``;
      * if ``ACTION6`` is offered, use it on every fifth step (``step % 5 == 4``)
        or whenever no other non-RESET action exists, walking a fixed list of
        click coordinates;
      * otherwise cycle through the non-RESET, non-ACTION6 actions, or through
        the whole action space when none of those exist.
    """
    from arcengine import GameAction

    if not action_space:
        return GameAction.ACTION1, None, "fallback_action1"

    # Split the offered actions into "click available?" and plain actions.
    plain_actions = []
    click_offered = False
    for candidate in action_space:
        if candidate == GameAction.ACTION6:
            click_offered = True
        elif candidate != GameAction.RESET:
            plain_actions.append(candidate)

    if click_offered and (not plain_actions or step % 5 == 4):
        # Fixed click targets, visited in order; presumably laid out for a
        # 64x64 grid -- TODO confirm against the environment's frame size.
        click_targets = (
            (32, 32),
            (16, 16),
            (48, 16),
            (16, 48),
            (48, 48),
            (8, 32),
            (56, 32),
            (32, 8),
            (32, 56),
        )
        col, row = click_targets[(step // 5) % len(click_targets)]
        return GameAction.ACTION6, {"x": col, "y": row}, f"spatial_click_{col}_{row}"

    pool = plain_actions or action_space
    picked = pool[step % len(pool)]
    return picked, None, f"cycle_{getattr(picked, 'name', str(picked))}"
def _scorecard_dict(scorecard: Any) -> dict:
    """Convert a scorecard object into a JSON-safe ``dict`` without the API key.

    Args:
        scorecard: ``None``, an object exposing ``model_dump`` (dumped with
            ``api_key`` excluded), or any other value accepted by
            ``_json_safe``.

    Returns:
        ``{}`` for ``None``; otherwise always a ``dict``. A payload that does
        not sanitize to a dict is wrapped as ``{"raw": payload}`` so callers
        can rely on ``.get`` instead of crashing after the evaluation has run.
    """
    if scorecard is None:
        return {}
    if hasattr(scorecard, "model_dump"):
        payload = _json_safe(scorecard.model_dump(exclude={"api_key"}))
    else:
        payload = _json_safe(scorecard)
    if not isinstance(payload, dict):
        return {"raw": payload}
    # Keep the plain-dict path consistent with the model_dump path: the API
    # key must never be written to disk. _json_safe returned a fresh dict, so
    # the caller's object is not mutated.
    payload.pop("api_key", None)
    return payload
def _available_base_games(arcade: Any) -> list[str]:
    """Return the sorted, de-duplicated base game ids the arcade exposes.

    The base id is the part of each environment's ``game_id`` (as a string)
    that precedes its first ``-``; ids without a dash are used whole.
    """
    unique_bases = set()
    for env in arcade.available_environments:
        base, _, _ = str(env.game_id).partition("-")
        unique_bases.add(base)
    return sorted(unique_bases)
def _play_single_game(arc: Any, game_id: str, seed: Any, card_id: Any, max_steps: int) -> dict:
    """Play one game with the deterministic baseline policy and summarize it.

    Any ``Exception`` raised while creating, resetting or stepping the
    environment is recorded in the ``error`` field instead of propagating, so
    one failing game does not abort the whole evaluation.

    Args:
        arc: Arcade instance used to create the environment.
        game_id: Identifier passed to ``arc.make``.
        seed: Seed forwarded to ``arc.make``.
        card_id: Scorecard id the run is attached to.
        max_steps: Upper bound on the number of ``env.step`` calls.

    Returns:
        Run summary with the keys ``game_id``, ``started``,
        ``steps_attempted``, ``levels_completed``, ``state``,
        ``state_changes`` and ``error``.
    """
    run: dict[str, Any] = {
        "game_id": game_id,
        "started": False,
        "steps_attempted": 0,
        "levels_completed": 0,
        "state": "missing",
        # Always present so every run shares one schema, even with no changes.
        "state_changes": 0,
        "error": None,
    }
    try:
        env = arc.make(
            game_id,
            seed=seed,
            scorecard_id=card_id,
            save_recording=True,
            include_frame_data=False,
        )
        if env is None:
            run["error"] = "environment_unavailable"
            return run
        obs = env.reset()
        run["started"] = obs is not None
        last_hash = _frame_hash(obs)
        for step in range(max_steps):
            action, data, reason = _choose_action(env.action_space, step)
            obs = env.step(
                action,
                data=data,
                reasoning={"policy": "deterministic_public_baseline", "reason": reason, "step": step},
            )
            run["steps_attempted"] = step + 1
            if obs is None:
                run["error"] = "step_returned_none"
                break
            # Count observable frame changes as a cheap signal that the
            # policy's actions have any effect.
            current_hash = _frame_hash(obs)
            if current_hash != last_hash:
                run["state_changes"] += 1
                last_hash = current_hash
            run["levels_completed"] = int(getattr(obs, "levels_completed", 0) or 0)
            state = getattr(obs, "state", None)
            run["state"] = getattr(state, "value", str(state))
            if run["state"] in {"WIN", "GAME_OVER"}:
                break
    except Exception as exc:
        run["error"] = f"{type(exc).__name__}: {exc}"
    return run


def run_arc_agi3_eval(
    out_dir: str | Path,
    games: Iterable[str] | None = None,
    max_steps: int = 128,
    seed: int = 0,
    use_all_available: bool = False,
) -> dict:
    """Run the deterministic baseline on ARC-AGI-3 games and write a report.

    Creates ``out_dir`` (with ``environment_files`` and ``recordings`` paths
    handed to the arcade), opens a scorecard, plays each selected game for up
    to ``max_steps`` steps, closes the scorecard, and writes three files into
    ``out_dir``: ``arc_agi3_eval_report.json``, ``arc_agi3_scorecard.json``
    and ``arc_agi3_eval_report.md``.

    Args:
        out_dir: Directory receiving all outputs; created if missing.
        games: Game ids to play. A single string is treated as one game id.
            Falsy values select ``DEFAULT_ARC_GAMES``.
        max_steps: Maximum steps per game; negative values play zero steps.
        seed: Base seed; game ``i`` in the selection uses ``seed + i``.
        use_all_available: When true, ignore ``games`` and play every base
            game the arcade reports as available.

    Returns:
        The report dict that was written to ``arc_agi3_eval_report.json``.
        Only whether ``ARC_API_KEY`` is set is recorded, never its value.
    """
    import arc_agi

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    env_dir = out / "environment_files"
    recordings_dir = out / "recordings"
    api_key_present = bool(os.environ.get("ARC_API_KEY"))
    arc = arc_agi.Arcade(environments_dir=str(env_dir), recordings_dir=str(recordings_dir))
    available_games = _available_base_games(arc)

    # A bare string is itself an Iterable[str]; without this guard list("ls20")
    # would select the "games" 'l', 's', '2', '0'.
    if isinstance(games, str):
        games = [games]
    selected_games = list(games or DEFAULT_ARC_GAMES)
    if use_all_available:
        # Copy so the two report fields never share one list object.
        selected_games = list(available_games)

    card_id = arc.create_scorecard(
        source_url="https://docs.arcprize.org/",
        tags=["tinymind", "arc-agi-3", "public-toolkit-smoke"],
        opaque={"agent": "tinymind_deterministic_public_baseline", "max_steps": int(max_steps)},
    )

    step_budget = max(0, int(max_steps))
    runs: list[dict] = []
    for index, game_id in enumerate(selected_games):
        runs.append(_play_single_game(arc, game_id, seed + index, card_id, step_budget))

    scorecard = arc.close_scorecard(card_id)
    scorecard_payload = _scorecard_dict(scorecard)
    report = {
        "schema_version": "tinymind-arc-agi3-public-toolkit-eval-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "docs_url": "https://docs.arcprize.org/",
        "arc_page_url": "https://arcprize.org/arc-agi/3",
        "api_key_present": api_key_present,
        "api_key_saved": False,
        "package": "arc-agi",
        "agent": {
            "name": "tinymind_deterministic_public_baseline",
            "policy": "cycle legal simple actions with sparse spatial clicks; no learned ARC policy yet",
            "max_steps_per_game": int(max_steps),
            "seed": int(seed),
        },
        "available_games": available_games,
        "selected_games": selected_games,
        "runs": runs,
        "scorecard": scorecard_payload,
        # `or 0` maps explicit None values in the scorecard to zero.
        "score": float(scorecard_payload.get("score", 0.0) or 0.0),
        "total_levels_completed": int(scorecard_payload.get("total_levels_completed", 0) or 0),
        "total_levels": int(scorecard_payload.get("total_levels", 0) or 0),
        "total_actions": int(scorecard_payload.get("total_actions", 0) or 0),
        "official_external_rank_claim_allowed": False,
        "notes": (
            "This is a measured public ARC-AGI-3 toolkit run, not a competition rank. "
            "Score depends on the simple baseline policy, not the full TinyMind research target."
        ),
    }

    json_path = out / "arc_agi3_eval_report.json"
    scorecard_path = out / "arc_agi3_scorecard.json"
    md_path = out / "arc_agi3_eval_report.md"
    report["json_path"] = str(json_path)
    report["scorecard_path"] = str(scorecard_path)
    report["markdown_path"] = str(md_path)
    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    scorecard_path.write_text(
        json.dumps(scorecard_payload, ensure_ascii=False, indent=2, sort_keys=True),
        encoding="utf-8",
    )
    md_path.write_text(_markdown(report), encoding="utf-8")
    return report
def _markdown(report: dict) -> str:
    """Render the headline numbers of *report* as a short Markdown summary.

    Reads ``score``, ``total_levels_completed``, ``total_levels``,
    ``total_actions``, ``selected_games``, ``api_key_present`` and ``notes``.
    The output ends with a trailing newline.
    """
    lines = ["# TinyMind ARC-AGI-3 Public Toolkit Eval", ""]
    lines.append(f"- Score: {report['score']}")
    lines.append(f"- Levels completed: {report['total_levels_completed']}/{report['total_levels']}")
    lines.append(f"- Total actions: {report['total_actions']}")
    lines.append(f"- Games: {', '.join(report['selected_games'])}")
    lines.append(f"- API key present: {report['api_key_present']}")
    # These two lines are fixed text, not derived from the report.
    lines.append("- API key saved: false")
    lines.append("- Official external rank claim allowed: false")
    lines.append("")
    lines.append(report["notes"])
    # The final empty entry makes the joined text end with a newline.
    lines.append("")
    return "\n".join(lines)

Xet Storage Details

Size:
8.02 kB
·
Xet hash:
0ba11b242cab2c190d45b7f769160af86ac757cdafb08154f392cc13113210ce

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.