Buckets:
bbkdevops/unicosys-hypergraph-bucket/tinymind-native-colab-handoff/bundle/evaluation/arc_agi3_eval.py
| """ARC-AGI-3 public toolkit evaluation runner for TinyMind.""" | |
| from __future__ import annotations | |
| from datetime import datetime, timezone | |
| import hashlib | |
| import json | |
| import os | |
| from pathlib import Path | |
| from typing import Any, Iterable | |
| DEFAULT_ARC_GAMES = ("ls20", "ft09", "vc33") | |
| def _json_safe(value: Any) -> Any: | |
| if value is None or isinstance(value, (str, int, float, bool)): | |
| return value | |
| if isinstance(value, dict): | |
| return {str(k): _json_safe(v) for k, v in value.items()} | |
| if isinstance(value, (list, tuple)): | |
| return [_json_safe(v) for v in value] | |
| if hasattr(value, "value"): | |
| return value.value | |
| return str(value) | |
| def _frame_hash(resp: Any) -> str: | |
| frame = getattr(resp, "frame", None) | |
| try: | |
| if not frame: | |
| payload = b"" | |
| else: | |
| payload = json.dumps(_json_safe(frame), sort_keys=True, separators=(",", ":")).encode("utf-8") | |
| except Exception: | |
| payload = repr(frame).encode("utf-8", errors="replace") | |
| return hashlib.sha256(payload).hexdigest() | |
def _choose_action(action_space: list[Any], step: int) -> tuple[Any, dict[str, int] | None, str]:
    """Pick the baseline action for ``step`` from ``action_space``.

    Returns:
        A ``(action, data, reason)`` triple. ``data`` is an ``{"x", "y"}``
        dict for ``ACTION6`` clicks and ``None`` otherwise; ``reason`` is a
        short label describing why the action was chosen.
    """
    from arcengine import GameAction

    if not action_space:
        return GameAction.ACTION1, None, "fallback_action1"

    # Split the space into "ACTION6 is offered" and every action that is
    # neither ACTION6 nor RESET.
    simple_actions = []
    click_available = False
    for candidate in action_space:
        if candidate == GameAction.ACTION6:
            click_available = True
        elif candidate != GameAction.RESET:
            simple_actions.append(candidate)

    # Click on every fifth step, or on every step when nothing else is offered.
    if click_available and (not simple_actions or step % 5 == 4):
        click_targets = (
            (32, 32),
            (16, 16),
            (48, 16),
            (16, 48),
            (48, 48),
            (8, 32),
            (56, 32),
            (32, 8),
            (32, 56),
        )
        click_x, click_y = click_targets[(step // 5) % len(click_targets)]
        return (
            GameAction.ACTION6,
            {"x": click_x, "y": click_y},
            f"spatial_click_{click_x}_{click_y}",
        )

    # Otherwise cycle through the simple actions, or through the whole space
    # when no simple action exists.
    pool = simple_actions or action_space
    chosen = pool[step % len(pool)]
    label = getattr(chosen, "name", str(chosen))
    return chosen, None, f"cycle_{label}"
| def _scorecard_dict(scorecard: Any) -> dict: | |
| if scorecard is None: | |
| return {} | |
| if hasattr(scorecard, "model_dump"): | |
| return _json_safe(scorecard.model_dump(exclude={"api_key"})) | |
| return _json_safe(scorecard) | |
| def _available_base_games(arcade: Any) -> list[str]: | |
| bases = sorted({str(env.game_id).split("-", 1)[0] for env in arcade.available_environments}) | |
| return bases | |
def _run_single_game(arc: Any, game_id: str, seed: int, card_id: Any, step_budget: int) -> dict:
    """Play one game with the deterministic baseline and return its run record.

    Any exception raised while creating or stepping the environment is recorded
    in the record's ``error`` field instead of propagating, so one failing game
    does not abort the remaining ones.
    """
    run: dict = {
        "game_id": game_id,
        "started": False,
        "steps_attempted": 0,
        "levels_completed": 0,
        "state": "missing",
        # Always present so every run record shares the same schema.
        "state_changes": 0,
        "error": None,
    }
    try:
        env = arc.make(
            game_id,
            seed=seed,
            scorecard_id=card_id,
            save_recording=True,
            include_frame_data=False,
        )
        if env is None:
            run["error"] = "environment_unavailable"
            return run
        obs = env.reset()
        run["started"] = obs is not None
        last_hash = _frame_hash(obs)
        for step in range(step_budget):
            action, data, reason = _choose_action(env.action_space, step)
            obs = env.step(
                action,
                data=data,
                reasoning={"policy": "deterministic_public_baseline", "reason": reason, "step": step},
            )
            run["steps_attempted"] = step + 1
            if obs is None:
                run["error"] = "step_returned_none"
                break
            # Count a state change whenever the frame hash differs from the
            # previously observed one.
            current_hash = _frame_hash(obs)
            if current_hash != last_hash:
                run["state_changes"] += 1
                last_hash = current_hash
            run["levels_completed"] = int(getattr(obs, "levels_completed", 0) or 0)
            state = getattr(obs, "state", None)
            run["state"] = getattr(state, "value", str(state))
            if run["state"] in {"WIN", "GAME_OVER"}:
                break
    except Exception as exc:
        run["error"] = f"{type(exc).__name__}: {exc}"
    return run


def run_arc_agi3_eval(
    out_dir: str | Path,
    games: Iterable[str] | None = None,
    max_steps: int = 128,
    seed: int = 0,
    use_all_available: bool = False,
) -> dict:
    """Run the deterministic baseline on ARC-AGI-3 games and write reports.

    Args:
        out_dir: Directory for the reports; created if missing. The
            ``environment_files`` and ``recordings`` paths handed to the
            arcade live underneath it.
        games: Game ids to play. A single string is treated as one game id.
            ``None`` or an empty iterable selects ``DEFAULT_ARC_GAMES``.
        max_steps: Step budget per game; negative values play zero steps.
        seed: Base seed; the game at position ``i`` uses ``seed + i``.
        use_all_available: When true, play every base game the arcade lists
            and ignore ``games``.

    Returns:
        The report dict that is also written to ``arc_agi3_eval_report.json``.
        The scorecard alone goes to ``arc_agi3_scorecard.json`` and a summary
        to ``arc_agi3_eval_report.md``.
    """
    import arc_agi

    # Validate the budget before any scorecard is opened.
    max_steps = int(max_steps)
    step_budget = max(0, max_steps)

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    env_dir = out / "environment_files"
    recordings_dir = out / "recordings"
    # Only the presence of the key is reported, never its value.
    api_key_present = bool(os.environ.get("ARC_API_KEY"))
    arc = arc_agi.Arcade(environments_dir=str(env_dir), recordings_dir=str(recordings_dir))
    available_games = _available_base_games(arc)

    # ``list("ls20")`` would split a bare string into characters, and an empty
    # iterator is truthy, so materialize first and then apply the default.
    requested = [games] if isinstance(games, str) else list(games or ())
    selected_games = requested or list(DEFAULT_ARC_GAMES)
    if use_all_available:
        # Copy so the two report fields do not share one list object.
        selected_games = list(available_games)

    card_id = arc.create_scorecard(
        source_url="https://docs.arcprize.org/",
        tags=["tinymind", "arc-agi-3", "public-toolkit-smoke"],
        opaque={"agent": "tinymind_deterministic_public_baseline", "max_steps": max_steps},
    )
    runs: list[dict] = [
        _run_single_game(arc, game_id, seed + index, card_id, step_budget)
        for index, game_id in enumerate(selected_games)
    ]
    scorecard = arc.close_scorecard(card_id)
    scorecard_payload = _scorecard_dict(scorecard)

    report = {
        "schema_version": "tinymind-arc-agi3-public-toolkit-eval-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "docs_url": "https://docs.arcprize.org/",
        "arc_page_url": "https://arcprize.org/arc-agi/3",
        "api_key_present": api_key_present,
        "api_key_saved": False,
        "package": "arc-agi",
        "agent": {
            "name": "tinymind_deterministic_public_baseline",
            "policy": "cycle legal simple actions with sparse spatial clicks; no learned ARC policy yet",
            "max_steps_per_game": max_steps,
            "seed": int(seed),
        },
        "available_games": available_games,
        "selected_games": selected_games,
        "runs": runs,
        "scorecard": scorecard_payload,
        # ``or 0`` maps an explicit ``None`` in the scorecard to zero.
        "score": float(scorecard_payload.get("score", 0.0) or 0.0),
        "total_levels_completed": int(scorecard_payload.get("total_levels_completed", 0) or 0),
        "total_levels": int(scorecard_payload.get("total_levels", 0) or 0),
        "total_actions": int(scorecard_payload.get("total_actions", 0) or 0),
        "official_external_rank_claim_allowed": False,
        "notes": (
            "This is a measured public ARC-AGI-3 toolkit run, not a competition rank. "
            "Score depends on the simple baseline policy, not the full TinyMind research target."
        ),
    }

    json_path = out / "arc_agi3_eval_report.json"
    scorecard_path = out / "arc_agi3_scorecard.json"
    md_path = out / "arc_agi3_eval_report.md"
    report["json_path"] = str(json_path)
    report["scorecard_path"] = str(scorecard_path)
    report["markdown_path"] = str(md_path)
    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    scorecard_path.write_text(json.dumps(scorecard_payload, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    md_path.write_text(_markdown(report), encoding="utf-8")
    return report
| def _markdown(report: dict) -> str: | |
| return "\n".join( | |
| [ | |
| "# TinyMind ARC-AGI-3 Public Toolkit Eval", | |
| "", | |
| f"- Score: {report['score']}", | |
| f"- Levels completed: {report['total_levels_completed']}/{report['total_levels']}", | |
| f"- Total actions: {report['total_actions']}", | |
| f"- Games: {', '.join(report['selected_games'])}", | |
| f"- API key present: {report['api_key_present']}", | |
| "- API key saved: false", | |
| "- Official external rank claim allowed: false", | |
| "", | |
| report["notes"], | |
| "", | |
| ] | |
| ) | |
Xet Storage Details
- Size: 8.02 kB
- Xet hash: 0ba11b242cab2c190d45b7f769160af86ac757cdafb08154f392cc13113210ce
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.