# =============================================================================
# SpatialBench Experiment Configuration
# =============================================================================
# This file is the single source of truth for all experiments.
# Add a new model by adding an entry under `models`.
# Add a new grid size by extending `grid_sizes` in any task.
# All paths are relative to llm-maze-solver/ (the repo root).
# =============================================================================
# ---------------------------------------------------------------------------
# Global defaults — overridden per-task or per-experiment as needed
# ---------------------------------------------------------------------------
defaults:
  n_test_mazes: 50
  seed: 42
  temperature: 0.1
  max_tokens: 8192
  sbatch:
    cpus: 2
    mem: "8G"
    time: "10:00:00"
    partition: "short"
    log_dir: "maze-solver/eval_llm_logs"
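# Per-task `sbatch` blocks below shadow these defaults key-by-key. A minimal
# illustration of the intended precedence (the deep-merge itself is an
# assumption about the consuming scripts, not something this file enforces):
#
#   compositional_distance.sbatch.time  -> "06:30:00" (overridden below)
#   compositional_distance.sbatch.cpus  -> 2          (falls through from here)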
# ---------------------------------------------------------------------------
# Models
# Each entry defines the model identifier used in API calls and the
# environment variable that must hold the API key.
# ---------------------------------------------------------------------------
models:
  gemini-2.5-flash:
    api_key_env: GEMINI_API_KEY
    display_name: "Gemini 2.5 Flash"
  gpt-5-mini:
    api_key_env: OPENAI_API_KEY
    display_name: "GPT-5 Mini"
  claude-haiku-4-5:
    api_key_env: ANTHROPIC_API_KEY
    display_name: "Claude Haiku 4.5"
  deepseek-chat:
    api_key_env: DEEPSEEK_API_KEY
    display_name: "DeepSeek Chat"
# ---------------------------------------------------------------------------
# Maze Navigation (Planning)
# Paper: Table 1, Table 5 (3-shot), Table 6 (5-shot)
# Script: maze-solver/eval_llm_maze_solver.py
# ---------------------------------------------------------------------------
maze_navigation:
  description: >
    Models find shortest paths through mazes represented in two formats
    (raw tokenized adjacency lists vs visual character grids), tested
    across k-shot settings and three prompting strategies.
  script: "maze-solver/eval_llm_maze_solver.py"
  working_dir: "maze-solver"
  output_base: "maze-solver/llm-maze-evaluation-results"
  # Grid sizes: paper used 5-9; extend freely up to maze-dataset limits
  grid_sizes: [5, 6, 7, 8, 9]
  # Input representations
  input_formats: ["raw", "visual"]
  # Prompting strategies (maps to script flags)
  prompt_strategies:
    base:
      flags: []
      display_name: "Base"
    cot:
      flags: ["--chain_of_thought"]
      display_name: "Chain-of-Thought"
    reasoning:
      flags: ["--reasoning"]
      display_name: "Post-hoc Reasoning"
  # K-shot values tested simultaneously in one script run
  k_shots: "0,3,5"
  # Fixed params
  maze_type: "cycles"
  percolation_p: 0.2
  visualize: true
  sbatch:
    time: "10:00:00"
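# Run-matrix arithmetic per model, assuming one script invocation per
# (grid size, input format, strategy) triple: 5 x 2 x 3 = 30 invocations,
# with the three k-shot values (0, 3, 5) evaluated inside each one.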
# ---------------------------------------------------------------------------
# Sequential Reasoning with Point Reuse (Q3 = Q0)
# Paper: Table 2, Table 7
# Script: maze-solver/spatial_reasoning/eval_proximity_comparison.py
# ---------------------------------------------------------------------------
point_reuse:
  description: >
    Models answer four sequential proximity questions about the same maze.
    Q3 is identical to Q0, probing whether models reuse previously
    computed spatial information or treat each question independently.
  script: "maze-solver/spatial_reasoning/eval_proximity_comparison.py"
  working_dir: "spatial_reasoning/spatial_reasoning_experiments"
  # Paper used 5-9; extend freely
  grid_sizes: [5, 6, 7, 8, 9]
  input_format: "raw"
  strategy: "point_reuse"
  reuse_pattern: "last_first_same"
  n_questions_per_maze: 4
  sequential_questions: true
  # Prompting strategies
  prompt_strategies:
    base:
      prompt_type: "baseline"
      display_name: "Base"
    cot:
      prompt_type: "cot"
      display_name: "Chain-of-Thought"
    reasoning:
      prompt_type: "reasoning"
      display_name: "Post-hoc Reasoning"
  output_base: "spatial_reasoning/spatial-reasoning-results-point-reuse-q3-q0"
  visualize: true
  save_details: true
  sbatch:
    time: "10:30:00"
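# Volume sketch, assuming this task inherits the global n_test_mazes of 50:
# each (grid size, strategy) run asks 50 mazes x 4 questions = 200 sequential
# questions, of which the 50 Q3s are exact repeats of their Q0s.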
# ---------------------------------------------------------------------------
# Compositional Distance Comparison
# Paper: Table 3, Table 8, Table 9
# Script: maze-solver/spatial_reasoning/eval_extended_experiments.py
# Corner pattern: corners_to_center (Q0: top-left→center,
#                                    Q1: bottom-right→center,
#                                    Q2: corner→corner compositional)
# ---------------------------------------------------------------------------
compositional_distance:
  description: >
    Models answer three questions about maze corners (A=top-left,
    B=top-right, C=bottom-left, D=bottom-right) and center M.
    Q2 can be composed from information established in Q0 and Q1,
    probing whether models build cumulative spatial knowledge.
  script: "maze-solver/spatial_reasoning/eval_extended_experiments.py"
  working_dir: "spatial_reasoning/spatial_reasoning_experiments"
  # Paper reported 5-9 in Tables 8/9; scripts originally ran only 5-7.
  # Extended here to match the paper.
  grid_sizes: [5, 6, 7, 8, 9]
  input_format: "raw"
  strategy: "orthogonal"
  corner_pattern: "corners_to_center"  # matches paper Q0/Q1/Q2 design
  n_questions_per_maze: 3
  # Prompting strategies
  prompt_strategies:
    base:
      prompt_type: "baseline"
      display_name: "Base"
    cot:
      prompt_type: "cot"
      display_name: "Chain-of-Thought"
    reasoning:
      prompt_type: "reasoning"
      display_name: "Post-hoc Reasoning"
  output_base: "spatial_reasoning/spatial-reasoning-results-orthogonal"
  visualize: true
  save_details: true
  sbatch:
    time: "06:30:00"
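# Composition sketch (an assumption about the questions' shape; exact wording
# lives in the script): Q0 establishes d(A, M) and Q1 establishes d(D, M),
# so Q2's corner-to-corner comparison is answerable from those two values
# alone; a model that retains its Q0/Q1 answers need not re-search the maze.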