# =============================================================================
# SpatialBench Experiment Configuration
# =============================================================================
# This file is the single source of truth for all experiments.
# Add a new model by adding an entry under `models`.
# Add a new grid size by extending `grid_sizes` in any task.
# All paths are relative to llm-maze-solver/ (the repo root).
# =============================================================================

# ---------------------------------------------------------------------------
# Global defaults — overridden per-task or per-experiment as needed
# ---------------------------------------------------------------------------
defaults:
  n_test_mazes: 50
  seed: 42
  temperature: 0.1
  max_tokens: 8192
  sbatch:
    cpus: 2
    mem: "8G"
    time: "10:00:00"
    partition: "short"
  log_dir: "maze-solver/eval_llm_logs"

# ---------------------------------------------------------------------------
# Models
# Each entry defines the model identifier used in API calls and the
# environment variable that must hold the API key.
# ---------------------------------------------------------------------------
models:
  gemini-2.5-flash:
    api_key_env: GEMINI_API_KEY
    display_name: "Gemini 2.5 Flash"
  gpt-5-mini:
    api_key_env: OPENAI_API_KEY
    display_name: "GPT-5 Mini"
  claude-haiku-4-5:
    api_key_env: ANTHROPIC_API_KEY
    display_name: "Claude Haiku 4.5"
  deepseek-chat:
    api_key_env: DEEPSEEK_API_KEY
    display_name: "DeepSeek Chat"

# ---------------------------------------------------------------------------
# Maze Navigation (Planning)
# Paper: Table 1, Table 5 (3-shot), Table 6 (5-shot)
# Script: maze-solver/eval_llm_maze_solver.py
# ---------------------------------------------------------------------------
maze_navigation:
  description: >
    Models find shortest paths through mazes represented in two formats
    (raw tokenized adjacency lists vs visual character grids), tested
    across k-shot settings and three prompting strategies.
  script: "maze-solver/eval_llm_maze_solver.py"
  working_dir: "maze-solver"
  output_base: "maze-solver/llm-maze-evaluation-results"

  # Grid sizes: paper used 5-9; extend freely up to maze-dataset limits
  grid_sizes: [5, 6, 7, 8, 9]

  # Input representations
  input_formats: ["raw", "visual"]

  # Prompting strategies (maps to script flags)
  prompt_strategies:
    base:
      flags: []
      display_name: "Base"
    cot:
      flags: ["--chain_of_thought"]
      display_name: "Chain-of-Thought"
    reasoning:
      flags: ["--reasoning"]
      display_name: "Post-hoc Reasoning"

  # K-shot values tested simultaneously in one script run
  k_shots: "0,3,5"

  # Fixed params
  maze_type: "cycles"
  percolation_p: 0.2
  visualize: true

  sbatch:
    time: "10:00:00"
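# ---------------------------------------------------------------------------
# Sketch of the intended `sbatch` override behavior (an assumption about the
# runner, not something this file enforces): a task-level `sbatch` block
# shadows matching keys in `defaults.sbatch`, while unset keys (cpus, mem,
# partition) fall back to the defaults. A hypothetical heavier run could
# therefore override more than `time`, e.g.:
#
#   sbatch:
#     time: "24:00:00"
#     mem: "16G"      # would shadow defaults.sbatch.mem
# ---------------------------------------------------------------------------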
script: "maze-solver/spatial_reasoning/eval_proximity_comparison.py" working_dir: "spatial_reasoning/spatial_reasoning_experiments" # Paper used 5-9; extend freely grid_sizes: [5, 6, 7, 8, 9] input_format: "raw" strategy: "point_reuse" reuse_pattern: "last_first_same" n_questions_per_maze: 4 sequential_questions: true # Prompting strategies prompt_strategies: base: prompt_type: "baseline" display_name: "Base" cot: prompt_type: "cot" display_name: "Chain-of-Thought" reasoning: prompt_type: "reasoning" display_name: "Post-hoc Reasoning" output_base: "spatial_reasoning/spatial-reasoning-results-point-reuse-q3-q0" visualize: true save_details: true sbatch: time: "10:30:00" # --------------------------------------------------------------------------- # Compositional Distance Comparison # Paper: Table 3, Table 8, Table 9 # Script: maze-solver/spatial_reasoning/eval_extended_experiments.py # Corner pattern: corners_to_center (Q0: top-left→center, # Q1: bottom-right→center, # Q2: corner→corner compositional) # --------------------------------------------------------------------------- compositional_distance: description: > Models answer three questions about maze corners (A=top-left, B=top-right, C=bottom-left, D=bottom-right) and center M. Q2 can be composed from information established in Q0 and Q1, probing whether models build cumulative spatial knowledge. script: "maze-solver/spatial_reasoning/eval_extended_experiments.py" working_dir: "spatial_reasoning/spatial_reasoning_experiments" # Paper reported 5-9 in Tables 8/9; scripts originally only ran 5-7 # Extended to match paper grid_sizes: [5, 6, 7, 8, 9] input_format: "raw" strategy: "orthogonal" corner_pattern: "corners_to_center" # matches paper Q0/Q1/Q2 design n_questions_per_maze: 3 # Prompting strategies prompt_strategies: base: prompt_type: "baseline" display_name: "Base" cot: prompt_type: "cot" display_name: "Chain-of-Thought" reasoning: prompt_type: "reasoning" display_name: "Post-hoc Reasoning" output_base: "spatial_reasoning/spatial-reasoning-results-orthogonal" visualize: true save_details: true sbatch: time: "06:30:00"