File size: 5,815 Bytes
cffeecf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# =============================================================================
# SpatialBench Experiment Configuration
# =============================================================================
# This file is the single source of truth for all experiments.
# Add a new model by adding an entry under `models`.
# Add a new grid size by extending `grid_sizes` in any task.
# All paths are relative to llm-maze-solver/ (the repo root).
# =============================================================================

# ---------------------------------------------------------------------------
# Global defaults — overridden per-task or per-experiment as needed
# ---------------------------------------------------------------------------
defaults:
  # Evaluation settings applied to every task unless overridden.
  n_test_mazes: 50
  seed: 42
  temperature: 0.1
  max_tokens: 8192

  # Batch-job settings. The task sections in this file carry their own
  # `sbatch.time`; how the two levels are merged is up to the loader.
  sbatch:
    cpus: 2
    mem: '8G'
    # Quoted on purpose: an unquoted digits-and-colons value can be read
    # as a sexagesimal number by YAML 1.1 parsers.
    time: '10:00:00'
    partition: 'short'
    log_dir: 'maze-solver/eval_llm_logs'

# ---------------------------------------------------------------------------
# Models
# Each entry defines the model identifier used in API calls and the
# environment variable that must hold the API key.
# ---------------------------------------------------------------------------
models:
  # Key          = model identifier used in API calls.
  # api_key_env  = name of the environment variable holding the API key
  #                (the key itself is never stored in this file).
  # display_name = human-readable label for the model.
  gemini-2.5-flash:
    api_key_env: 'GEMINI_API_KEY'
    display_name: 'Gemini 2.5 Flash'

  gpt-5-mini:
    api_key_env: 'OPENAI_API_KEY'
    display_name: 'GPT-5 Mini'

  claude-haiku-4-5:
    api_key_env: 'ANTHROPIC_API_KEY'
    display_name: 'Claude Haiku 4.5'

  deepseek-chat:
    api_key_env: 'DEEPSEEK_API_KEY'
    display_name: 'DeepSeek Chat'

# ---------------------------------------------------------------------------
# Maze Navigation (Planning)
# Paper: Table 1, Table 5 (3-shot), Table 6 (5-shot)
# Script: maze-solver/eval_llm_maze_solver.py
# ---------------------------------------------------------------------------
maze_navigation:
  description: >
    Models find shortest paths through mazes represented in two
    formats (raw tokenized adjacency lists vs visual character grids),
    tested across k-shot settings and three prompting strategies.
  script: 'maze-solver/eval_llm_maze_solver.py'
  working_dir: 'maze-solver'
  output_base: 'maze-solver/llm-maze-evaluation-results'

  # Grid sizes. The paper used 5-9; larger values may be added up to
  # whatever maze-dataset supports.
  grid_sizes:
    - 5
    - 6
    - 7
    - 8
    - 9

  # Maze input representations to evaluate.
  input_formats:
    - 'raw'
    - 'visual'

  # Prompting strategies. `flags` lists the extra command-line flags
  # passed to the script for that strategy (none for the base prompt).
  prompt_strategies:
    base:
      flags: []
      display_name: 'Base'
    cot:
      flags:
        - '--chain_of_thought'
      display_name: 'Chain-of-Thought'
    reasoning:
      flags:
        - '--reasoning'
      display_name: 'Post-hoc Reasoning'

  # All k-shot values are evaluated in a single script run, so this is
  # one comma-separated string rather than a YAML list.
  k_shots: '0,3,5'

  # Parameters held fixed across every run of this task.
  maze_type: 'cycles'
  percolation_p: 0.2
  visualize: true

  # Task-level batch settings (same wall-time value as `defaults.sbatch`).
  sbatch:
    time: '10:00:00'

# ---------------------------------------------------------------------------
# Sequential Reasoning with Point Reuse (Q3 = Q0)
# Paper: Table 2, Table 7
# Script: maze-solver/spatial_reasoning/eval_proximity_comparison.py
# ---------------------------------------------------------------------------
point_reuse:
  description: >
    Models answer four sequential proximity questions about the same
    maze. Q3 is identical to Q0, probing whether models reuse
    previously computed spatial information or treat each question
    independently.
  script: 'maze-solver/spatial_reasoning/eval_proximity_comparison.py'
  # NOTE(review): `script` starts with `maze-solver/`, but `working_dir`
  # and `output_base` in this section do not, although the file header
  # says all paths are relative to the repo root. Confirm which base
  # directory the launcher resolves these two against.
  working_dir: 'spatial_reasoning/spatial_reasoning_experiments'

  # Grid sizes. The paper used 5-9; more may be added.
  grid_sizes:
    - 5
    - 6
    - 7
    - 8
    - 9

  input_format: 'raw'
  strategy: 'point_reuse'
  # Four questions per maze, with the last repeating the first (Q3 = Q0).
  reuse_pattern: 'last_first_same'
  n_questions_per_maze: 4
  sequential_questions: true

  # Prompting strategies, selected by `prompt_type`.
  prompt_strategies:
    base:
      prompt_type: 'baseline'
      display_name: 'Base'
    cot:
      prompt_type: 'cot'
      display_name: 'Chain-of-Thought'
    reasoning:
      prompt_type: 'reasoning'
      display_name: 'Post-hoc Reasoning'

  output_base: 'spatial_reasoning/spatial-reasoning-results-point-reuse-q3-q0'
  visualize: true
  save_details: true

  # Task-level wall-time setting for this task.
  sbatch:
    time: '10:30:00'

# ---------------------------------------------------------------------------
# Compositional Distance Comparison
# Paper: Table 3, Table 8, Table 9
# Script: maze-solver/spatial_reasoning/eval_extended_experiments.py
# Corner pattern: corners_to_center  (Q0: top-left→center,
#                                     Q1: bottom-right→center,
#                                     Q2: corner→corner compositional)
# ---------------------------------------------------------------------------
compositional_distance:
  description: >
    Models answer three questions about maze corners
    (A=top-left, B=top-right, C=bottom-left, D=bottom-right)
    and center M. Q2 can be composed from information established
    in Q0 and Q1, probing whether models build cumulative spatial
    knowledge.
  script: 'maze-solver/spatial_reasoning/eval_extended_experiments.py'
  # NOTE(review): `script` starts with `maze-solver/`, but `working_dir`
  # and `output_base` in this section do not, although the file header
  # says all paths are relative to the repo root. Confirm which base
  # directory the launcher resolves these two against.
  working_dir: 'spatial_reasoning/spatial_reasoning_experiments'

  # The paper reports sizes 5-9 (Tables 8/9). The scripts originally ran
  # only 5-7; the list was extended here to match the paper.
  grid_sizes:
    - 5
    - 6
    - 7
    - 8
    - 9

  input_format: 'raw'
  strategy: 'orthogonal'
  # Matches the paper's Q0/Q1/Q2 question design.
  corner_pattern: 'corners_to_center'
  n_questions_per_maze: 3

  # Prompting strategies, selected by `prompt_type`.
  prompt_strategies:
    base:
      prompt_type: 'baseline'
      display_name: 'Base'
    cot:
      prompt_type: 'cot'
      display_name: 'Chain-of-Thought'
    reasoning:
      prompt_type: 'reasoning'
      display_name: 'Post-hoc Reasoning'

  output_base: 'spatial_reasoning/spatial-reasoning-results-orthogonal'
  visualize: true
  save_details: true

  # Task-level wall-time setting; quoted so the leading zero and colons
  # stay a plain string.
  sbatch:
    time: '06:30:00'