# Spaces: weijiang99 / SpatialBench — Running — File size: 5,815 Bytes — cffeecf

# =============================================================================
# SpatialBench Experiment Configuration
# =============================================================================
# This file is the single source of truth for all experiments.
# Add a new model by adding an entry under `models`.
# Add a new grid size by extending `grid_sizes` in any task.
# All paths are relative to llm-maze-solver/ (the repo root).
# =============================================================================

# ---------------------------------------------------------------------------
# Global defaults — overridden per-task or per-experiment as needed
# ---------------------------------------------------------------------------
defaults:
  # Settings applied globally; per this file's header they can be overridden
  # per task or per experiment.
  n_test_mazes: 50
  seed: 42
  temperature: 0.1
  max_tokens: 8192
  # Resource settings grouped under `sbatch` (presumably handed to Slurm's
  # sbatch by the launcher — confirm). Each task section in this file also
  # carries its own `sbatch.time`.
  sbatch:
    cpus: 2
    mem: '8G'
    # Keep this quoted: an unquoted digits-and-colons value can be read as a
    # base-60 integer by YAML 1.1 parsers.
    time: '10:00:00'
    partition: 'short'
    log_dir: 'maze-solver/eval_llm_logs'

# ---------------------------------------------------------------------------
# Models
# Each key is the model identifier used in API calls; `api_key_env` names the
# environment variable that must hold the API key.
# ---------------------------------------------------------------------------
models:
  # One mapping per model, keyed by the model identifier.
  #   api_key_env  - name of the environment variable holding the API key
  #   display_name - label for the model (presumably used in reports/tables;
  #                  confirm against the consumer)
  gemini-2.5-flash:
    api_key_env: 'GEMINI_API_KEY'
    display_name: 'Gemini 2.5 Flash'
  gpt-5-mini:
    api_key_env: 'OPENAI_API_KEY'
    display_name: 'GPT-5 Mini'
  claude-haiku-4-5:
    api_key_env: 'ANTHROPIC_API_KEY'
    display_name: 'Claude Haiku 4.5'
  deepseek-chat:
    api_key_env: 'DEEPSEEK_API_KEY'
    display_name: 'DeepSeek Chat'

# ---------------------------------------------------------------------------
# Maze Navigation (Planning)
# Paper: Table 1, Table 5 (3-shot), Table 6 (5-shot)
# Script: maze-solver/eval_llm_maze_solver.py
# ---------------------------------------------------------------------------
maze_navigation:
  # Folded scalar: the wrapped lines below are joined with single spaces.
  description: >
    Models find shortest paths through mazes represented in
    two formats (raw tokenized adjacency lists vs visual
    character grids), tested across k-shot settings and three
    prompting strategies.
  # Script, working directory, and results root for this task.
  script: 'maze-solver/eval_llm_maze_solver.py'
  working_dir: 'maze-solver'
  output_base: 'maze-solver/llm-maze-evaluation-results'

  # Grid sizes to evaluate. The paper used 5-9; more sizes can be appended,
  # up to whatever maze-dataset supports.
  grid_sizes:
    - 5
    - 6
    - 7
    - 8
    - 9

  # Maze input representations.
  input_formats:
    - 'raw'
    - 'visual'

  # Prompting strategies. `flags` holds the script flags each strategy maps
  # to; the base strategy maps to none.
  prompt_strategies:
    base:
      flags: []
      display_name: 'Base'
    cot:
      flags:
        - '--chain_of_thought'
      display_name: 'Chain-of-Thought'
    reasoning:
      flags:
        - '--reasoning'
      display_name: 'Post-hoc Reasoning'

  # K-shot values, given as one comma-separated string; all of them are
  # tested together in a single script run.
  k_shots: '0,3,5'

  # Parameters held fixed for this task.
  maze_type: 'cycles'
  percolation_p: 0.2
  visualize: true

  # Task-level `sbatch.time`; currently the same value as
  # `defaults.sbatch.time`.
  sbatch:
    time: '10:00:00'

# ---------------------------------------------------------------------------
# Sequential Reasoning with Point Reuse (Q3 = Q0)
# Paper: Table 2, Table 7
# Script: maze-solver/spatial_reasoning/eval_proximity_comparison.py
# ---------------------------------------------------------------------------
point_reuse:
  # Folded scalar: the wrapped lines below are joined with single spaces.
  description: >
    Models answer four sequential proximity questions about
    the same maze. Q3 is identical to Q0, probing whether
    models reuse previously computed spatial information or
    treat each question independently.
  script: 'maze-solver/spatial_reasoning/eval_proximity_comparison.py'
  # NOTE(review): the file header says all paths are relative to the repo
  # root, and `script` starts with `maze-solver/`, but `working_dir` here and
  # `output_base` further down do not. Confirm what these two are resolved
  # against before relying on them.
  working_dir: 'spatial_reasoning/spatial_reasoning_experiments'

  # Grid sizes to evaluate. The paper used 5-9; more sizes can be appended.
  grid_sizes:
    - 5
    - 6
    - 7
    - 8
    - 9

  # Question design: four questions per maze, asked sequentially, with the
  # last question repeating the first (Q3 = Q0).
  input_format: 'raw'
  strategy: 'point_reuse'
  reuse_pattern: 'last_first_same'
  n_questions_per_maze: 4
  sequential_questions: true

  # Prompting strategies; each one selects a `prompt_type`.
  prompt_strategies:
    base:
      prompt_type: 'baseline'
      display_name: 'Base'
    cot:
      prompt_type: 'cot'
      display_name: 'Chain-of-Thought'
    reasoning:
      prompt_type: 'reasoning'
      display_name: 'Post-hoc Reasoning'

  # Results root and output toggles.
  output_base: 'spatial_reasoning/spatial-reasoning-results-point-reuse-q3-q0'
  visualize: true
  save_details: true

  # Task-level `sbatch.time` (differs from `defaults.sbatch.time`).
  sbatch:
    time: '10:30:00'

# ---------------------------------------------------------------------------
# Compositional Distance Comparison
# Paper: Table 3, Table 8, Table 9
# Script: maze-solver/spatial_reasoning/eval_extended_experiments.py
# Corner pattern: corners_to_center  (Q0: top-left→center,
#                                     Q1: bottom-right→center,
#                                     Q2: corner→corner compositional)
# ---------------------------------------------------------------------------
compositional_distance:
  # Folded scalar: the wrapped lines below are joined with single spaces.
  description: >
    Models answer three questions about maze corners
    (A=top-left, B=top-right, C=bottom-left, D=bottom-right)
    and center M. Q2 can be composed from information
    established in Q0 and Q1, probing whether models build
    cumulative spatial knowledge.
  script: 'maze-solver/spatial_reasoning/eval_extended_experiments.py'
  # NOTE(review): the file header says all paths are relative to the repo
  # root, and `script` starts with `maze-solver/`, but `working_dir` here and
  # `output_base` further down do not. Confirm what these two are resolved
  # against before relying on them.
  working_dir: 'spatial_reasoning/spatial_reasoning_experiments'

  # Tables 8/9 of the paper report sizes 5-9, whereas the scripts originally
  # ran only 5-7; this list was extended to match the paper.
  grid_sizes:
    - 5
    - 6
    - 7
    - 8
    - 9

  input_format: 'raw'
  strategy: 'orthogonal'
  # `corners_to_center` matches the paper's Q0/Q1/Q2 design.
  corner_pattern: 'corners_to_center'
  n_questions_per_maze: 3

  # Prompting strategies; each one selects a `prompt_type`.
  prompt_strategies:
    base:
      prompt_type: 'baseline'
      display_name: 'Base'
    cot:
      prompt_type: 'cot'
      display_name: 'Chain-of-Thought'
    reasoning:
      prompt_type: 'reasoning'
      display_name: 'Post-hoc Reasoning'

  # Results root and output toggles.
  output_base: 'spatial_reasoning/spatial-reasoning-results-orthogonal'
  visualize: true
  save_details: true

  # Task-level `sbatch.time` (differs from `defaults.sbatch.time`).
  sbatch:
    time: '06:30:00'