# =============================================================================
# SpatialBench Experiment Configuration
# =============================================================================
# This file is the single source of truth for all experiments.
# Add a new model by adding an entry under `models`.
# Add a new grid size by extending `grid_sizes` in any task.
# All paths are relative to llm-maze-solver/ (the repo root).
# =============================================================================
# ---------------------------------------------------------------------------
# Global defaults — overridden per-task or per-experiment as needed
# ---------------------------------------------------------------------------
defaults:
  n_test_mazes: 50
  seed: 42
  temperature: 0.1
  max_tokens: 8192
  sbatch:
    cpus: 2
    mem: "8G"
    time: "10:00:00"
    partition: "short"
    log_dir: "maze-solver/eval_llm_logs"
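# Per-task `sbatch` blocks below shadow these defaults key-by-key. A minimal
# illustration of the intended precedence (the deep-merge itself is an
# assumption about the consuming scripts, not something this file enforces):
#
#   compositional_distance.sbatch.time  -> "06:30:00" (overridden below)
#   compositional_distance.sbatch.cpus  -> 2          (falls through from here)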
# ---------------------------------------------------------------------------
# Models
# Each entry defines the model identifier used in API calls and the
# environment variable that must hold the API key.
# ---------------------------------------------------------------------------
models:
  gemini-2.5-flash:
    api_key_env: GEMINI_API_KEY
    display_name: "Gemini 2.5 Flash"
  gpt-5-mini:
    api_key_env: OPENAI_API_KEY
    display_name: "GPT-5 Mini"
  claude-haiku-4-5:
    api_key_env: ANTHROPIC_API_KEY
    display_name: "Claude Haiku 4.5"
  deepseek-chat:
    api_key_env: DEEPSEEK_API_KEY
    display_name: "DeepSeek Chat"
# ---------------------------------------------------------------------------
# Maze Navigation (Planning)
# Paper: Table 1, Table 5 (3-shot), Table 6 (5-shot)
# Script: maze-solver/eval_llm_maze_solver.py
# ---------------------------------------------------------------------------
maze_navigation:
  description: >
    Models find shortest paths through mazes represented in two formats
    (raw tokenized adjacency lists vs visual character grids), tested
    across k-shot settings and three prompting strategies.
  script: "maze-solver/eval_llm_maze_solver.py"
  working_dir: "maze-solver"
  output_base: "maze-solver/llm-maze-evaluation-results"
  # Grid sizes: paper used 5-9; extend freely up to maze-dataset limits
  grid_sizes: [5, 6, 7, 8, 9]
  # Input representations
  input_formats: ["raw", "visual"]
  # Prompting strategies (maps to script flags)
  prompt_strategies:
    base:
      flags: []
      display_name: "Base"
    cot:
      flags: ["--chain_of_thought"]
      display_name: "Chain-of-Thought"
    reasoning:
      flags: ["--reasoning"]
      display_name: "Post-hoc Reasoning"
  # K-shot values tested simultaneously in one script run
  k_shots: "0,3,5"
  # Fixed params
  maze_type: "cycles"
  percolation_p: 0.2
  visualize: true
  sbatch:
    time: "10:00:00"
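# Run-matrix arithmetic per model, assuming one script invocation per
# (grid size, input format, strategy) triple: 5 x 2 x 3 = 30 invocations,
# with the three k-shot values (0, 3, 5) evaluated inside each one.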
# ---------------------------------------------------------------------------
# Sequential Reasoning with Point Reuse (Q3 = Q0)
# Paper: Table 2, Table 7
# Script: maze-solver/spatial_reasoning/eval_proximity_comparison.py
# ---------------------------------------------------------------------------
point_reuse:
  description: >
    Models answer four sequential proximity questions about the same maze.
    Q3 is identical to Q0, probing whether models reuse previously
    computed spatial information or treat each question independently.
  script: "maze-solver/spatial_reasoning/eval_proximity_comparison.py"
  working_dir: "spatial_reasoning/spatial_reasoning_experiments"
  # Paper used 5-9; extend freely
  grid_sizes: [5, 6, 7, 8, 9]
  input_format: "raw"
  strategy: "point_reuse"
  reuse_pattern: "last_first_same"
  n_questions_per_maze: 4
  sequential_questions: true
  # Prompting strategies
  prompt_strategies:
    base:
      prompt_type: "baseline"
      display_name: "Base"
    cot:
      prompt_type: "cot"
      display_name: "Chain-of-Thought"
    reasoning:
      prompt_type: "reasoning"
      display_name: "Post-hoc Reasoning"
  output_base: "spatial_reasoning/spatial-reasoning-results-point-reuse-q3-q0"
  visualize: true
  save_details: true
  sbatch:
    time: "10:30:00"
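# Volume sketch, assuming this task inherits the global n_test_mazes of 50:
# each (grid size, strategy) run asks 50 mazes x 4 questions = 200 sequential
# questions, of which the 50 Q3s are exact repeats of their Q0s.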
# ---------------------------------------------------------------------------
# Compositional Distance Comparison
# Paper: Table 3, Table 8, Table 9
# Script: maze-solver/spatial_reasoning/eval_extended_experiments.py
# Corner pattern: corners_to_center (Q0: top-left→center,
#                                    Q1: bottom-right→center,
#                                    Q2: corner→corner compositional)
# ---------------------------------------------------------------------------
compositional_distance:
  description: >
    Models answer three questions about maze corners (A=top-left,
    B=top-right, C=bottom-left, D=bottom-right) and center M.
    Q2 can be composed from information established in Q0 and Q1,
    probing whether models build cumulative spatial knowledge.
  script: "maze-solver/spatial_reasoning/eval_extended_experiments.py"
  working_dir: "spatial_reasoning/spatial_reasoning_experiments"
  # Paper reported 5-9 in Tables 8/9; scripts originally ran only 5-7.
  # Extended here to match the paper.
  grid_sizes: [5, 6, 7, 8, 9]
  input_format: "raw"
  strategy: "orthogonal"
  corner_pattern: "corners_to_center"  # matches paper Q0/Q1/Q2 design
  n_questions_per_maze: 3
  # Prompting strategies
  prompt_strategies:
    base:
      prompt_type: "baseline"
      display_name: "Base"
    cot:
      prompt_type: "cot"
      display_name: "Chain-of-Thought"
    reasoning:
      prompt_type: "reasoning"
      display_name: "Post-hoc Reasoning"
  output_base: "spatial_reasoning/spatial-reasoning-results-orthogonal"
  visualize: true
  save_details: true
  sbatch:
    time: "06:30:00"
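# Composition sketch (an assumption about the questions' shape; exact wording
# lives in the script): Q0 establishes d(A, M) and Q1 establishes d(D, M),
# so Q2's corner-to-corner comparison is answerable from those two values
# alone; a model that retains its Q0/Q1 answers need not re-search the maze.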