Spaces:

Rishabh12j
/

DepthLens

Sleeping

Rishabh Jain

Initial upload — depth-aware scene description system

5412d82 about 1 month ago

3 kB

	"""
	Central configuration for the Depth-Aware Scene Description system.
	All constants, paths, and hyperparameters live here.
	"""
	from pathlib import Path

	# ── Paths ──
	# Repository root: two directory levels above this file.
	PROJECT_ROOT = Path(__file__).parent.parent

	# Everything under data/ (inputs, annotations, scratch space).
	DATA_DIR = PROJECT_ROOT.joinpath("data")
	TEST_IMAGES_DIR = DATA_DIR.joinpath("test_images")
	GROUND_TRUTH_DIR = DATA_DIR.joinpath("ground_truth")
	REFERENCES_DIR = DATA_DIR.joinpath("references")
	ARKITSCENES_TMP_DIR = DATA_DIR.joinpath("_arkit_tmp")

	# Everything under outputs/ (generated artefacts).
	OUTPUTS_DIR = PROJECT_ROOT.joinpath("outputs")
	RESULTS_DIR = OUTPUTS_DIR.joinpath("results")

	# ── Camera ──
	# Horizontal field of view in degrees (typical smartphone value).
	HFOV_DEG = 70
	# Nearest and farthest mapped depth, both in centimetres.
	DEPTH_MIN_CM, DEPTH_MAX_CM = 20, 200

	# ── Models ──
	# Detection (YOLO) and depth (Depth Anything V2) model identifiers.
	YOLO_MODEL = "yolov8n.pt"
	DEPTH_MODEL = "depth-anything/Depth-Anything-V2-Small-hf"

	# MobileSAM checkpoint file name and the URL of the hosted weights file.
	MOBILE_SAM_CKPT = "mobile_sam.pt"
	MOBILE_SAM_URL = (
	    "https://github.com/ChaoningZhang/MobileSAM"
	    "/raw/master/weights/mobile_sam.pt"
	)

	# Moondream (Colab T4)
	MOONDREAM_ID = "vikhyatk/moondream2"
	MOONDREAM_REVISION = "2025-06-21"

	# Qwen (RTX 5060)
	QWEN_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
	# Pixel budgets written as 28 * 28 pixels times a token count.
	QWEN_MIN_PIXELS = 28 * 28 * 256  # 256 tokens
	QWEN_MAX_PIXELS = 28 * 28 * 512  # 512 tokens

	# Gemma 4 E2B IT (production — requires transformers>=5.5.0)
	GEMMA4_ID = "google/gemma-4-E2B-it"
	GEMMA4_MAX_NEW_TOKENS = 300

	# ── Detection ──
	# Confidence cutoff for detections.
	# NOTE(review): presumably applied to the YOLO detector's scores — confirm at the call site.
	CONF_THRESHOLD = 0.3

	# ── Evaluation ──
	# Sample/run counts for the evaluation stages; how each is consumed is
	# decided by the evaluation code, which is not part of this module.
	N_TEST_IMAGES = 50
	N_SPATIAL_SCENES = 30
	N_SIZE_OBJECTS = 20
	N_LATENCY_RUNS = 10

	# ── Assistive prompt templates ──
	# Default whole-scene prompt. Fragments are joined with a single space, so
	# no fragment carries its own trailing space.
	ASSISTIVE_PROMPT = " ".join(
	    (
	        "Describe this scene for a visually impaired person.",
	        "Use the depth measurements provided to give a spatial description:",
	        "how deep or large the space appears to be,",
	        "what objects are present and whether each one is to the left, centre, or right,",
	        "roughly how far away each object is,",
	        "and any navigation hazards or obstacles.",
	        "Be concise and natural, as if guiding someone who cannot see.",
	    )
	)

	# Query-mode name -> prompt text for that mode. "full_scene" reuses
	# ASSISTIVE_PROMPT rather than duplicating it.
	QUERY_MODES = {
	    "identify": "What is this object? Provide a one-sentence identification.",
	    "describe": (
	        "Describe this object for a visually impaired user: "
	        "material, condition, any visible text or branding, "
	        "and one practical detail."
	    ),
	    "spatial": (
	        "Describe the spatial arrangement of all objects in the scene using "
	        "the depth measurements provided. "
	        "Use directions relative to the viewer (ahead, left, right) "
	        "and include distances."
	    ),
	    "measure": (
	        "Using the estimated physical dimensions provided, "
	        "describe how large this object is. "
	        "Compare its size to a common everyday reference object "
	        "a visually impaired person would know by touch."
	    ),
	    "navigate": (
	        "Describe this scene as navigation guidance "
	        "for a visually impaired person. "
	        "Mention obstacles, clear paths, distances to key objects, "
	        "and any potential hazards like edges or steps."
	    ),
	    "full_scene": ASSISTIVE_PROMPT,
	}