""" Central configuration for the Depth-Aware Scene Description system. All constants, paths, and hyperparameters live here. """ from pathlib import Path # ── Paths ── PROJECT_ROOT = Path(__file__).parent.parent DATA_DIR = PROJECT_ROOT / "data" TEST_IMAGES_DIR = DATA_DIR / "test_images" GROUND_TRUTH_DIR = DATA_DIR / "ground_truth" REFERENCES_DIR = DATA_DIR / "references" OUTPUTS_DIR = PROJECT_ROOT / "outputs" RESULTS_DIR = OUTPUTS_DIR / "results" ARKITSCENES_TMP_DIR = DATA_DIR / "_arkit_tmp" # ── Camera ── HFOV_DEG = 70 # typical smartphone horizontal field of view DEPTH_MIN_CM = 20 # minimum mapped depth (cm) DEPTH_MAX_CM = 200 # maximum mapped depth (cm) # ── Models ── YOLO_MODEL = "yolov8n.pt" DEPTH_MODEL = "depth-anything/Depth-Anything-V2-Small-hf" MOBILE_SAM_CKPT = "mobile_sam.pt" MOBILE_SAM_URL = "https://github.com/ChaoningZhang/MobileSAM/raw/master/weights/mobile_sam.pt" # Moondream (Colab T4) MOONDREAM_ID = "vikhyatk/moondream2" MOONDREAM_REVISION = "2025-06-21" # Qwen (RTX 5060) QWEN_ID = "Qwen/Qwen2.5-VL-3B-Instruct" QWEN_MIN_PIXELS = 256 * 28 * 28 # 256 tokens QWEN_MAX_PIXELS = 512 * 28 * 28 # 512 tokens # Gemma 4 E2B IT (production — requires transformers>=5.5.0) GEMMA4_ID = "google/gemma-4-E2B-it" GEMMA4_MAX_NEW_TOKENS = 300 # ── Detection ── CONF_THRESHOLD = 0.3 # ── Evaluation ── N_TEST_IMAGES = 50 N_SPATIAL_SCENES = 30 N_SIZE_OBJECTS = 20 N_LATENCY_RUNS = 10 # ── Assistive prompt templates ── ASSISTIVE_PROMPT = ( "Describe this scene for a visually impaired person. " "Use the depth measurements provided to give a spatial description: " "how deep or large the space appears to be, " "what objects are present and whether each one is to the left, centre, or right, " "roughly how far away each object is, " "and any navigation hazards or obstacles. " "Be concise and natural, as if guiding someone who cannot see." ) QUERY_MODES = { "identify": "What is this object? Provide a one-sentence identification.", "describe": ( "Describe this object for a visually impaired user: material, " "condition, any visible text or branding, and one practical detail." ), "spatial": ( "Describe the spatial arrangement of all objects in the scene " "using the depth measurements provided. Use directions relative to " "the viewer (ahead, left, right) and include distances." ), "measure": ( "Using the estimated physical dimensions provided, describe how " "large this object is. Compare its size to a common everyday " "reference object a visually impaired person would know by touch." ), "navigate": ( "Describe this scene as navigation guidance for a visually impaired " "person. Mention obstacles, clear paths, distances to key objects, " "and any potential hazards like edges or steps." ), "full_scene": ASSISTIVE_PROMPT, }