Spaces:
Sleeping
Sleeping
| """ | |
| Central configuration for the Depth-Aware Scene Description system. | |
| All constants, paths, and hyperparameters live here. | |
| """ | |
| from pathlib import Path | |
# ── Paths ──
# Every location below is derived from PROJECT_ROOT, which is the parent of
# the directory that contains this file.
PROJECT_ROOT: Path = Path(__file__).parent.parent

# Data tree: <root>/data and its sub-directories.
DATA_DIR: Path = PROJECT_ROOT / "data"
TEST_IMAGES_DIR: Path = DATA_DIR / "test_images"
GROUND_TRUTH_DIR: Path = DATA_DIR / "ground_truth"
REFERENCES_DIR: Path = DATA_DIR / "references"

# Output tree: <root>/outputs and its results sub-directory.
OUTPUTS_DIR: Path = PROJECT_ROOT / "outputs"
RESULTS_DIR: Path = OUTPUTS_DIR / "results"

# Lives under the data tree; the name suggests a scratch area for ARKitScenes
# files — NOTE(review): confirm against the code that writes to it.
ARKITSCENES_TMP_DIR: Path = DATA_DIR / "_arkit_tmp"
# ── Camera ──
# Horizontal field of view in degrees; 70 is a typical smartphone value.
HFOV_DEG: int = 70

# Bounds of the mapped depth range, both in centimetres.
DEPTH_MIN_CM: int = 20  # minimum mapped depth
DEPTH_MAX_CM: int = 200  # maximum mapped depth
# ── Models ──
# Weight files and model identifiers used by the pipeline.
YOLO_MODEL: str = "yolov8n.pt"
DEPTH_MODEL: str = "depth-anything/Depth-Anything-V2-Small-hf"

# MobileSAM checkpoint filename and the URL it can be fetched from.
MOBILE_SAM_CKPT: str = "mobile_sam.pt"
MOBILE_SAM_URL: str = (
    "https://github.com/ChaoningZhang/MobileSAM/raw/master/weights/mobile_sam.pt"
)

# Moondream (Colab T4) — identifier plus a pinned revision string.
MOONDREAM_ID: str = "vikhyatk/moondream2"
MOONDREAM_REVISION: str = "2025-06-21"

# Qwen (RTX 5060)
# The pixel budgets are expressed as token counts times a 28 x 28 pixel area,
# so 256 and 512 below are the minimum and maximum token counts.
_QWEN_PIXELS_PER_TOKEN: int = 28 * 28
QWEN_ID: str = "Qwen/Qwen2.5-VL-3B-Instruct"
QWEN_MIN_PIXELS: int = 256 * _QWEN_PIXELS_PER_TOKEN  # 256 tokens
QWEN_MAX_PIXELS: int = 512 * _QWEN_PIXELS_PER_TOKEN  # 512 tokens

# Gemma 4 E2B IT (production — requires transformers>=5.5.0)
GEMMA4_ID: str = "google/gemma-4-E2B-it"
GEMMA4_MAX_NEW_TOKENS: int = 300
# ── Detection ──
# Confidence cut-off for the detection stage.
# NOTE(review): presumably detections scoring below this are dropped — confirm
# against the code that consumes it.
CONF_THRESHOLD: float = 0.3
# ── Evaluation ──
# Sample and repetition counts for the evaluation runs.
# NOTE(review): the meaning of each count is inferred from its name only —
# confirm against the evaluation scripts.
N_TEST_IMAGES: int = 50
N_SPATIAL_SCENES: int = 30
N_SIZE_OBJECTS: int = 20
N_LATENCY_RUNS: int = 10
# ── Assistive prompt templates ──
# Fragments of the default full-scene prompt. Each fragment carries its own
# trailing space, so they are concatenated with an empty separator.
_ASSISTIVE_PROMPT_PARTS = (
    "Describe this scene for a visually impaired person. ",
    "Use the depth measurements provided to give a spatial description: ",
    "how deep or large the space appears to be, ",
    "what objects are present and whether each one is to the left, centre, or right, ",
    "roughly how far away each object is, ",
    "and any navigation hazards or obstacles. ",
    "Be concise and natural, as if guiding someone who cannot see.",
)
ASSISTIVE_PROMPT = "".join(_ASSISTIVE_PROMPT_PARTS)

# One prompt per query mode, named individually and collected in QUERY_MODES.
_IDENTIFY_PROMPT = "What is this object? Provide a one-sentence identification."
_DESCRIBE_PROMPT = (
    "Describe this object for a visually impaired user: material, "
    "condition, any visible text or branding, and one practical detail."
)
_SPATIAL_PROMPT = (
    "Describe the spatial arrangement of all objects in the scene "
    "using the depth measurements provided. Use directions relative to "
    "the viewer (ahead, left, right) and include distances."
)
_MEASURE_PROMPT = (
    "Using the estimated physical dimensions provided, describe how "
    "large this object is. Compare its size to a common everyday "
    "reference object a visually impaired person would know by touch."
)
_NAVIGATE_PROMPT = (
    "Describe this scene as navigation guidance for a visually impaired "
    "person. Mention obstacles, clear paths, distances to key objects, "
    "and any potential hazards like edges or steps."
)

# Maps a query-mode name to its prompt text; "full_scene" reuses
# ASSISTIVE_PROMPT unchanged.
QUERY_MODES = {
    "identify": _IDENTIFY_PROMPT,
    "describe": _DESCRIBE_PROMPT,
    "spatial": _SPATIAL_PROMPT,
    "measure": _MEASURE_PROMPT,
    "navigate": _NAVIGATE_PROMPT,
    "full_scene": ASSISTIVE_PROMPT,
}