File size: 2,995 Bytes
5412d82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""
Central configuration for the Depth-Aware Scene Description system.
All constants, paths, and hyperparameters live here.
"""
from pathlib import Path

# ── Paths ──
# Anchor directory: two levels above the directory entry of this file.
PROJECT_ROOT = Path(__file__).parent.parent

# Input data tree, all rooted at PROJECT_ROOT/data.
DATA_DIR = PROJECT_ROOT.joinpath("data")
TEST_IMAGES_DIR = DATA_DIR.joinpath("test_images")
GROUND_TRUTH_DIR = DATA_DIR.joinpath("ground_truth")
REFERENCES_DIR = DATA_DIR.joinpath("references")
ARKITSCENES_TMP_DIR = DATA_DIR.joinpath("_arkit_tmp")

# Output tree, rooted at PROJECT_ROOT/outputs.
OUTPUTS_DIR = PROJECT_ROOT.joinpath("outputs")
RESULTS_DIR = OUTPUTS_DIR.joinpath("results")

# ── Camera ──
# Camera assumptions. NOTE(review): the code that consumes these values is not
# visible in this file — confirm how they are applied at the call sites.
HFOV_DEG = 70          # typical smartphone horizontal field of view (degrees)
DEPTH_MIN_CM = 20      # minimum mapped depth (cm)
DEPTH_MAX_CM = 200     # maximum mapped depth (cm)

# ── Models ──
# Detector weights filename.
# NOTE(review): the loader is not visible here — confirm how this name is resolved.
YOLO_MODEL = "yolov8n.pt"
# "<org>/<name>"-style identifier for the depth estimation model.
DEPTH_MODEL = "depth-anything/Depth-Anything-V2-Small-hf"
# MobileSAM checkpoint filename, and a URL whose final path component is the
# same filename (presumably the download source — confirm at the call site).
MOBILE_SAM_CKPT = "mobile_sam.pt"
MOBILE_SAM_URL = "https://github.com/ChaoningZhang/MobileSAM/raw/master/weights/mobile_sam.pt"

# Moondream (Colab T4)
MOONDREAM_ID = "vikhyatk/moondream2"
# Date-formatted revision string; presumably pins the model revision — confirm.
MOONDREAM_REVISION = "2025-06-21"

# Qwen (RTX 5060)
QWEN_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
# Pixel budgets written as <token count> * 28 * 28.
QWEN_MIN_PIXELS = 256 * 28 * 28   # 256 tokens (= 200,704 pixels)
QWEN_MAX_PIXELS = 512 * 28 * 28   # 512 tokens (= 401,408 pixels)

# Gemma 4 E2B IT (production — requires transformers>=5.5.0)
GEMMA4_ID = "google/gemma-4-E2B-it"
# NOTE(review): presumably the generation length cap for Gemma — confirm at the call site.
GEMMA4_MAX_NEW_TOKENS = 300

# ── Detection ──
# Confidence cutoff for detection.
# NOTE(review): presumably detections scoring below this are dropped — confirm
# at the detector call site, which is not visible in this file.
CONF_THRESHOLD = 0.3

# ── Evaluation ──
# Sample counts for the evaluation runs.
# NOTE(review): the names suggest test images, spatial scenes, size-measured
# objects, and latency repetitions; the consumers are not visible here — confirm.
N_TEST_IMAGES = 50
N_SPATIAL_SCENES = 30
N_SIZE_OBJECTS = 20
N_LATENCY_RUNS = 10

# ── Assistive prompt templates ──
# Prompt asking for a whole-scene description. The text is stored as
# fragments that are joined with single spaces into one string.
ASSISTIVE_PROMPT = " ".join(
    (
        "Describe this scene for a visually impaired person.",
        "Use the depth measurements provided to give a spatial description:",
        "how deep or large the space appears to be,",
        "what objects are present and whether each one is to the left, centre, or right,",
        "roughly how far away each object is,",
        "and any navigation hazards or obstacles.",
        "Be concise and natural, as if guiding someone who cannot see.",
    )
)

# Prompt text keyed by query-mode name. The "full_scene" entry reuses
# ASSISTIVE_PROMPT rather than duplicating its text.
QUERY_MODES = dict(
    identify="What is this object? Provide a one-sentence identification.",
    describe=(
        "Describe this object for a visually impaired user: "
        "material, condition, any visible text or branding, "
        "and one practical detail."
    ),
    spatial=(
        "Describe the spatial arrangement of all objects in the scene using "
        "the depth measurements provided. "
        "Use directions relative to the viewer (ahead, left, right) "
        "and include distances."
    ),
    measure=(
        "Using the estimated physical dimensions provided, "
        "describe how large this object is. "
        "Compare its size to a common everyday reference object "
        "a visually impaired person would know by touch."
    ),
    navigate=(
        "Describe this scene as navigation guidance for a visually impaired person. "
        "Mention obstacles, clear paths, distances to key objects, "
        "and any potential hazards like edges or steps."
    ),
    full_scene=ASSISTIVE_PROMPT,
)