{ "model_type": "foveated_vlm", "architectures": [ "FoveatedVLM" ], "llm_name": "HuggingFaceTB/SmolLM2-1.7B-Instruct", "dino_name": "facebook/dinov2-small", "llm_dim": 2048, "dino_dim": 384, "query_dim": 384, "visual_scale": 0.14, "lambda_coarse": 0.0, "deep_query": true, "total_params": 1835967616, "training_stages": [ "Stage 1: Visual Alignment (4.3h, 31250 steps)", "Stage 2: Vision-Language SFT (9.5h, 31250 steps)", "Stage 3: DPO (1.9h, 2593 steps)" ] }