{
  "model_type": "foveated_vlm",
  "architectures": [
    "FoveatedVLM"
  ],
  "llm_name": "HuggingFaceTB/SmolLM2-135M-Instruct",
  "dino_name": "facebook/dinov2-small",
  "llm_dim": 576,
  "dino_dim": 384,
  "query_dim": 384,
  "visual_scale": 0.14,
  "lambda_coarse": 0.0,
  "deep_query": true,
  "total_params": 185622528,
  "training_stages": [
    "Stage 1: Visual Alignment (OpenVid + WebVid + text retention)",
    "Stage 2: Vision-Language SFT (Cauldron + video + text retention)",
    "Stage 3: DPO (RLAIF-V preference pairs)"
  ]
}