fVLM-135M / config.json
sanps's picture
Upload fVLM-135M: Foveated Vision-Language Model (Stage 3 DPO)
6d320d6 verified
raw
history blame contribute delete
542 Bytes
{
"model_type": "foveated_vlm",
"architectures": [
"FoveatedVLM"
],
"llm_name": "HuggingFaceTB/SmolLM2-135M-Instruct",
"dino_name": "facebook/dinov2-small",
"llm_dim": 576,
"dino_dim": 384,
"query_dim": 384,
"visual_scale": 0.14,
"lambda_coarse": 0.0,
"deep_query": true,
"total_params": 185622528,
"training_stages": [
"Stage 1: Visual Alignment (OpenVid + WebVid + text retention)",
"Stage 2: Vision-Language SFT (Cauldron + video + text retention)",
"Stage 3: DPO (RLAIF-V preference pairs)"
]
}