fVLM-135M / config.json

Upload fVLM-135M: Foveated Vision-Language Model (Stage 3 DPO)

6d320d6 verified 8 days ago

542 Bytes

{
  "model_type": "foveated_vlm",
  "architectures": [
    "FoveatedVLM"
  ],
  "llm_name": "HuggingFaceTB/SmolLM2-135M-Instruct",
  "dino_name": "facebook/dinov2-small",
  "llm_dim": 576,
  "dino_dim": 384,
  "query_dim": 384,
  "visual_scale": 0.14,
  "lambda_coarse": 0.0,
  "deep_query": true,
  "total_params": 185622528,
  "training_stages": [
    "Stage 1: Visual Alignment (OpenVid + WebVid + text retention)",
    "Stage 2: Vision-Language SFT (Cauldron + video + text retention)",
    "Stage 3: DPO (RLAIF-V preference pairs)"
  ]
}