# Copyright (c) Meta Platforms, Inc. and affiliates.
"""
Include all available vision encoder configurations.
"""
from dataclasses import dataclass, replace
from functools import partial
from typing import Callable, Optional, Sequence, Tuple, List
from huggingface_hub import hf_hub_download


def fetch_pe_checkpoint(name: str, path: Optional[str] = None) -> str:
    """Resolve a checkpoint path for `name`. Paths of the form
    "hf://<repo_id>:<filename>" are downloaded from the Hugging Face Hub;
    anything else is returned as-is."""
    path = path or f"hf://facebook/{name}:{name}.pt"
    if path.startswith("hf://"):
        # Download from the Hugging Face Hub and return the local cached path
        path = path[len("hf://"):]
        repo, file = path.split(":")
        return hf_hub_download(repo_id=repo, filename=file)
    else:
        return path
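
# Example usage (illustrative; the hf:// path requires network access):
#   fetch_pe_checkpoint("PE-Core-B16-224")
#       -> downloads PE-Core-B16-224.pt from the facebook/PE-Core-B16-224 repo
#          and returns the local cache path
#   fetch_pe_checkpoint("PE-Core-B16-224", "/my/local/checkpoint.pt")
#       -> returns "/my/local/checkpoint.pt" unchanged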


@dataclass
class PEConfig:
    """ Vision Tower Config. """

    patch_size: int
    width: int
    layers: int
    heads: int
    mlp_ratio: float
    output_dim: Optional[int]
    ls_init_value: Optional[float] = None
    drop_path: float = 0.0
    image_size: int = 224
    use_abs_posemb: bool = True
    use_cls_token: bool = False
    use_rope2d: bool = True
    pool_type: str = "attn"
    attn_pooler_heads: int = 8
    use_ln_pre: bool = True
    use_ln_post: bool = True


@dataclass
class PETextConfig:
    """ Text Tower Config. """

    context_length: int
    width: int
    heads: int
    layers: int
    output_dim: int
    mlp_ratio: float = 4.0
    vocab_size: int = 49408


PE_VISION_CONFIG = {}
PE_TEXT_CONFIG = {}
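

# Vision and text towers are registered below under a shared model name.
# The helper that follows is a hypothetical convenience (a sketch, not part
# of the original API) showing how a caller might look up a matching pair:
def get_pe_configs(name: str) -> Tuple[PEConfig, Optional[PETextConfig]]:
    return PE_VISION_CONFIG[name], PE_TEXT_CONFIG.get(name)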


#########################################
#                PE CORE                #
#########################################
PE_VISION_CONFIG["PE-Core-G14-448"] = PEConfig(
image_size=448,
patch_size=14,
width=1536,
layers=50,
heads=16,
mlp_ratio=8960 / 1536,
pool_type="attn",
output_dim=1280,
use_cls_token=False,
)
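
# At 448 px input with 14 px patches, this tower operates on a 32 x 32 grid
# (448 / 14 = 32), i.e. 1024 patch tokens and no class token.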
PE_TEXT_CONFIG["PE-Core-G14-448"] = PETextConfig(
context_length=72,
width=1280,
heads=20,
layers=24,
output_dim=1280
)
PE_VISION_CONFIG["PE-Core-L14-336"] = PEConfig(
image_size=336,
patch_size=14,
width=1024,
layers=24,
heads=16,
mlp_ratio=4.0,
pool_type="attn",
output_dim=1024,
use_cls_token=True,
)
PE_TEXT_CONFIG["PE-Core-L14-336"] = PETextConfig(
context_length=32,
width=1024,
heads=16,
layers=24,
output_dim=1024
)
PE_VISION_CONFIG["PE-Core-B16-224"] = PEConfig(
image_size=224,
patch_size=16,
width=768,
layers=12,
heads=12,
mlp_ratio=4.0,
pool_type="attn",
output_dim=1024,
use_cls_token=True,
)
PE_TEXT_CONFIG["PE-Core-B16-224"] = PE_TEXT_CONFIG["PE-Core-L14-336"]
PE_VISION_CONFIG["PE-Core-S16-384"] = PEConfig(
image_size=384,
patch_size=16,
width=384,
layers=12,
heads=6,
mlp_ratio=4.0,
pool_type="attn",
output_dim=512,
use_cls_token=True,
)
PE_TEXT_CONFIG["PE-Core-S16-384"] = PETextConfig(
context_length=32,
width=512,
heads=8,
layers=12,
output_dim=512
)
PE_VISION_CONFIG["PE-Core-T16-384"] = PEConfig(
image_size=384,
patch_size=16,
width=192,
layers=12,
heads=3,
mlp_ratio=4.0,
pool_type="attn",
output_dim=512,
use_cls_token=True,
)
PE_TEXT_CONFIG["PE-Core-T16-384"] = PE_TEXT_CONFIG["PE-Core-S16-384"]


#########################################
#                PE Lang                #
#########################################
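
# The PE-Lang towers reuse the Core vision configs, but drop pooling and the
# output projection, disable the final LayerNorm, enable LayerScale, and
# truncate the last transformer block(s) (G14: 50 -> 47 layers, L14: 24 -> 23)
# so the spatial features can feed a downstream language model.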
PE_VISION_CONFIG["PE-Lang-G14-448"] = replace(
PE_VISION_CONFIG["PE-Core-G14-448"],
image_size=448,
pool_type="none",
use_ln_post=False,
output_dim=None,
ls_init_value=0.1,
layers=47,
)
PE_VISION_CONFIG["PE-Lang-L14-448"] = replace(
PE_VISION_CONFIG["PE-Core-L14-336"],
image_size=448,
pool_type="none",
use_ln_post=False,
output_dim=None,
ls_init_value=0.1,
layers=23
)
# Stage 2 checkpoints for PLM-8B and PLM-3B respectively. Pretrained with tiling.
# Use these checkpoints if you're building a model that uses tiling downstream!
PE_VISION_CONFIG["PE-Lang-G14-448-Tiling"] = PE_VISION_CONFIG["PE-Lang-G14-448"]
PE_VISION_CONFIG["PE-Lang-L14-448-Tiling"] = PE_VISION_CONFIG["PE-Lang-L14-448"]


#########################################
#              PE Spatial               #
#########################################
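
# The PE-Spatial towers likewise reuse the Core configs with pooling and the
# output projection removed; all transformer layers are kept.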
PE_VISION_CONFIG["PE-Spatial-G14-448"] = replace(
PE_VISION_CONFIG["PE-Core-G14-448"],
image_size=448,
pool_type="none",
use_ln_post=False,
output_dim=None,
ls_init_value=0.1,
)
# No layerscale on the smaller spatial models
PE_VISION_CONFIG["PE-Spatial-L14-448"] = replace(
    PE_VISION_CONFIG["PE-Core-L14-336"],
    image_size=448,
    pool_type="none",
    use_ln_post=False,
    output_dim=None,
)

PE_VISION_CONFIG["PE-Spatial-B16-512"] = replace(
    PE_VISION_CONFIG["PE-Core-B16-224"],
    image_size=512,
    pool_type="none",
    use_ln_post=False,
    output_dim=None,
)

PE_VISION_CONFIG["PE-Spatial-S16-512"] = replace(
    PE_VISION_CONFIG["PE-Core-S16-384"],
    image_size=512,
    pool_type="none",
    use_ln_post=False,
    output_dim=None,
)

PE_VISION_CONFIG["PE-Spatial-T16-512"] = replace(
    PE_VISION_CONFIG["PE-Core-T16-384"],
    image_size=512,
    pool_type="none",
    use_ln_post=False,
    output_dim=None,
)
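

# Minimal smoke test (a sketch, not part of the original module): running the
# file directly lists every registered config for a quick audit.
if __name__ == "__main__":
    for model_name, cfg in PE_VISION_CONFIG.items():
        has_text = " (+ text tower)" if model_name in PE_TEXT_CONFIG else ""
        print(
            f"{model_name}: {cfg.layers} layers, width {cfg.width}, "
            f"{cfg.image_size}px input{has_text}"
        )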