| |
|
|
| from dataclasses import dataclass |
| from typing import Optional, Tuple, Union |
|
|
| import numpy as np |
| import torch |
| import torch.nn.functional as F |
| from torch import nn |
| from functools import partial |
|
|
| from .timm_model import TimmModel |
| from .transformer import ( |
| LayerNormFp32, |
| LayerNorm, |
| QuickGELU, |
| TextTransformer, |
| text_global_pool, |
| ) |
| from .utils import to_2tuple |
|
|
|
|
@dataclass
class CLIPVisionCfg:
    """Configuration for the vision tower.

    NOTE(review): in this module only the timm path is supported (see
    `_build_vision_tower`), which consumes the `timm_*` fields plus
    `patch_dropout`, `ls_init_value`, `qk_norm`, `image_size` and
    `output_tokens`; the remaining fields are kept for config-file
    compatibility with the non-timm VisionTransformer path — confirm
    before removing any.
    """

    # Backbone geometry.
    layers: Union[Tuple[int, int, int, int], int] = 12
    width: int = 768
    head_width: int = 64
    mlp_ratio: float = 4.0
    patch_size: int = 16
    image_size: Union[Tuple[int, int], int] = 224

    # Regularization / pooling / normalization options.
    ls_init_value: Optional[float] = None  # LayerScale init; None disables (passed to TimmModel as init_values)
    patch_dropout: float = 0.0  # fraction of patches to drop; <= 0 disables (mapped to None for TimmModel)
    attentional_pool: bool = False
    attn_pooler_queries: int = 256
    attn_pooler_heads: int = 8
    no_ln_pre: bool = False
    pos_embed_type: str = "learnable"
    final_ln_after_pool: bool = False
    pool_type: str = "tok"
    output_tokens: bool = False  # if True, TimmModel also returns patch tokens
    act_kwargs: Optional[dict] = None
    norm_kwargs: Optional[dict] = None

    # Attention/block variants.
    block_type: Optional[str] = None
    qk_norm: bool = False
    scaled_cosine_attn: bool = False
    scale_heads: bool = False
    scale_attn_inner: bool = False
    scale_attn: bool = False
    scale_fc: bool = False

    # timm backbone selection and options (the only supported path here).
    timm_model_name: Optional[str] = None  # required by _build_vision_tower
    timm_model_pretrained: bool = False
    timm_pool: str = "avg"
    timm_proj: str = "linear"
    timm_proj_bias: bool = False
    timm_drop: float = 0.0
    timm_drop_path: Optional[float] = None
    timm_use_rope: bool = False
    timm_rope_keep_ape: bool = False
    timm_dynamic_img_size: bool = False
    timm_norm_pre: bool = False
|
|
|
|
@dataclass
class CLIPTextCfg:
    """Configuration for the text tower.

    Either the built-in `TextTransformer` fields or the `hf_*` fields
    (HuggingFace text encoder) may be used, depending on the builder.
    """

    # Tokenization.
    context_length: int = 77
    vocab_size: int = 49408
    hf_tokenizer_name: Optional[str] = None
    tokenizer_mode: Optional[str] = None
    tokenizer_kwargs: Optional[dict] = None

    # TextTransformer geometry and options.
    width: int = 512
    heads: int = 8
    layers: int = 12
    mlp_ratio: float = 4.0
    ls_init_value: Optional[float] = None  # LayerScale init; None disables
    embed_cls: bool = False
    pad_id: int = 0
    eos_id: int = 2
    no_causal_mask: bool = False
    final_ln_after_pool: bool = False
    pool_type: str = "argmax"
    proj_bias: bool = False
    proj_type: str = "linear"
    output_tokens: bool = False
    # Fixed annotations: these default to None, so they are Optional[dict]
    # (matching CLIPVisionCfg), not plain dict.
    act_kwargs: Optional[dict] = None
    norm_kwargs: Optional[dict] = None

    # Attention/block variants.
    block_type: Optional[str] = None
    qk_norm: bool = False
    scaled_cosine_attn: bool = False
    scale_heads: bool = False
    scale_attn_inner: bool = False
    scale_attn: bool = False
    scale_fc: bool = False

    # HuggingFace text-encoder alternative.
    hf_model_name: Optional[str] = None
    hf_model_pretrained: bool = True
    hf_proj_type: str = "mlp"
    hf_pooler_type: str = "mean_pooler"
|
|
|
|
def get_cast_dtype(precision: str) -> Optional[torch.dtype]:
    """Map a precision string to the torch dtype activations are cast to.

    Returns ``torch.bfloat16`` for ``"bf16"``, ``torch.float16`` for
    ``"fp16"``, and ``None`` (no casting) for any other value.
    """
    return {
        "bf16": torch.bfloat16,
        "fp16": torch.float16,
    }.get(precision)
|
|
|
|
def _build_vision_tower(
    embed_dim: int,
    vision_cfg: CLIPVisionCfg,
    quick_gelu: bool = False,
    cast_dtype: Optional[torch.dtype] = None,
):
    """Build a timm-backed vision tower from ``vision_cfg``.

    ``quick_gelu`` and ``cast_dtype`` are accepted for interface parity with
    the text-tower builder; the timm path does not consume them here.
    """
    if isinstance(vision_cfg, dict):
        vision_cfg = CLIPVisionCfg(**vision_cfg)

    # Only the timm backbone path is supported by this builder.
    if not vision_cfg.timm_model_name:
        raise ValueError(
            "Only TimmModel-based vision towers are supported in raon-vision-encoder. "
            "Please set timm_model_name in vision_cfg."
        )

    # TimmModel expects None (not 0.0) when patch dropout is disabled.
    patch_drop_rate = vision_cfg.patch_dropout
    if not patch_drop_rate > 0:
        patch_drop_rate = None

    return TimmModel(
        vision_cfg.timm_model_name,
        pretrained=vision_cfg.timm_model_pretrained,
        pool=vision_cfg.timm_pool,
        proj=vision_cfg.timm_proj,
        proj_bias=vision_cfg.timm_proj_bias,
        drop=vision_cfg.timm_drop,
        drop_path=vision_cfg.timm_drop_path,
        patch_drop=patch_drop_rate,
        init_values=vision_cfg.ls_init_value,
        qk_norm=vision_cfg.qk_norm,
        use_rope=vision_cfg.timm_use_rope,
        rope_keep_ape=vision_cfg.timm_rope_keep_ape,
        dynamic_img_size=vision_cfg.timm_dynamic_img_size,
        norm_pre=vision_cfg.timm_norm_pre,
        embed_dim=embed_dim,
        image_size=vision_cfg.image_size,
        output_tokens=vision_cfg.output_tokens,
    )
|
|
|
|
def _build_text_tower(
    embed_dim: int,
    text_cfg: CLIPTextCfg,
    quick_gelu: bool = False,
    cast_dtype: Optional[torch.dtype] = None,
):
    """Build a ``TextTransformer`` text tower from ``text_cfg``.

    Args:
        embed_dim: Output embedding dimension (the tower's projection dim).
        text_cfg: Text tower config (dict form is converted to ``CLIPTextCfg``).
        quick_gelu: Use ``QuickGELU`` instead of ``nn.GELU`` as activation.
        cast_dtype: When fp16/bf16, use ``LayerNormFp32`` so LayerNorm
            statistics are computed in float32.
    """
    if isinstance(text_cfg, dict):
        text_cfg = CLIPTextCfg(**text_cfg)

    act_layer = QuickGELU if quick_gelu else nn.GELU
    norm_layer = (
        LayerNormFp32 if cast_dtype in (torch.float16, torch.bfloat16) else LayerNorm
    )
    # Fixed inconsistency: the original tested `norm_kwargs` by truthiness but
    # `act_kwargs` against None. Use an explicit None check for both (an empty
    # dict produces an equivalent partial, so behavior is preserved).
    if text_cfg.norm_kwargs is not None:
        norm_layer = partial(norm_layer, **text_cfg.norm_kwargs)
    if text_cfg.act_kwargs is not None:
        act_layer = partial(act_layer, **text_cfg.act_kwargs)

    text = TextTransformer(
        context_length=text_cfg.context_length,
        vocab_size=text_cfg.vocab_size,
        width=text_cfg.width,
        heads=text_cfg.heads,
        layers=text_cfg.layers,
        mlp_ratio=text_cfg.mlp_ratio,
        ls_init_value=text_cfg.ls_init_value,
        output_dim=embed_dim,
        embed_cls=text_cfg.embed_cls,
        no_causal_mask=text_cfg.no_causal_mask,
        pad_id=text_cfg.pad_id,
        eos_id=text_cfg.eos_id,
        pool_type=text_cfg.pool_type,
        proj_type=text_cfg.proj_type,
        proj_bias=text_cfg.proj_bias,
        output_tokens=text_cfg.output_tokens,
        act_layer=act_layer,
        norm_layer=norm_layer,
        block_type=text_cfg.block_type,
        qk_norm=text_cfg.qk_norm,
        scaled_cosine_attn=text_cfg.scaled_cosine_attn,
        scale_heads=text_cfg.scale_heads,
        scale_attn_inner=text_cfg.scale_attn_inner,
        scale_attn=text_cfg.scale_attn,
        scale_fc=text_cfg.scale_fc,
    )
    return text
|
|
|
|
class CustomTextCLIP(nn.Module):
    """CLIP-style dual-encoder pairing a timm vision tower with a text tower.

    Image and text features are embedded into a shared space of size
    ``embed_dim``; similarity is scaled by a learned ``logit_scale`` (and an
    optional learned ``logit_bias``, SigLIP-style).
    """

    # Declared Final so TorchScript treats the flag as a compile-time constant.
    output_dict: torch.jit.Final[bool]

    def __init__(
        self,
        embed_dim: int,
        vision_cfg: CLIPVisionCfg,
        text_cfg: CLIPTextCfg,
        quick_gelu: bool = False,
        init_logit_scale: float = np.log(1 / 0.07),  # CLIP's default temperature (1/0.07)
        init_logit_bias: Optional[float] = None,
        nonscalar_logit_scale: bool = False,
        cast_dtype: Optional[torch.dtype] = None,
        output_dict: bool = False,
    ):
        """Build the vision and text towers and the logit scale/bias.

        Args:
            embed_dim: Shared embedding dimension of both towers.
            vision_cfg: Vision tower config (dataclass or dict).
            text_cfg: Text tower config (dataclass or dict).
            quick_gelu: Use QuickGELU activation in the text tower.
            init_logit_scale: Initial value of the (log-space) logit scale.
            init_logit_bias: If not None, adds a learnable additive logit bias.
            nonscalar_logit_scale: Store scale/bias as shape-(1,) tensors
                instead of scalars.
            cast_dtype: Half-precision dtype used to pick fp32 LayerNorm
                in the text tower.
            output_dict: If True, forward() returns a dict instead of a tuple.
        """
        super().__init__()
        self.output_dict = output_dict
        self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype)
        self.text = _build_text_tower(embed_dim, text_cfg, quick_gelu, cast_dtype)
        # Mirror text-tower attributes for convenient external access.
        self.context_length = self.text.context_length
        self.vocab_size = self.text.vocab_size

        # Scale (and optional bias) are learned; stored in log space for scale.
        lshape = [1] if nonscalar_logit_scale else []
        self.logit_scale = nn.Parameter(torch.ones(lshape) * init_logit_scale)
        if init_logit_bias is not None:
            self.logit_bias = nn.Parameter(torch.ones(lshape) * init_logit_bias)
        else:
            self.logit_bias = None

    def encode_image(
        self, pixel_values, normalize: bool = False, pixel_attention_mask=None, spatial_shapes=None
    ):
        """Encode images to embeddings; L2-normalize when ``normalize`` is True.

        ``pixel_attention_mask`` is forwarded to the vision tower as
        ``patch_valid_mask`` and ``spatial_shapes`` as-is; both are only
        passed when provided, so towers that don't accept them still work.
        NOTE(review): expected shapes/semantics of these extras depend on the
        TimmModel implementation — confirm against that tower.
        """
        kwargs = {}
        if pixel_attention_mask is not None:
            kwargs["patch_valid_mask"] = pixel_attention_mask
        if spatial_shapes is not None:
            kwargs["spatial_shapes"] = spatial_shapes
        features = self.visual(pixel_values, **kwargs) if kwargs else self.visual(pixel_values)
        return F.normalize(features, dim=-1) if normalize else features

    def encode_text(self, input_ids, normalize: bool = False):
        """Encode token ids to embeddings; L2-normalize when ``normalize`` is True."""
        features = self.text(input_ids)
        return F.normalize(features, dim=-1) if normalize else features

    def get_logits(self, image, text):
        """Return (image->text, text->image) similarity logits.

        Features are normalized, scaled by ``exp(logit_scale)``, and shifted
        by ``logit_bias`` when present; the text logits are the transpose.
        """
        image_features = self.encode_image(pixel_values=image, normalize=True)
        text_features = self.encode_text(input_ids=text, normalize=True)
        image_logits = self.logit_scale.exp() * image_features @ text_features.T
        if self.logit_bias is not None:
            image_logits += self.logit_bias
        text_logits = image_logits.T
        return image_logits, text_logits

    def forward(
        self, image=None, text=None, patch_valid_mask=None, spatial_shapes=None
    ):
        """Encode whichever of ``image``/``text`` is given (normalized).

        Returns a dict (``image_features``, ``text_features``, ``logit_scale``,
        and ``logit_bias`` when present) if ``output_dict`` is set, otherwise
        the corresponding tuple. A missing modality yields ``None`` features.
        """
        image_features = (
            self.encode_image(
                pixel_values=image,
                normalize=True,
                pixel_attention_mask=patch_valid_mask,
                spatial_shapes=spatial_shapes,
            )
            if image is not None
            else None
        )
        text_features = (
            self.encode_text(input_ids=text, normalize=True) if text is not None else None
        )

        if self.output_dict:
            out_dict = {
                "image_features": image_features,
                "text_features": text_features,
                "logit_scale": self.logit_scale.exp(),
            }
            if self.logit_bias is not None:
                out_dict["logit_bias"] = self.logit_bias
            return out_dict

        if self.logit_bias is not None:
            return (
                image_features,
                text_features,
                self.logit_scale.exp(),
                self.logit_bias,
            )
        return image_features, text_features, self.logit_scale.exp()
|
|