import torch
from hydra import compose
from hydra.utils import instantiate
from omegaconf import OmegaConf
from torch import nn
from torch.nn import functional as F
from torchvision.ops import roi_align
from torchvision.transforms import Resize
from torch.cuda.amp import autocast

from utils.box_ops import boxes_with_scores
from .query_generator import C_base
from .sam_mask import MaskProcessor

class CNT(nn.Module):
    """Few-shot counting/detection head on top of a frozen SAM2 image encoder.

    Exemplar boxes are pooled into prototype embeddings that adapt the image
    features; a centerness head and a box-regression head then produce dense
    predictions that are decoded into scored boxes.
    """

    def __init__(
        self,
        image_size: int,
        num_objects: int,
        emb_dim: int,
        kernel_dim: int,
        reduction: int,
        zero_shot: bool,
    ):
        super().__init__()
        self.emb_dim = emb_dim
        self.num_objects = num_objects
        self.reduction = reduction
        self.kernel_dim = kernel_dim
        self.image_size = image_size
        self.zero_shot = zero_shot

        # Per-location objectness (centerness) score and box regressor.
        self.class_embed = nn.Sequential(nn.Linear(emb_dim, 1), nn.LeakyReLU())
        self.bbox_embed = MLP(emb_dim, emb_dim, 4, 3)

        # Attention module that adapts image features with exemplar prototypes.
        self.adapt_features = C_base(
            transformer_dim=self.emb_dim,
            num_prototype_attn_steps=3,
            num_image_attn_steps=2,
        )
        # Imported locally (e.g. to avoid a circular import at module load).
        from .prompt_encoder import PromptEncoder

        # Only queried for its dense positional encoding in forward().
        self.sam_prompt_encoder = PromptEncoder(
            embed_dim=self.emb_dim,
            image_embedding_size=(
                self.image_size // self.reduction,
                self.image_size // self.reduction,
            ),
            input_image_size=(self.image_size, self.image_size),
            mask_in_chans=16,
        )
        # Build the SAM2 Hiera-B+ image encoder from its Hydra config and load
        # the published checkpoint (encoder weights only, hence strict=False).
        config_name = '../configs/sam2_hiera_base_plus.yaml'
        cfg = compose(config_name=config_name)
        OmegaConf.resolve(cfg)
        self.backbone = instantiate(cfg.backbone, _recursive_=True)
        checkpoint = torch.hub.load_state_dict_from_url(
            'https://dl.fbaipublicfiles.com/segment_anything_2/072824/'
            + config_name.split('/')[-1].replace('.yaml', '.pt'),
            map_location="cpu",
        )['model']
        state_dict = {k.replace("image_encoder.", ""): v for k, v in checkpoint.items()}
        self.backbone.load_state_dict(state_dict, strict=False)
        # Encodes exemplar (width, height) into one shape token per object
        # (kernel_dim is fixed to 1 in forward, so 1**2 * emb_dim == emb_dim).
        self.shape_or_objectness = nn.Sequential(
            nn.Linear(2, 64),
            nn.ReLU(),
            nn.Linear(64, emb_dim),
            nn.ReLU(),
            nn.Linear(emb_dim, emb_dim),
        )
        self.resize = Resize((1024, 1024))
        self.sam_mask = MaskProcessor(self.emb_dim, self.image_size, reduction)
        self.sam_corr = True  # refine boxes with SAM-style mask processing
    def forward(self, x, bboxes):
        self.num_objects = bboxes.size(1)

        # Frozen SAM2 encoder: 'vision_features' is the coarsest feature map,
        # 'backbone_fpn' holds the higher-resolution FPN levels.
        with torch.no_grad():
            feats = self.backbone(x)
        src = feats['vision_features']
        bs, c, h, w = src.shape  # square feature map (h == w)
        self.reduction = 1024 / w

        # roi_align expects boxes as (batch_index, x1, y1, x2, y2) rows.
        batch_idx = torch.arange(bs, device=bboxes.device, dtype=bboxes.dtype)
        bboxes_roi = torch.cat([
            batch_idx.repeat_interleave(self.num_objects).reshape(-1, 1),
            bboxes.flatten(0, 1),
        ], dim=1)

        # Pool one appearance prototype per exemplar box from the top level.
        self.kernel_dim = 1
        exemplars = roi_align(
            src,
            boxes=bboxes_roi, output_size=self.kernel_dim,
            spatial_scale=1.0 / self.reduction, aligned=True,
        ).permute(0, 2, 3, 1).reshape(bs, self.num_objects * self.kernel_dim ** 2, self.emb_dim)
        # Same pooling on the two higher-resolution FPN levels (4x and 2x).
        l1 = feats['backbone_fpn'][0]
        l2 = feats['backbone_fpn'][1]
        exemplars_l1 = roi_align(
            l1,
            boxes=bboxes_roi, output_size=self.kernel_dim,
            spatial_scale=4.0 / self.reduction, aligned=True,
        ).permute(0, 2, 3, 1).reshape(bs, self.num_objects * self.kernel_dim ** 2, self.emb_dim)
        exemplars_l2 = roi_align(
            l2,
            boxes=bboxes_roi, output_size=self.kernel_dim,
            spatial_scale=2.0 / self.reduction, aligned=True,
        ).permute(0, 2, 3, 1).reshape(bs, self.num_objects * self.kernel_dim ** 2, self.emb_dim)
        # Encode exemplar box sizes (width, height) as shape tokens.
        box_hw = torch.zeros(bboxes.size(0), bboxes.size(1), 2, device=bboxes.device)
        box_hw[:, :, 0] = bboxes[:, :, 2] - bboxes[:, :, 0]
        box_hw[:, :, 1] = bboxes[:, :, 3] - bboxes[:, :, 1]
        shape = self.shape_or_objectness(box_hw).reshape(bs, -1, self.emb_dim)

        # Appearance prototypes + shape tokens, per feature level.
        prototype_embeddings = torch.cat([exemplars, shape], dim=1)
        prototype_embeddings_l1 = torch.cat([exemplars_l1, shape], dim=1)
        prototype_embeddings_l2 = torch.cat([exemplars_l2, shape], dim=1)
        hq_prototype_embeddings = [prototype_embeddings_l1, prototype_embeddings_l2]
        # The adaptation module runs in full precision.
        with autocast(enabled=False):
            if src.dtype != torch.float32:
                src = src.float()
                prototype_embeddings = prototype_embeddings.float()
                hq_prototype_embeddings = [hq.float() for hq in hq_prototype_embeddings]
                feats['backbone_fpn'] = [f.float() for f in feats['backbone_fpn']]
                feats['vision_pos_enc'] = [f.float() for f in feats['vision_pos_enc']]

            # Adapt the image features with the prototypes.
            adapted_f, adapted_f_aux = self.adapt_features(
                image_embeddings=src,
                image_pe=self.sam_prompt_encoder.get_dense_pe(),
                prototype_embeddings=prototype_embeddings,
                hq_features=feats['backbone_fpn'],
                hq_prototypes=hq_prototype_embeddings,
                hq_pos=feats['vision_pos_enc'],
            )
        # Dense heads: per-location centerness and box coordinates in [0, 1],
        # decoded into scored boxes.
        bs, c, h, w = adapted_f.shape
        adapted_f = adapted_f.view(bs, self.emb_dim, -1).permute(0, 2, 1)
        centerness = self.class_embed(adapted_f).view(bs, h, w, 1).permute(0, 3, 1, 2)
        outputs_coord = self.bbox_embed(adapted_f).sigmoid().view(bs, h, w, 4).permute(0, 3, 1, 2)
        outputs, ref_points = boxes_with_scores(centerness, outputs_coord, sort=False, validate=True)
        if self.sam_corr:
            # SAM-based refinement: mask IoUs replace the raw centerness
            # scores, and the mask-corrected boxes (normalized by image size)
            # replace the regressed ones.
            masks, ious, corrected_bboxes = self.sam_mask(feats, outputs)
            for i in range(len(outputs)):
                outputs[i]["scores"] = ious[i]
                outputs[i]["pred_boxes"] = (
                    corrected_bboxes[i].to(outputs[i]["pred_boxes"].device).unsqueeze(0)
                    / x.shape[-1]
                )
        else:
            masks = None  # keep the return signature consistent
            for i in range(len(outputs)):
                outputs[i]["scores"] = outputs[i]["box_v"]

        return outputs, ref_points, centerness, outputs_coord, masks

class MLP(nn.Module):
    """Very simple multi-layer perceptron (also called FFN)."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(
            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])
        )

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x

def build_model(args):
    assert args.reduction in [4, 8, 16]
    return CNT(
        image_size=args.image_size,
        num_objects=args.num_objects,
        zero_shot=args.zero_shot,
        emb_dim=args.emb_dim,
        kernel_dim=args.kernel_dim,
        reduction=args.reduction,
    )
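
# A minimal smoke-test sketch, not part of the original module: the argument
# values are illustrative assumptions (emb_dim=256 is assumed to match the
# SAM2 Hiera-B+ feature width), and the Hydra config_path below must be
# adjusted to the actual repo layout.
if __name__ == "__main__":
    from argparse import Namespace
    from hydra import initialize

    args = Namespace(
        image_size=1024, num_objects=3, zero_shot=False,
        emb_dim=256, kernel_dim=1, reduction=16,
    )
    # compose() inside CNT.__init__ needs an initialized Hydra search path.
    with initialize(version_base=None, config_path="."):
        model = build_model(args).eval()

    x = torch.rand(1, 3, 1024, 1024)  # one normalized 1024x1024 image
    # Exemplar boxes, shape (bs, num_objects, 4), in (x1, y1, x2, y2) pixels.
    bboxes = torch.tensor([[[100., 100., 180., 200.],
                            [300., 320., 360., 400.],
                            [500., 150., 560., 230.]]])
    with torch.no_grad():
        outputs, ref_points, centerness, coords, masks = model(x, bboxes)
    print(outputs[0]["pred_boxes"].shape, centerness.shape)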