Upload loss.py

a56c7bf about 2 months ago

8.38 kB

	import warnings
	warnings.filterwarnings("ignore")
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import numpy as np
	import random

	def set_random_seed(seed):
	    """Seed the random number generators used by this module.

	    The same ``seed`` is handed, in this order, to Python's ``random``
	    module, NumPy's global generator, ``torch.manual_seed`` and
	    ``torch.cuda.manual_seed_all``.
	    """
	    seeders = (
	        random.seed,
	        np.random.seed,
	        torch.manual_seed,
	        torch.cuda.manual_seed_all,
	    )
	    for seeder in seeders:
	        seeder(seed)

	class Identity(nn.Module):
	    """Pass-through module: ``forward`` returns its argument unchanged."""

	    def __init__(self):
	        super(Identity, self).__init__()

	    def forward(self, x):
	        # No parameters and no computation: the very same object is handed back.
	        return x

	class CLIPLoss(nn.Module):
	    """Symmetric image/text contrastive loss with optional row-equality masking.

	    The loss is the mean of two cross-entropies (image->text and text->image)
	    over ``logit_scale * image_features @ text_features.T``, with the i-th
	    image matched to the i-th text.

	    Args:
	        args: Configuration object; only ``args.learnable_logit_scale`` is
	            read here.
	        logit_scale: Tensor holding the initial logit scale. It is cloned and
	            detached, then stored as an ``nn.Parameter`` when
	            ``args.learnable_logit_scale`` is truthy, otherwise as a buffer.
	    """

	    def __init__(self, args, logit_scale):
	        super().__init__()
	        self.args = args
	        scale = logit_scale.clone().detach()
	        if args.learnable_logit_scale:
	            self.logit_scale = nn.Parameter(scale)
	        else:
	            self.register_buffer('logit_scale', scale)

	    @staticmethod
	    def _keep_mask(merged_df, indices, device):
	        """Return a (batch, batch) bool tensor that is False for masked pairs.

	        Two samples are masked against each other when their ``merged_df``
	        rows agree on every column from position 2 onwards (presumably the
	        label columns -- confirm against the caller). The diagonal is always
	        kept so each sample retains its own positive.
	        """
	        if indices is None:
	            raise ValueError("indices must be provided when merged_df is given")
	        if torch.is_tensor(indices):
	            # pandas positional indexing cannot consume (GPU) tensors directly.
	            indices = indices.detach().cpu().numpy()
	        rows = merged_df.iloc[indices, 2:].to_numpy()
	        identical = np.asarray(
	            (rows[:, None, :] == rows[None, :, :]).all(axis=2), dtype=bool
	        )
	        keep = ~identical
	        np.fill_diagonal(keep, True)
	        return torch.from_numpy(keep).to(device)

	    def forward(self, image_features, text_features, merged_df=None, indices=None):
	        """Compute the contrastive loss for one batch.

	        Args:
	            image_features: 2-D tensor ``(batch, dim)``; a non-2-D input fails
	                at the size unpacking below.
	            text_features: Tensor multiplied (transposed) against
	                ``image_features``; row i is the positive for image i.
	            merged_df: Optional pandas DataFrame. When given, pairs whose rows
	                are identical from column 2 onwards are excluded from the
	                softmax denominator (their logits are set to ``-inf``).
	            indices: Positions of the batch samples in ``merged_df`` (list,
	                array or tensor). Required when ``merged_df`` is given.

	        Returns:
	            Scalar tensor: mean of the image->text and text->image
	            cross-entropies.

	        Raises:
	            ValueError: If ``merged_df`` is given without ``indices``.
	        """
	        device = image_features.device
	        batch_size, _ = image_features.size()
	        labels = torch.arange(batch_size, device=device, dtype=torch.long)

	        logits_per_image = self.logit_scale * image_features @ text_features.t()
	        logits_per_text = logits_per_image.t()

	        if merged_df is not None:
	            keep = self._keep_mask(merged_df, indices, device)
	            logits_per_image = logits_per_image.masked_fill(~keep, float('-inf'))
	            logits_per_text = logits_per_text.masked_fill(~keep.t(), float('-inf'))

	        image_loss = F.cross_entropy(logits_per_image, labels)
	        text_loss = F.cross_entropy(logits_per_text, labels)
	        return (image_loss + text_loss) / 2

	class ResidualAdapter(nn.Module):
	    """Bottleneck MLP: Linear(dim -> bottleneck_dim), LeakyReLU(0.2), Linear(bottleneck_dim -> dim).

	    NOTE(review): despite the class name, ``forward`` adds no skip connection;
	    it returns only the MLP output. Confirm whether callers are expected to
	    add the residual themselves.

	    Args:
	        dim: Size of the input and output feature dimension.
	        bottleneck_dim: Size of the hidden (bottleneck) dimension.
	    """

	    def __init__(self, dim, bottleneck_dim=128):
	        super(ResidualAdapter, self).__init__()
	        self.down = nn.Linear(dim, bottleneck_dim)
	        self.act = nn.LeakyReLU(0.2)
	        self.up = nn.Linear(bottleneck_dim, dim)

	        # Re-draw both weight matrices (down first, then up) with Kaiming-normal
	        # initialisation; the biases keep nn.Linear's default initialisation.
	        for layer in (self.down, self.up):
	            nn.init.kaiming_normal_(layer.weight)

	    def forward(self, x):
	        hidden = self.down(x)
	        hidden = self.act(hidden)
	        return self.up(hidden)


	class CLIPLossACE_HGAT(nn.Module):
	    """Contrastive loss on token features refined through a sparse top-k token graph.

	    Image and/or text token features are passed through ``apply_ace_hgat``
	    (selected by ``args.apply_gnn_encoders``); the features at token position
	    0 of each modality are then compared with a symmetric cross-entropy loss.

	    Args:
	        args: Configuration object. Reads ``seed``, ``hidden_features``,
	            ``learnable_logit_scale``, ``topk`` and ``apply_gnn_encoders``.
	        logit_scale: Tensor with the initial logit scale; cloned and detached,
	            then stored as an ``nn.Parameter`` if
	            ``args.learnable_logit_scale`` is truthy, otherwise as a buffer.
	        in_channels: Feature dimension of the tokens fed to the adapters.
	    """

	    _GNN_MODES = ('vision', 'text', 'both')

	    def __init__(self, args, logit_scale, in_channels):
	        super().__init__()
	        # Side effect: reseeds the global RNGs before the adapters are created.
	        set_random_seed(args.seed)
	        self.args = args
	        self.img_edge_adapter = ResidualAdapter(in_channels, args.hidden_features)
	        self.text_edge_adapter = ResidualAdapter(in_channels, args.hidden_features)
	        self.img_node_adapter = ResidualAdapter(in_channels, args.hidden_features)
	        self.text_node_adapter = ResidualAdapter(in_channels, args.hidden_features)

	        scale = logit_scale.clone().detach()
	        if args.learnable_logit_scale:
	            self.logit_scale = nn.Parameter(scale)
	        else:
	            self.register_buffer('logit_scale', scale)

	    def apply_ace_hgat(self, features, attn_weights, encoder="img"):
	        """Propagate token features over a sparse, softmax-weighted token graph.

	        Args:
	            features: Tensor ``(B, N, D)``; position 0 is treated as the global
	                token and positions ``1:`` as the remaining tokens.
	            attn_weights: Weights from the global token to the other tokens,
	                assignable to a ``(B, N - 1)`` slice.
	            encoder: ``"img"`` or ``"text"``; selects which adapter pair is used.

	        Returns:
	            Tensor ``(B, N, D)``: ``node_adapter(A^T @ edge_adapter(A @ features))``.

	        Raises:
	            ValueError: If ``encoder`` is neither ``"img"`` nor ``"text"``.
	        """
	        if encoder == "img":
	            edge_adapter, node_adapter = self.img_edge_adapter, self.img_node_adapter
	        elif encoder == 'text':
	            edge_adapter, node_adapter = self.text_edge_adapter, self.text_node_adapter
	        else:
	            raise ValueError(f"encoder must be img or text but given {encoder}")

	        B, N, D = features.shape
	        device = features.device
	        patches_norm = F.normalize(features[:, 1:, :], p=2, dim=-1)

	        # Affinity matrix (B, N, N): cosine similarity between non-global
	        # tokens, and the supplied attention weights on the global-token row.
	        sim = torch.zeros(size=(B, N, N), device=device)
	        sim[:, 1:, 1:] = torch.bmm(patches_norm, patches_norm.transpose(1, 2))
	        sim[:, 0, 1:] = attn_weights

	        # Exclude self-links and every token -> global-token link from selection.
	        eye = torch.eye(N, device=device).bool()
	        blocked = eye.unsqueeze(0).repeat(B, 1, 1)
	        blocked[:, 1:, 0] = True
	        sim = sim.masked_fill(blocked, -float('inf'))

	        # Keep the k strongest links per row. k is clamped because torch.topk
	        # raises when k exceeds the size of the selected dimension.
	        k = min(self.args.topk, N)
	        topk_vals, topk_indices = torch.topk(sim, k=k, dim=-1)
	        mask_sparse = torch.full_like(sim, -float('inf'))
	        mask_sparse.scatter_(-1, topk_indices, topk_vals)
	        A = F.softmax(mask_sparse, dim=-1)

	        # Self-links get weight 1, and the global-token column mirrors the
	        # global-token row.
	        A = A.masked_fill(eye.unsqueeze(0), 1)
	        A[:, 1:, 0] = A[:, 0, 1:]

	        H_edges_refined = edge_adapter(torch.matmul(A, features))
	        H_context_raw = torch.matmul(A.transpose(1, 2), H_edges_refined)
	        return node_adapter(H_context_raw)

	    @staticmethod
	    def _keep_mask(merged_df, indices, device):
	        """Return a (batch, batch) bool tensor that is False for masked pairs.

	        Pairs whose ``merged_df`` rows are identical from column 2 onwards are
	        masked; the diagonal is always kept.
	        """
	        if indices is None:
	            raise ValueError("indices must be provided when merged_df is given")
	        if torch.is_tensor(indices):
	            # pandas positional indexing cannot consume (GPU) tensors directly.
	            indices = indices.detach().cpu().numpy()
	        rows = merged_df.iloc[indices, 2:].to_numpy()
	        identical = np.asarray(
	            (rows[:, None, :] == rows[None, :, :]).all(axis=2), dtype=bool
	        )
	        keep = ~identical
	        np.fill_diagonal(keep, True)
	        return torch.from_numpy(keep).to(device)

	    @classmethod
	    def _contrastive_loss(cls, logits_per_image, labels, merged_df, indices, device):
	        """Mean of image->text and text->image cross-entropy, optionally masked."""
	        logits_per_text = logits_per_image.T
	        if merged_df is not None:  # Label-guided InfoNCE loss
	            keep = cls._keep_mask(merged_df, indices, device)
	            logits_per_image = logits_per_image.masked_fill(~keep, float('-inf'))
	            logits_per_text = logits_per_text.masked_fill(~keep.T, float('-inf'))
	        image_loss = F.cross_entropy(logits_per_image, labels)
	        text_loss = F.cross_entropy(logits_per_text, labels)
	        return (image_loss + text_loss) / 2

	    def forward(self, clip_model, images, texts, merged_df=None, indices=None):
	        """Encode a batch with ``clip_model`` and return the contrastive loss.

	        Args:
	            clip_model: Model exposing ``visual.trunk.get_attn_scores``,
	                ``visual.head`` and ``encode_text`` as called below.
	            images: Image batch passed to the visual trunk.
	            texts: Text batch passed to ``encode_text``.
	            merged_df: Optional pandas DataFrame; when given, pairs whose rows
	                are identical from column 2 onwards are masked out of the loss.
	            indices: Positions of the batch samples in ``merged_df`` (list,
	                array or tensor). Required when ``merged_df`` is given.

	        Returns:
	            Scalar loss tensor.

	        Raises:
	            ValueError: If ``args.apply_gnn_encoders`` is not ``'vision'``,
	                ``'text'`` or ``'both'``, if the logits contain NaN, or if
	                ``merged_df`` is given without ``indices``.
	        """
	        mode = self.args.apply_gnn_encoders
	        if mode not in self._GNN_MODES:
	            # Previously an unknown mode left the logits undefined and crashed
	            # later with an UnboundLocalError.
	            raise ValueError(
	                f"apply_gnn_encoders must be one of {self._GNN_MODES} but given {mode}"
	            )

	        device = images.device
	        # NOTE: this mutates the caller's model on every call; presumably it
	        # makes the trunk return per-token features -- confirm against the trunk.
	        clip_model.visual.trunk.global_pool = ''
	        image_features, img_attn_scores = clip_model.visual.trunk.get_attn_scores(images)
	        image_features = F.normalize(clip_model.visual.head(image_features), dim=-1)
	        text_features, text_attn_scores = clip_model.encode_text(
	            texts, normalize=True, output_attentions=True, output_tokens=True
	        )

	        if mode in ('vision', 'both'):
	            # Average over dim 1 (presumably heads), then take the row of token 0
	            # against all other tokens.
	            img_attn_weights = img_attn_scores.mean(dim=1)[:, 0, 1:]
	            image_features = self.apply_ace_hgat(image_features, img_attn_weights, encoder="img")
	            image_features = F.normalize(image_features, dim=-1)

	        if mode in ('text', 'both'):
	            # Same reduction on the last entry of the returned text attentions.
	            text_attn_weights = text_attn_scores[-1].mean(dim=1)[:, 0, 1:]
	            text_features = self.apply_ace_hgat(text_features, text_attn_weights, encoder="text")
	            text_features = F.normalize(text_features, dim=-1)

	        # Only the features at token position 0 of each modality are compared.
	        logits_per_image = self.logit_scale * image_features[:, 0] @ text_features[:, 0].t()
	        if torch.isnan(logits_per_image).any():
	            raise ValueError('NaN value in logits_per_image')

	        labels = torch.arange(image_features.shape[0], device=device, dtype=torch.long)
	        return self._contrastive_loss(logits_per_image, labels, merged_df, indices, device)