import torch
import numpy as np


class SliderProjector(torch.nn.Module):
    def __init__(
        self,
        out_dim,               # dimension of the output token that the projector will generate
        pe_dim,                # dimension of the positional encoding applied to the slider scalar
        n_layers=4,            # number of linear layers in the projector MLP
        is_clip_input=True,    # whether CLIP embeddings are concatenated with the scalar embedding
    ):
        super().__init__()
        self.out_dim = out_dim
        self.pe_dim = pe_dim
        self.is_clip_input = is_clip_input

        # Build the projector layers; n_layers controls the depth of the MLP.
        layers = []
        pe_extender_dim = 768
        # If CLIP embeddings are passed along with the slider scalar, the projector
        # input is the concatenation of both 768-dim embeddings.
        if is_clip_input:
            in_dim = pe_extender_dim + 768
        else:
            in_dim = pe_extender_dim
        # Accumulate Linear + ReLU blocks, then a final Linear without activation.
        for i in range(n_layers - 1):
            layers.append(torch.nn.Linear(in_dim, out_dim))
            layers.append(torch.nn.ReLU())
            in_dim = out_dim
        layers.append(torch.nn.Linear(in_dim, out_dim))

        # A simple linear layer that lifts the positional encoding into a 768-dim space.
        self.pe_extender = torch.nn.Linear(pe_dim, 768)
        # The projector network that maps the (combined) embedding to the output token.
        self.projector = torch.nn.Sequential(*layers)

    # A simple sin/cos encoding of the scalar slider value.
    def posEnc(self, s):
        pe = torch.stack([torch.sin(torch.pi * s), torch.cos(torch.pi * s)], dim=-1)
        return pe

    # Takes the slider scalar (and optionally CLIP embeddings) and projects it to a
    # token embedding used to condition the diffusion model.
    def forward(self, s, clip_embeddings=None):
        # Apply the positional encoding to the input scalar and lift it to 768 dims.
        x_pe = self.posEnc(s)
        x_scale_embedding = self.pe_extender(x_pe)  # (1, 768)
        if clip_embeddings is not None:
            # Concatenate the scalar embedding with the CLIP embedding before projecting.
            x_combined_embedding = torch.cat([x_scale_embedding, clip_embeddings], dim=-1)  # (1, 768 + 768)
            x_proj = self.projector(x_combined_embedding)
        else:
            # Without CLIP embeddings the projector expects a 768-dim input
            # (only valid when the module was built with is_clip_input=False).
            x_proj = self.projector(x_scale_embedding)
        return x_proj
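
# Minimal usage sketch (illustrative, not part of the original module): project a
# slider scalar together with a (1, 768) CLIP embedding into one conditioning token.
# The values are assumptions: pe_dim=2 matches the two sin/cos features produced by
# posEnc, out_dim=768 matches the comment above, and the tensors are stand-ins.
def _demo_slider_projector():
    projector = SliderProjector(out_dim=768, pe_dim=2)
    s = torch.tensor([0.5])          # slider scalar, shape (1,) -> posEnc gives (1, 2)
    clip_emb = torch.randn(1, 768)   # stand-in for a CLIP text/image embedding
    token = projector(s, clip_embeddings=clip_emb)
    return token                     # expected shape: (1, 768)
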
class SliderProjector_wo_clip(torch.nn.Module):
    def __init__(
        self,
        out_dim,               # dimension of the output token that the projector will generate
        pe_dim,                # dimension of the positional encoding applied to the slider scalar
        n_layers=4,            # number of linear layers in the projector MLP
        is_clip_input=False,   # kept for API consistency; CLIP embeddings are not used here
    ):
        super().__init__()
        self.out_dim = out_dim
        self.pe_dim = pe_dim

        # Build the projector layers; n_layers controls the depth of the MLP.
        layers = []
        pe_extender_dim = 768
        # The input is lifted to 768 dims with a linear layer to keep the dimensions
        # consistent with the CLIP-based variant.
        in_dim = pe_extender_dim
        # Accumulate Linear + ReLU blocks, then a final Linear without activation.
        for i in range(n_layers - 1):
            layers.append(torch.nn.Linear(in_dim, out_dim))
            layers.append(torch.nn.ReLU())
            in_dim = out_dim
        layers.append(torch.nn.Linear(in_dim, out_dim))

        # Lift the positional encoding to the same dimension as CLIP embeddings (768).
        self.pe_extender = torch.nn.Linear(pe_dim, 768)
        # The projector network that maps the scalar embedding to the output token.
        self.projector = torch.nn.Sequential(*layers)

    # A simple sin/cos encoding of the scalar slider value.
    def posEnc(self, s):
        pe = torch.stack([torch.sin(torch.pi * s), torch.cos(torch.pi * s)], dim=-1)
        return pe

    # Takes the slider scalar and projects it to a token embedding used to condition
    # the diffusion model.
    def forward(self, s):
        x_pe = self.posEnc(s)
        x_scale_embedding = self.pe_extender(x_pe)
        x_proj = self.projector(x_scale_embedding)
        return x_proj
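
# Minimal usage sketch for the CLIP-free variant (illustrative assumptions as above):
# the scalar is sin/cos encoded, lifted to 768 dims, and projected to the output token.
def _demo_slider_projector_wo_clip():
    projector = SliderProjector_wo_clip(out_dim=768, pe_dim=2)
    s = torch.tensor([0.5])          # slider scalar, shape (1,)
    token = projector(s)
    return token                     # expected shape: (1, 768)


if __name__ == "__main__":
    print(_demo_slider_projector().shape)          # torch.Size([1, 768])
    print(_demo_slider_projector_wo_clip().shape)  # torch.Size([1, 768])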