# Provenance: scraped from a Hugging Face Space file viewer (commit bd90279,
# "Running on Zero", 4,357 bytes). The page chrome and line-number gutter
# that surrounded the code have been converted to this comment so the file
# parses as Python.
import torch
import numpy as np
class SliderProjector(torch.nn.Module):
    """Project a scalar slider value (optionally fused with CLIP embeddings)
    into a single token embedding used to condition a diffusion model.

    The scalar is sin/cos-encoded, lifted to a 768-dim space (the CLIP
    embedding width), optionally concatenated with a CLIP embedding, and
    passed through a small MLP.
    """

    def __init__(
        self,
        out_dim,              # dimension of the output token the projector generates
        pe_dim,               # dimension of the positional embedding fed to pe_extender
        n_layers=4,           # total number of Linear layers in the projector MLP
        is_clip_input=True,   # whether CLIP embeddings are concatenated to the input
    ):
        super().__init__()
        self.out_dim = out_dim
        self.pe_dim = pe_dim
        self.is_clip_input = is_clip_input

        # The scalar's positional embedding is widened to 768 dims so it
        # matches the CLIP embedding width.
        pe_extender_dim = 768
        # When CLIP embeddings are concatenated with the scalar embedding,
        # the projector's input width doubles (768 + 768).
        in_dim = pe_extender_dim + 768 if is_clip_input else pe_extender_dim

        # Build the MLP: (n_layers - 1) Linear+ReLU stages, then a final Linear.
        layers = []
        for _ in range(n_layers - 1):
            layers.append(torch.nn.Linear(in_dim, out_dim))
            layers.append(torch.nn.ReLU())
            in_dim = out_dim
        layers.append(torch.nn.Linear(in_dim, out_dim))

        # Simple linear layer extending the raw positional encoding into the
        # 768-dim space.
        self.pe_extender = torch.nn.Linear(pe_dim, 768)
        self.projector = torch.nn.Sequential(*layers)

    def posEnc(self, s):
        """Encode the scalar input s as [sin(pi*s), cos(pi*s)] stacked on a
        new trailing dimension (so a (...,) input becomes (..., 2))."""
        return torch.stack([torch.sin(torch.pi * s), torch.cos(torch.pi * s)], dim=-1)

    def forward(self, s, clip_embeddings=None):
        """Project the slider scalar s (and optional CLIP embeddings) to a token.

        Fix: the original only assigned the projector's input inside the
        `clip_embeddings is not None` branch, so calling forward() without
        CLIP embeddings raised UnboundLocalError even though the constructor
        supports is_clip_input=False. Now the scalar embedding alone is used
        in that case; behavior with clip_embeddings supplied is unchanged.
        """
        x_pe = self.posEnc(s)
        x_scale_embedding = self.pe_extender(x_pe)  # (..., 768)
        if clip_embeddings is not None:
            # Fuse scalar and CLIP embeddings along the feature dimension.
            # Assumes clip_embeddings' trailing dim is 768 — TODO confirm with callers.
            x = torch.cat([x_scale_embedding, clip_embeddings], dim=-1)  # (..., 1536)
        else:
            # No CLIP input: only valid when the MLP was built with
            # is_clip_input=False (input width 768).
            x = x_scale_embedding
        return self.projector(x)
class SliderProjector_wo_clip(torch.nn.Module):
    """Map a scalar slider value to a conditioning token without CLIP input.

    The scalar is sin/cos-encoded, lifted to 768 dims (kept width-compatible
    with the CLIP-based variant), then pushed through a small MLP.
    """

    def __init__(
        self,
        out_dim,               # dimension of the output token
        pe_dim,                # dimension of the raw positional embedding
        n_layers=4,            # total number of Linear layers in the MLP
        is_clip_input=False,   # kept for signature parity with SliderProjector; unused
    ):
        super().__init__()
        self.out_dim = out_dim
        self.pe_dim = pe_dim

        # MLP: (n_layers - 1) hidden Linear+ReLU stages followed by one Linear.
        # The input width matches the extended positional embedding (768).
        width_in = 768
        stages = []
        for _ in range(n_layers - 1):
            stages += [torch.nn.Linear(width_in, out_dim), torch.nn.ReLU()]
            width_in = out_dim
        stages.append(torch.nn.Linear(width_in, out_dim))

        # Lifts the raw (pe_dim) positional encoding up to the 768-dim space
        # so dimensions stay consistent with the CLIP-based model.
        self.pe_extender = torch.nn.Linear(pe_dim, 768)
        self.projector = torch.nn.Sequential(*stages)

    def posEnc(self, s):
        """Return [sin(pi*s), cos(pi*s)] stacked along a new trailing axis."""
        return torch.stack((torch.sin(torch.pi * s), torch.cos(torch.pi * s)), dim=-1)

    def forward(self, s):
        """Project the scalar s into a token embedding for diffusion conditioning."""
        lifted = self.pe_extender(self.posEnc(s))
        return self.projector(lifted)