File size: 3,478 Bytes
3d79eb3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
#

import math
from typing import Optional

import torch
from fairseq2.nn.projection import Linear
from fairseq2.typing import DataType, Device
from torch import Tensor
from torch.nn import Module

from lcm.nn.initialization import parse_activation_fn


class DiTTimestepEncoder(Module):
    """
    Embeds scalar timesteps into vector representations.

    Each timestep is first expanded into sinusoidal features of size
    ``frequency_embedding_size`` and then passed through a two-layer MLP
    (``fc1`` -> optional activation -> ``fc2``) that outputs
    ``embedding_dim`` features.

    Based on DiT's `TimestepEmbedder`
    https://github.com/facebookresearch/DiT/blob/main/models.py
    """

    def __init__(
        self,
        embedding_dim: int,
        frequency_embedding_size: int = 256,
        activation_fn_name: str = "silu",
        device: Optional[Device] = None,
        dtype: Optional[DataType] = None,
    ):
        """
        :param embedding_dim: output size of both projections.
        :param frequency_embedding_size: size of the sinusoidal features
            fed to the first projection.
        :param activation_fn_name: name handed to ``parse_activation_fn`` to
            obtain the non-linearity applied between the two projections.
        :param device: device forwarded to both projections.
        :param dtype: dtype forwarded to both projections; the sinusoidal
            features are cast to it in :meth:`forward` when it is set.
        """
        super().__init__()

        self.dtype = dtype

        self.device = device

        self.embedding_dim = embedding_dim

        self.frequency_embedding_size = frequency_embedding_size

        self.fc1 = Linear(
            frequency_embedding_size,
            embedding_dim,
            bias=True,
            device=device,
            dtype=dtype,
        )
        self.nonlin = parse_activation_fn(activation_fn_name)
        self.fc2 = Linear(
            embedding_dim,
            embedding_dim,
            bias=True,
            device=device,
            dtype=dtype,
        )

        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Reset the parameters and buffers of the module.

        Projection weights are drawn from a normal distribution with
        ``std=0.02``; biases, when present, are set to zero.
        """
        for projection in (self.fc1, self.fc2):
            torch.nn.init.normal_(projection.weight, std=0.02)

            if projection.bias is not None:
                torch.nn.init.zeros_(projection.bias)

    @staticmethod
    def sinusoidal_timestep_embedding(
        timestep: Tensor, frequency_embedding_size: int, max_period: int = 10000
    ) -> Tensor:
        """
        Create sinusoidal timestep embeddings.
        :param timestep: a 1-D Tensor of N indices, one per batch element.
                          These may be fractional. An (N, 1) column (as
                          passed by `forward`) is also accepted and yields
                          an (N, 1, D) result.
        :param frequency_embedding_size: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings, laid out as
                 all cosines followed by all sines, plus one trailing zero
                 feature when D is odd.

        Based on DiT's `TimestepEmbedder`
        https://github.com/facebookresearch/DiT/blob/main/models.py
        """
        half = frequency_embedding_size // 2

        freqs = torch.exp(
            -math.log(max_period)
            * torch.arange(start=0, end=half, dtype=torch.float32)
            / half
        ).to(device=timestep.device)

        args = timestep[:, None].float() * freqs[None]

        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)

        if frequency_embedding_size % 2:
            # Pad exactly one zero feature along the LAST dimension. Slicing
            # with `...` keeps this correct when `args` carries extra
            # singleton dimensions (e.g. an (N, 1) input from `forward`);
            # slicing dim 1 instead would duplicate the whole feature axis.
            embedding = torch.cat(
                [embedding, torch.zeros_like(embedding[..., :1])], dim=-1
            )

        return embedding

    def forward(self, timesteps: Tensor) -> Tensor:
        """
        Embed a tensor of timesteps.
        :param timesteps: a Tensor of timesteps of any shape.
        :return: a Tensor of shape ``(*timesteps.shape, embedding_dim)``.
        """
        initial_size = timesteps.size()

        # `reshape` (rather than `view`) also handles non-contiguous inputs.
        flat_timesteps = timesteps.reshape(-1, 1)

        t_freq = self.sinusoidal_timestep_embedding(
            flat_timesteps, self.frequency_embedding_size
        )

        # Only cast when a dtype was requested at construction time;
        # otherwise keep the float32 features produced above.
        if self.dtype is not None:
            t_freq = t_freq.to(self.dtype)

        t_emb = self.fc1(t_freq)

        if self.nonlin is not None:
            t_emb = self.nonlin(t_emb)

        t_emb = self.fc2(t_emb)

        # Restore the caller's leading shape and append the feature axis.
        return t_emb.reshape(*initial_size, self.embedding_dim)