# File size: 2,317 Bytes
# 166ab04
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from typing import Dict, List, Optional, Tuple, Union

import torch
from accelerate import Accelerator
from diffusers.models import AutoencoderKL


def _zero_out_nans(tensor: torch.Tensor, name: str, log) -> torch.Tensor:
    """Return ``tensor`` with NaN entries set to zero, reporting through ``log`` if any exist."""
    nan_mask = torch.isnan(tensor)
    if nan_mask.any():
        log(f"NaN found in {name}, replacing with zeros")
        tensor = tensor.masked_fill(nan_mask, 0)
    return tensor


# Called form of the decorator: it is accepted by every PyTorch release,
# whereas the bare ``@torch.no_grad`` spelling is only understood by newer ones.
@torch.no_grad()
def encode_clean_latents(
    batch: Dict,
    vae: AutoencoderKL,
    weight_dtype: Optional[torch.dtype] = None,
    accelerator: Optional[Accelerator] = None) -> Tuple[torch.Tensor, torch.Tensor]:
    """Encode the images and masked images of a batch into scaled VAE latents.

    Both ``batch["images"]`` and ``batch["masked_images"]`` are cast to
    ``vae.dtype``, passed through ``vae.encode`` and sampled from the returned
    ``latent_dist``. NaN values in either result are replaced with zeros (a
    warning is printed when that happens), and both results are multiplied by
    ``vae.config.scaling_factor``. Gradient tracking is disabled for the call.

    Args:
        batch: Mapping providing the ``"images"`` and ``"masked_images"`` tensors.
        vae: Autoencoder used for encoding.
        weight_dtype: Dtype the sampled latents are cast to; defaults to
            ``vae.dtype`` when ``None``.
        accelerator: When given, warnings go through ``accelerator.print``;
            otherwise the built-in ``print`` is used.

    Returns:
        A ``(latents, masked_image_latents)`` pair.
    """
    # Choose the logger explicitly. Rebinding the name ``print`` inside the
    # function would make it local everywhere in the body, so a warning
    # without an accelerator would raise UnboundLocalError.
    log = print if accelerator is None else accelerator.print
    if weight_dtype is None:
        weight_dtype = vae.dtype

    latents = vae.encode(batch["images"].to(vae.dtype)).latent_dist.sample().to(weight_dtype)
    masked_image_latents = vae.encode(
        batch["masked_images"].to(dtype=vae.dtype)).latent_dist.sample().to(weight_dtype)

    # Sanitize before scaling so a NaN never propagates into the returned latents.
    latents = _zero_out_nans(latents, "latents", log)
    masked_image_latents = _zero_out_nans(masked_image_latents, "masked_image_latents", log)

    scaling_factor = vae.config.scaling_factor
    return latents * scaling_factor, masked_image_latents * scaling_factor


def predict_noise(
    diff_model: torch.nn.Module,
    noisy_latents: torch.Tensor,
    resized_masks: torch.Tensor,
    masked_latents: torch.Tensor,
    timesteps: torch.Tensor,
    input_ids: torch.Tensor,
    guidance_scale: float = 1.0,
    un_cond_input_ids=None) -> torch.Tensor:
    """Run the diffusion model once and blend its two output halves with guidance.

    ``noisy_latents``, ``resized_masks`` and ``masked_latents`` are each
    repeated twice along dim 0 and then concatenated along dim 1 to form the
    model input. ``timesteps`` and ``input_ids`` are forwarded unchanged.
    The ``.sample`` of the model output is split into two equal chunks along
    dim 0; the first is treated as the unconditional prediction and the second
    as the conditional one.

    Args:
        diff_model: Callable invoked as
            ``diff_model(x, timesteps=..., input_ids=...)`` whose result
            exposes ``.sample``.
        noisy_latents: Noisy latents for a single (non-duplicated) batch.
        resized_masks: Masks concatenated to the latents along dim 1.
        masked_latents: Masked-image latents concatenated along dim 1.
        timesteps: Passed to the model as-is (not duplicated here).
        input_ids: Passed to the model as-is; its first dimension must be even.
            Presumably holds the unconditional rows followed by the
            conditional rows -- TODO confirm against the caller.
        guidance_scale: Weight applied to ``cond - uncond``.
        un_cond_input_ids: Accepted but not used by this function.

    Returns:
        ``uncond + guidance_scale * (cond - uncond)``.

    Raises:
        AssertionError: If ``input_ids.shape[0]`` is odd.
    """
    # One forward pass serves both guidance branches, so every latent-space
    # input is stacked on top of itself along the batch dimension.
    doubled_inputs = [
        torch.cat((tensor, tensor))
        for tensor in (noisy_latents, resized_masks, masked_latents)
    ]

    assert input_ids.shape[0] % 2 == 0

    model_input = torch.cat(doubled_inputs, dim=1)

    # Predict the noise residual
    model_output = diff_model(model_input, timesteps=timesteps, input_ids=input_ids)

    uncond, cond = model_output.sample.chunk(2)
    return uncond + guidance_scale * (cond - uncond)