2602
Browse files
- dataset.py +16 -105
- samples/unet_320x640_0.jpg +2 -2
- samples/unet_352x640_0.jpg +2 -2
- samples/unet_384x640_0.jpg +2 -2
- samples/unet_416x640_0.jpg +2 -2
- samples/unet_448x640_0.jpg +2 -2
- samples/unet_480x640_0.jpg +2 -2
- samples/unet_512x640_0.jpg +2 -2
- samples/unet_544x640_0.jpg +2 -2
- samples/unet_576x640_0.jpg +2 -2
- samples/unet_608x640_0.jpg +2 -2
- samples/unet_640x320_0.jpg +2 -2
- samples/unet_640x352_0.jpg +2 -2
- samples/unet_640x384_0.jpg +2 -2
- samples/unet_640x416_0.jpg +2 -2
- samples/unet_640x448_0.jpg +2 -2
- samples/unet_640x480_0.jpg +2 -2
- samples/unet_640x512_0.jpg +2 -2
- samples/unet_640x544_0.jpg +2 -2
- samples/unet_640x576_0.jpg +2 -2
- samples/unet_640x608_0.jpg +2 -2
- samples/unet_640x640_0.jpg +2 -2
- unet/diffusion_pytorch_model.safetensors +1 -1
dataset.py
CHANGED
|
@@ -18,12 +18,12 @@ from tqdm import tqdm
|
|
| 18 |
from datetime import timedelta
|
| 19 |
|
| 20 |
# ---------------- 1️⃣ Настройки ----------------
|
| 21 |
-
dtype = torch.
|
| 22 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 23 |
batch_size = 5
|
| 24 |
-
min_size = 320 #384 #320 #192 #256 #192
|
| 25 |
-
max_size = 640 #768 #640 #384 #256 #384
|
| 26 |
-
step =
|
| 27 |
empty_share = 0.0
|
| 28 |
limit = 0
|
| 29 |
# Основная процедура обработки
|
|
@@ -43,20 +43,9 @@ def clear_cuda_memory():
|
|
| 43 |
def load_models():
|
| 44 |
print("Загрузка моделей...")
|
| 45 |
#vae = AsymmetricAutoencoderKL.from_pretrained("AiArtLab/sdxs-1b",subfolder="vae",torch_dtype=dtype).to(device).eval()
|
| 46 |
-
vae =
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
#tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 50 |
-
#model = AutoModelForCausalLM.from_pretrained(
|
| 51 |
-
# model_name,
|
| 52 |
-
# torch_dtype=dtype,
|
| 53 |
-
# device_map=device
|
| 54 |
-
#).eval()
|
| 55 |
-
#tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-Embedding-0.6B', padding_side='left')
|
| 56 |
-
#model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B').to("cuda")
|
| 57 |
-
return vae#, model, tokenizer
|
| 58 |
-
|
| 59 |
-
#vae, model, tokenizer = load_models()
|
| 60 |
vae = load_models()
|
| 61 |
|
| 62 |
shift_factor = getattr(vae.config, "shift_factor", 0.0)
|
|
@@ -67,8 +56,11 @@ scaling_factor = getattr(vae.config, "scaling_factor", 1.0)
|
|
| 67 |
if scaling_factor is None:
|
| 68 |
scaling_factor = 1.0
|
| 69 |
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
# ---------------- 3️⃣ Трансформации ----------------
|
| 74 |
def get_image_transform(min_size=256, max_size=512, step=64):
|
|
@@ -126,50 +118,6 @@ def get_image_transform(min_size=256, max_size=512, step=64):
|
|
| 126 |
return transform
|
| 127 |
|
| 128 |
# ---------------- 4️⃣ Функции обработки ----------------
|
| 129 |
-
def last_token_pool(last_hidden_states: torch.Tensor,
                    attention_mask: torch.Tensor) -> torch.Tensor:
    """Select, for every sequence, the hidden vector at its last attended position.

    Args:
        last_hidden_states: tensor indexed as [batch, position, ...].
        attention_mask: tensor indexed as [batch, position]; summed along
            dim 1 to obtain each sequence's attended length.

    Returns:
        One hidden vector per batch row.
    """
    rows = attention_mask.shape[0]

    # When every row attends to the final position the batch is left-padded,
    # so the final position already holds the last real token of each row.
    if attention_mask[:, -1].sum() == rows:
        return last_hidden_states[:, -1]

    # Otherwise the last real token of a row sits at (attended length - 1).
    last_positions = attention_mask.sum(dim=1) - 1
    row_index = torch.arange(last_hidden_states.shape[0],
                             device=last_hidden_states.device)
    return last_hidden_states[row_index, last_positions]
|
| 139 |
-
|
| 140 |
-
def encode_texts_batch(texts, tokenizer, model, device="cuda", max_length=150, normalize=False):
    """Return the last-layer hidden states of every token for a batch of texts.

    Args:
        texts: the strings to encode.
        tokenizer: callable invoked with ``return_tensors="pt"``,
            ``padding="max_length"``, ``truncation=True`` and ``max_length``;
            its result must support ``.to(device)`` and ``**`` unpacking.
        model: wrapper whose ``.model`` attribute is called with the tokenized
            batch and ``output_hidden_states=True``.
        device: where the tokenized batch is moved before the forward pass.
        max_length: padding / truncation length given to the tokenizer.
        normalize: when true, L2-normalize each token vector along the last axis.

    Returns:
        A NumPy array holding the final entry of ``hidden_states``.
    """
    tokenizer_options = dict(
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    with torch.inference_mode():
        encoded = tokenizer(texts, **tokenizer_options).to(device)

        # Call the base model held inside the wrapper and ask for all layers.
        result = model.model(**encoded, output_hidden_states=True)

        # Keep only the last layer: one vector per token.
        token_states = result.hidden_states[-1]

        if normalize:
            token_states = F.normalize(token_states, p=2, dim=-1)

        return token_states.cpu().numpy()
|
| 172 |
-
|
| 173 |
def clean_label(label):
|
| 174 |
label = label.replace("Image 1", "").replace("Image 2", "").replace("Image 3", "").replace("Image 4", "").replace("The image depicts ","").replace("The image presents ","").replace("The image features ","").replace("The image portrays ","").replace("The image is ","").strip()
|
| 175 |
if label.startswith("."):
|
|
@@ -200,42 +148,6 @@ def process_labels_for_guidance(original_labels, prob_to_make_empty=0.01):
|
|
| 200 |
|
| 201 |
return labels_for_model, labels_for_logging
|
| 202 |
|
| 203 |
-
def _patchify_latents(latents):
|
| 204 |
-
batch_size, num_channels_latents, height, width = latents.shape
|
| 205 |
-
latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
|
| 206 |
-
latents = latents.permute(0, 1, 3, 5, 2, 4)
|
| 207 |
-
latents = latents.reshape(batch_size, num_channels_latents * 4, height // 2, width // 2)
|
| 208 |
-
return latents
|
| 209 |
-
|
| 210 |
-
@staticmethod
|
| 211 |
-
def _unpatchify_latents(latents):
|
| 212 |
-
batch_size, num_channels_latents, height, width = latents.shape
|
| 213 |
-
latents = latents.reshape(batch_size, num_channels_latents // (2 * 2), 2, 2, height, width)
|
| 214 |
-
latents = latents.permute(0, 1, 4, 2, 5, 3)
|
| 215 |
-
latents = latents.reshape(batch_size, num_channels_latents // (2 * 2), height * 2, width * 2)
|
| 216 |
-
return latents
|
| 217 |
-
|
| 218 |
-
def flux_encode(vae, latents):
    """Normalize latents with the VAE's batch-norm running statistics.

    The latents are packed into 2x2 patches, standardized per packed channel
    as ``(x - running_mean) / sqrt(running_var + batch_norm_eps)``, and then
    unpacked back to their original layout.

    Args:
        vae: object exposing ``bn.running_mean``, ``bn.running_var`` and
            ``config.batch_norm_eps``.
        latents: 4-D latent tensor with even height and width.

    Returns:
        The normalized latents, same shape as the input.
    """
    image_latents = _patchify_latents(latents)

    # Both statistics are moved to the latents' device and dtype so the
    # arithmetic never mixes devices or silently promotes the result dtype.
    target = (image_latents.device, image_latents.dtype)
    latents_bn_mean = vae.bn.running_mean.view(1, -1, 1, 1).to(*target)
    latents_bn_std = torch.sqrt(
        vae.bn.running_var.view(1, -1, 1, 1) + vae.config.batch_norm_eps
    ).to(*target)

    normalized = (image_latents - latents_bn_mean) / latents_bn_std
    return _unpatchify_latents(normalized)
|
| 228 |
-
|
| 229 |
-
def flux_decode(vae, latents):
    """Undo the batch-norm normalization applied by ``flux_encode``.

    The latents are packed into 2x2 patches, de-standardized per packed
    channel as ``x * sqrt(running_var + batch_norm_eps) + running_mean``,
    and then unpacked back to their original layout.

    Args:
        vae: object exposing ``bn.running_mean``, ``bn.running_var`` and
            ``config.batch_norm_eps``.
        latents: 4-D normalized latent tensor with even height and width.

    Returns:
        The de-normalized latents, same shape as the input.
    """
    image_latents = _patchify_latents(latents)

    # Both statistics are moved to the latents' device and dtype so the
    # arithmetic never mixes devices or silently promotes the result dtype.
    target = (image_latents.device, image_latents.dtype)
    latents_bn_mean = vae.bn.running_mean.view(1, -1, 1, 1).to(*target)
    latents_bn_std = torch.sqrt(
        vae.bn.running_var.view(1, -1, 1, 1) + vae.config.batch_norm_eps
    ).to(*target)

    restored = image_latents * latents_bn_std + latents_bn_mean
    return _unpatchify_latents(restored)
|
| 239 |
|
| 240 |
def encode_to_latents(images, texts):
|
| 241 |
transform = get_image_transform(min_size, max_size, step)
|
|
@@ -269,20 +181,19 @@ def encode_to_latents(images, texts):
|
|
| 269 |
# Кодируем батч
|
| 270 |
with torch.no_grad():
|
| 271 |
posteriors = vae.encode(batch_tensor).latent_dist.mode()
|
| 272 |
-
|
| 273 |
-
|
|
|
|
| 274 |
|
| 275 |
-
latents_np =
|
| 276 |
|
| 277 |
# Обрабатываем тексты
|
| 278 |
text_labels = [clean_label(text) for text in texts]
|
| 279 |
|
| 280 |
model_prompts, text_labels = process_labels_for_guidance(text_labels, empty_share)
|
| 281 |
-
#embeddings = encode_texts_batch(model_prompts, tokenizer, model)
|
| 282 |
|
| 283 |
return {
|
| 284 |
"vae": latents_np,
|
| 285 |
-
#"embeddings": embeddings,
|
| 286 |
"text": text_labels,
|
| 287 |
"width": widths,
|
| 288 |
"height": heights
|
|
|
|
| 18 |
from datetime import timedelta
|
| 19 |
|
| 20 |
# ---------------- 1️⃣ Настройки ----------------
|
| 21 |
+
# Precision used when loading the VAE and when exporting the latents.
dtype = torch.float16
# Prefer the GPU when one is available, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch_size = 5
# Size bounds and grid step handed to get_image_transform().
min_size, max_size, step = 640, 1280, 64
# Passed to process_labels_for_guidance() as the probability of emptying a caption.
empty_share = 0.0
# NOTE(review): the consumer of `limit` is not visible in this excerpt.
limit = 0
|
| 29 |
# Основная процедура обработки
|
|
|
|
| 43 |
def load_models(vae_path="vae"):
    """Load the VAE, move it to the configured device and switch it to eval mode.

    Args:
        vae_path: path or model id given to ``AutoencoderKL.from_pretrained``.
            Defaults to ``"vae"``, the location that was previously hard-coded.

    Returns:
        The loaded ``AutoencoderKL`` instance.
    """
    print("Загрузка моделей...")
    # Earlier checkpoint, kept for reference:
    # AsymmetricAutoencoderKL.from_pretrained("AiArtLab/sdxs-1b", subfolder="vae", torch_dtype=dtype)
    vae = AutoencoderKL.from_pretrained(vae_path, torch_dtype=dtype)
    return vae.to(device).eval()
|
| 48 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
vae = load_models()
|
| 50 |
|
| 51 |
shift_factor = getattr(vae.config, "shift_factor", 0.0)
|
|
|
|
| 56 |
if scaling_factor is None:
|
| 57 |
scaling_factor = 1.0
|
| 58 |
|
| 59 |
+
# Optional per-channel latent statistics from the VAE config.
# Both names are always bound (None when the config lacks either list)
# because encode_to_latents() tests `latents_mean is not None` unconditionally;
# leaving them undefined would raise NameError there.
mean = getattr(vae.config, "latents_mean", None)
std = getattr(vae.config, "latents_std", None)
latents_mean = None
latents_std = None
if mean is not None and std is not None:
    # Shaped (1, C, 1, 1) so they broadcast over the batch and spatial axes.
    latents_mean = torch.tensor(mean, device=device, dtype=dtype).view(1, len(mean), 1, 1)
    latents_std = torch.tensor(std, device=device, dtype=dtype).view(1, len(std), 1, 1)
|
| 64 |
|
| 65 |
# ---------------- 3️⃣ Трансформации ----------------
|
| 66 |
def get_image_transform(min_size=256, max_size=512, step=64):
|
|
|
|
| 118 |
return transform
|
| 119 |
|
| 120 |
# ---------------- 4️⃣ Функции обработки ----------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
def clean_label(label):
|
| 122 |
label = label.replace("Image 1", "").replace("Image 2", "").replace("Image 3", "").replace("Image 4", "").replace("The image depicts ","").replace("The image presents ","").replace("The image features ","").replace("The image portrays ","").replace("The image is ","").strip()
|
| 123 |
if label.startswith("."):
|
|
|
|
| 148 |
|
| 149 |
return labels_for_model, labels_for_logging
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
|
| 152 |
def encode_to_latents(images, texts):
|
| 153 |
transform = get_image_transform(min_size, max_size, step)
|
|
|
|
| 181 |
# Кодируем батч
|
| 182 |
with torch.no_grad():
|
| 183 |
posteriors = vae.encode(batch_tensor).latent_dist.mode()
|
| 184 |
+
if latents_mean is not None and latents_std is not None:
|
| 185 |
+
posteriors = (posteriors - latents_mean) / latents_std
|
| 186 |
+
posteriors = (posteriors - shift_factor) / scaling_factor
|
| 187 |
|
| 188 |
+
latents_np = posteriors.to(dtype).cpu().numpy()
|
| 189 |
|
| 190 |
# Обрабатываем тексты
|
| 191 |
text_labels = [clean_label(text) for text in texts]
|
| 192 |
|
| 193 |
model_prompts, text_labels = process_labels_for_guidance(text_labels, empty_share)
|
|
|
|
| 194 |
|
| 195 |
return {
|
| 196 |
"vae": latents_np,
|
|
|
|
| 197 |
"text": text_labels,
|
| 198 |
"width": widths,
|
| 199 |
"height": heights
|
samples/unet_320x640_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_352x640_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_384x640_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_416x640_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_448x640_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_480x640_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_512x640_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_544x640_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_576x640_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_608x640_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_640x320_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_640x352_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_640x384_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_640x416_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_640x448_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_640x480_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_640x512_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_640x544_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_640x576_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_640x608_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
samples/unet_640x640_0.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
unet/diffusion_pytorch_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 5946605448
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:12e68807a96fc1f2dd993c400888104240383fd74379a2f083952996c0d60c13
|
| 3 |
size 5946605448
|