recoilme committed
Commit d7be4e1 · 1 Parent(s): 76cce30
Files changed (5)
  1. README.md +5 -115
  2. pipeline_sdxs.py +8 -17
  3. promo.png +2 -2
  4. result_grid.jpg +3 -0
  5. test.ipynb +2 -2
README.md CHANGED
@@ -7,12 +7,14 @@ pipeline_tag: text-to-image
 
 *XS Size, Excess Quality*
 
+![promo](promo.png)
+
 At AiArtLab, we strive to create a free, compact, and fast model that can be trained on consumer graphics cards.
 
 - We use U-Net for its high efficiency.
 - We chose Qwen0.6b, which supports 100+ languages.
 - We trained a new SOTA 16-channel Simple VAE that preserves details and anatomy.
-- The model was trained (~2 months on 4xRTX5090) on over 1 million images of various resolutions and styles, including anime and realistic photos.
+- The model was trained for ~3 months on 4xRTX5090 on over 1 million images of various resolutions and styles, including anime and realistic photos.
 
 ### Model Limitations:
 - Limited concept coverage due to the small dataset.
@@ -21,6 +23,7 @@ At AiArtLab, we strive to create a free, compact and fast model that can be trai
 - **[Stan](https://t.me/Stangle)** — Key investor. Thank you for believing in us when others called it madness.
 - **Captainsaturnus**
 - **Love. Death. Transformers.**
+- **TOPAPEC**
 
 ## Datasets
 - **[CaptionEmporium](https://huggingface.co/CaptionEmporium)**
@@ -37,120 +40,7 @@ BTC: 3JHv9Hb8kEW8zMAccdgCdZGfrHeMhH1rpN
 
 [recoilme](https://t.me/recoilme)
 
-Train status, in progress:
-
-![result](result_grid.jpg)
 
 ## Example
 
-```python
-import torch
-from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel
-from transformers import AutoModel, AutoTokenizer
-from PIL import Image
-from tqdm.auto import tqdm
-import os
-
-def encode_prompt(prompt, negative_prompt, device, dtype):
-    # Encode both prompts and stack them [negative, positive] along the
-    # batch dimension for classifier-free guidance.
-    if negative_prompt is None:
-        negative_prompt = ""
-
-    with torch.no_grad():
-        positive_inputs = tokenizer(
-            prompt,
-            return_tensors="pt",
-            padding="max_length",
-            max_length=512,
-            truncation=True,
-        ).to(device)
-        positive_embeddings = text_model.encode_texts(
-            positive_inputs.input_ids, positive_inputs.attention_mask
-        )
-        if positive_embeddings.ndim == 2:
-            positive_embeddings = positive_embeddings.unsqueeze(1)
-        positive_embeddings = positive_embeddings.to(device, dtype=dtype)
-
-        negative_inputs = tokenizer(
-            negative_prompt,
-            return_tensors="pt",
-            padding="max_length",
-            max_length=150,
-            truncation=True,
-        ).to(device)
-        negative_embeddings = text_model.encode_texts(
-            negative_inputs.input_ids, negative_inputs.attention_mask
-        )
-        if negative_embeddings.ndim == 2:
-            negative_embeddings = negative_embeddings.unsqueeze(1)
-        negative_embeddings = negative_embeddings.to(device, dtype=dtype)
-    return torch.cat([negative_embeddings, positive_embeddings], dim=0)
-
-def generate_latents(embeddings, height=576, width=576, num_inference_steps=50, guidance_scale=5.5):
-    # Standard classifier-free-guidance denoising loop over the scheduler's timesteps.
-    with torch.no_grad():
-        device, dtype = embeddings.device, embeddings.dtype
-        half = embeddings.shape[0] // 2
-        latent_shape = (half, 16, height // 8, width // 8)
-        latents = torch.randn(latent_shape, device=device, dtype=dtype)
-        embeddings = embeddings.repeat_interleave(half, dim=0)
-
-        scheduler.set_timesteps(num_inference_steps)
-
-        for t in tqdm(scheduler.timesteps, desc="Generating"):
-            latent_model_input = torch.cat([latents] * 2)
-            latent_model_input = scheduler.scale_model_input(latent_model_input, t)
-            noise_pred = unet(latent_model_input, t, embeddings).sample
-            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-            latents = scheduler.step(noise_pred, t, latents).prev_sample
-    return latents
-
-def decode_latents(latents, vae, output_type="pil"):
-    # Undo the VAE latent scaling, decode to pixel space, and convert to PIL.
-    latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor
-    with torch.no_grad():
-        images = vae.decode(latents).sample
-        images = (images / 2 + 0.5).clamp(0, 1)
-        images = images.cpu().permute(0, 2, 3, 1).float().numpy()
-        if output_type == "pil":
-            images = (images * 255).round().astype("uint8")
-            images = [Image.fromarray(image) for image in images]
-    return images
-
-# Example usage:
-if __name__ == "__main__":
-    device = "cuda"
-    dtype = torch.float16
-
-    prompt = "girl"
-    negative_prompt = "bad quality"
-    tokenizer = AutoTokenizer.from_pretrained("visheratin/mexma-siglip")
-    text_model = AutoModel.from_pretrained(
-        "visheratin/mexma-siglip", torch_dtype=dtype, trust_remote_code=True
-    ).to(device, dtype=dtype).eval()
-
-    embeddings = encode_prompt(prompt, negative_prompt, device, dtype)
-
-    pipeid = "AiArtLab/sdxs"
-    variant = "fp16"
-
-    unet = UNet2DConditionModel.from_pretrained(pipeid, subfolder="unet", variant=variant).to(device, dtype=dtype).eval()
-    vae = AutoencoderKL.from_pretrained(pipeid, subfolder="vae", variant=variant).to(device, dtype=dtype).eval()
-    scheduler = DDPMScheduler.from_pretrained(pipeid, subfolder="scheduler")
-
-    height, width = 640, 576
-    num_inference_steps = 40
-    output_folder, project_name = "samples", "sdxs"
-    latents = generate_latents(
-        embeddings=embeddings,
-        height=height,
-        width=width,
-        num_inference_steps=num_inference_steps,
-    )
-
-    images = decode_latents(latents, vae)
-
-    os.makedirs(output_folder, exist_ok=True)
-    for idx, image in enumerate(images):
-        image.save(f"{output_folder}/{project_name}_{idx}.jpg")
-
-    print("Images generated and saved to:", output_folder)
-```
 
+![result_grid](result_grid.jpg)
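With the inline example replaced by an image, the quickest way to run the model after this commit is through the repo's own `SdxsPipeline` (diffed below). A minimal sketch, assuming `pipeline_sdxs.py` is wired up as the repo's custom pipeline and that `SdxsPipeline.__call__` follows the usual diffusers convention of returning an object with an `images` list (neither is confirmed by this diff):

```python
import torch
from diffusers import DiffusionPipeline

# Sketch under assumptions: the repo registers pipeline_sdxs.py as its custom
# pipeline, and SdxsPipeline.__call__ returns an object with an `images` list.
pipe = DiffusionPipeline.from_pretrained(
    "AiArtLab/sdxs",
    trust_remote_code=True,  # needed to load the repo-defined SdxsPipeline
    torch_dtype=torch.float16,
).to("cuda")

image = pipe(prompt="girl", negative_prompt="bad quality").images[0]
image.save("sdxs_sample.jpg")
```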
pipeline_sdxs.py CHANGED
@@ -54,13 +54,9 @@ class SdxsPipeline(DiffusionPipeline):
             ).to(device)
 
             # Get the embeddings
-            outputs = self.text_encoder(text_inputs.input_ids, text_inputs.attention_mask)
-            last_hidden_state = outputs.last_hidden_state.to(device, dtype=dtype)
-            pos_embeddings = self.text_projector(last_hidden_state[:, 0])
-
-            # Add a dimension for batch processing
-            if pos_embeddings.ndim == 2:
-                pos_embeddings = pos_embeddings.unsqueeze(1)
+            outputs = self.text_encoder(text_inputs.input_ids, text_inputs.attention_mask, output_hidden_states=True)
+            pos_embeddings = outputs.hidden_states[-1].to(device, dtype=dtype)
+
         else:
             # Create empty embeddings when there is no positive prompt
             # (useful for some unconditional generation scenarios)
@@ -85,15 +81,12 @@ class SdxsPipeline(DiffusionPipeline):
 
             neg_inputs = self.tokenizer(
                 negative_prompt, return_tensors="pt", padding="max_length",
-                max_length=512, truncation=True
+                max_length=150, truncation=True
             ).to(device)
-
-            neg_outputs = self.text_encoder(neg_inputs.input_ids, neg_inputs.attention_mask)
-            neg_last_hidden_state = neg_outputs.last_hidden_state.to(device, dtype=dtype)
-            neg_embeddings = self.text_projector(neg_last_hidden_state[:, 0])
-
-            if neg_embeddings.ndim == 2:
-                neg_embeddings = neg_embeddings.unsqueeze(1)
+
+            # Get the embeddings
+            neg_outputs = self.text_encoder(neg_inputs.input_ids, neg_inputs.attention_mask, output_hidden_states=True)
+            neg_embeddings = neg_outputs.hidden_states[-1].to(device, dtype=dtype)
 
             # Concatenate for classifier-free guidance
             text_embeddings = torch.cat([neg_embeddings, pos_embeddings], dim=0)
@@ -159,8 +152,6 @@ class SdxsPipeline(DiffusionPipeline):
                 latent_input = torch.cat([latents] * 2)
             else:
                 latent_input = latents
-
-            latent_input = self.scheduler.scale_model_input(latent_input, t)
 
             # Noise prediction
             noise_pred = self.unet(latent_input, t, text_embeddings).sample
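The first two hunks swap a pooled, projected CLS embedding for the encoder's full final hidden-state sequence, so the U-Net now cross-attends over every prompt token instead of a single vector. A standalone sketch of the shape difference; `encoder_id` is a hypothetical placeholder, not the encoder `SdxsPipeline` actually loads:

```python
import torch
from transformers import AutoModel, AutoTokenizer

# encoder_id is illustrative only; SdxsPipeline loads its own text encoder.
encoder_id = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(encoder_id)
text_encoder = AutoModel.from_pretrained(encoder_id).eval()

inputs = tokenizer(
    "a cat in the snow", return_tensors="pt",
    padding="max_length", max_length=32, truncation=True,
)

with torch.no_grad():
    out = text_encoder(
        inputs.input_ids, inputs.attention_mask, output_hidden_states=True
    )

# Old path (removed): pool the first token and unsqueeze to [batch, 1, dim],
# one conditioning vector per prompt (the learned projector is omitted here).
pooled = out.last_hidden_state[:, 0].unsqueeze(1)

# New path: keep the full sequence, [batch, seq_len, dim], so cross-attention
# sees every token.
sequence = out.hidden_states[-1]

print(pooled.shape, sequence.shape)  # torch.Size([1, 1, 384]) torch.Size([1, 32, 384])
```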
 
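The third hunk also drops the `scheduler.scale_model_input` call. For the `DDPMScheduler` this repo uses (the old README example loaded it from the `scheduler` subfolder), that method is an identity, so removing it is a no-op; it would not be safe for sigma-scaled schedulers such as `EulerDiscreteScheduler`. A quick sanity check:

```python
import torch
from diffusers import DDPMScheduler

# DDPMScheduler.scale_model_input returns its input unchanged, so the call
# removed in this commit was a no-op for this scheduler. Sigma-based
# schedulers (e.g. EulerDiscreteScheduler) do rescale the sample, so this
# simplification is scheduler-specific.
scheduler = DDPMScheduler()
scheduler.set_timesteps(40)

sample = torch.randn(1, 16, 80, 72)  # latent-shaped dummy tensor
t = scheduler.timesteps[0]
assert torch.equal(scheduler.scale_model_input(sample, t), sample)
```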
promo.png CHANGED
Git LFS Details (old)

  • SHA256: a9dc11dcb7fdc91f3e93f411d675b42d4475e5fba35c11083cab3bd5abb5ef90
  • Pointer size: 132 Bytes
  • Size of remote file: 2.58 MB

Git LFS Details (new)

  • SHA256: 916ed67a3afa657c59315ec8bb93363cd85338a1b948f631e9b7f54e45d1d1fb
  • Pointer size: 131 Bytes
  • Size of remote file: 805 kB
result_grid.jpg ADDED
Git LFS Details

  • SHA256: 0a9e0e0663bd39c12f0c5650ffc0ecd407950d379157eb10c3a8f5c7d6271dfb
  • Pointer size: 132 Bytes
  • Size of remote file: 6.64 MB
test.ipynb CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9910e312c0b51bba9e406e9d0e70f0d8f11c1568b6a5648c779d9081f83bbec0
-size 5926474
+oid sha256:eac847382ebd4e35a4e3d1fe49fe3330d5f40f41df61c7de6e23e9b08ed2f804
+size 4953274