v14

- README.md +5 -115
- pipeline_sdxs.py +8 -17
- promo.png +2 -2
- result_grid.jpg +3 -0
- test.ipynb +2 -2
README.md
CHANGED
@@ -7,12 +7,14 @@ pipeline_tag: text-to-image
 
 *XS Size, Excess Quality*
 
+![](…)
+
 At AiArtLab, we strive to create a free, compact and fast model that can be trained on consumer graphics cards.
 
 - We use U-Net for its high efficiency.
 - We have chosen Qwen0.6b, which supports 100+ languages.
 - We train a new SOTA 16ch Simple VAE, which preserves details and anatomy.
-- The model was trained
+- The model was trained for ~3 months on 4xRTX5090 on approximately 1+ million images of various resolutions and styles, including anime and realistic photos.
 
 ### Model Limitations:
 - Limited concept coverage due to the small dataset.
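Aside, not part of the commit: the three components above map onto standard `diffusers` classes, and the removed README example further down shows the repo layout (`unet`, `vae`, `scheduler` subfolders with an `fp16` variant). A minimal sketch for checking the 16ch VAE claim, assuming that layout still holds:

```python
# Sketch only: the repo id, subfolder names and fp16 variant are taken from
# the removed README example below; this commit does not guarantee them.
from diffusers import AutoencoderKL, UNet2DConditionModel

vae = AutoencoderKL.from_pretrained("AiArtLab/sdxs", subfolder="vae", variant="fp16")
unet = UNet2DConditionModel.from_pretrained("AiArtLab/sdxs", subfolder="unet", variant="fp16")

print(vae.config.latent_channels)       # 16 for the 16ch Simple VAE
print(unet.config.cross_attention_dim)  # width expected of the text embeddings
```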
@@ -21,6 +23,7 @@ At AiArtLab, we strive to create a free, compact and fast model that can be trained on consumer graphics cards.
 - **[Stan](https://t.me/Stangle)** — Key investor. Thank you for believing in us when others called it madness.
 - **Captainsaturnus**
 - **Love. Death. Transformers.**
+- **TOPAPEC**
 
 ## Datasets
 - **[CaptionEmporium](https://huggingface.co/CaptionEmporium)**
@@ -37,120 +40,7 @@ BTC: 3JHv9Hb8kEW8zMAccdgCdZGfrHeMhH1rpN
 
 [recoilme](https://t.me/recoilme)
 
-Train status, in progress:
-
-![](…)
 
 ## Example
 
-```python
-import torch
-from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel
-from transformers import AutoModel, AutoTokenizer
-from PIL import Image
-from tqdm.auto import tqdm
-import os
-
-def encode_prompt(prompt, negative_prompt, device, dtype):
-    if negative_prompt is None:
-        negative_prompt = ""
-
-    with torch.no_grad():
-        positive_inputs = tokenizer(
-            prompt,
-            return_tensors="pt",
-            padding="max_length",
-            max_length=512,
-            truncation=True,
-        ).to(device)
-        positive_embeddings = text_model.encode_texts(
-            positive_inputs.input_ids, positive_inputs.attention_mask
-        )
-        if positive_embeddings.ndim == 2:
-            positive_embeddings = positive_embeddings.unsqueeze(1)
-        positive_embeddings = positive_embeddings.to(device, dtype=dtype)
-
-        negative_inputs = tokenizer(
-            negative_prompt,
-            return_tensors="pt",
-            padding="max_length",
-            max_length=150,
-            truncation=True,
-        ).to(device)
-        negative_embeddings = text_model.encode_texts(negative_inputs.input_ids, negative_inputs.attention_mask)
-        if negative_embeddings.ndim == 2:
-            negative_embeddings = negative_embeddings.unsqueeze(1)
-        negative_embeddings = negative_embeddings.to(device, dtype=dtype)
-    return torch.cat([negative_embeddings, positive_embeddings], dim=0)
-
-def generate_latents(embeddings, height=576, width=576, num_inference_steps=50, guidance_scale=5.5):
-    with torch.no_grad():
-        device, dtype = embeddings.device, embeddings.dtype
-        half = embeddings.shape[0] // 2
-        latent_shape = (half, 16, height // 8, width // 8)
-        latents = torch.randn(latent_shape, device=device, dtype=dtype)
-        embeddings = embeddings.repeat_interleave(half, dim=0)
-
-        scheduler.set_timesteps(num_inference_steps)
-
-        for t in tqdm(scheduler.timesteps, desc="Generating"):
-            latent_model_input = torch.cat([latents] * 2)
-            latent_model_input = scheduler.scale_model_input(latent_model_input, t)
-            noise_pred = unet(latent_model_input, t, embeddings).sample
-            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-            latents = scheduler.step(noise_pred, t, latents).prev_sample
-    return latents
-
-
-def decode_latents(latents, vae, output_type="pil"):
-    latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor
-    with torch.no_grad():
-        images = vae.decode(latents).sample
-    images = (images / 2 + 0.5).clamp(0, 1)
-    images = images.cpu().permute(0, 2, 3, 1).float().numpy()
-    if output_type == "pil":
-        images = (images * 255).round().astype("uint8")
-        images = [Image.fromarray(image) for image in images]
-    return images
-
-# Example usage:
-if __name__ == "__main__":
-    device = "cuda"
-    dtype = torch.float16
-
-    prompt = "girl"
-    negative_prompt = "bad quality"
-    tokenizer = AutoTokenizer.from_pretrained("visheratin/mexma-siglip")
-    text_model = AutoModel.from_pretrained(
-        "visheratin/mexma-siglip", torch_dtype=dtype, trust_remote_code=True
-    ).to(device, dtype=dtype).eval()
-
-    embeddings = encode_prompt(prompt, negative_prompt, device, dtype)
-
-    pipeid = "AiArtLab/sdxs"
-    variant = "fp16"
-
-    unet = UNet2DConditionModel.from_pretrained(pipeid, subfolder="unet", variant=variant).to(device, dtype=dtype).eval()
-    vae = AutoencoderKL.from_pretrained(pipeid, subfolder="vae", variant=variant).to(device, dtype=dtype).eval()
-    scheduler = DDPMScheduler.from_pretrained(pipeid, subfolder="scheduler")
-
-
-    height, width = 640, 576
-    num_inference_steps = 40
-    output_folder, project_name = "samples", "sdxs"
-    latents = generate_latents(
-        embeddings=embeddings,
-        height=height,
-        width=width,
-        num_inference_steps=num_inference_steps
-    )
-
-    images = decode_latents(latents, vae)
-
-    os.makedirs(output_folder, exist_ok=True)
-    for idx, image in enumerate(images):
-        image.save(f"{output_folder}/{project_name}_{idx}.jpg")
-
-    print("Images generated and saved to:", output_folder)
-```
+![](…)
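The commit swaps this manual script for a single result image. Since the repo ships a custom `SdxsPipeline` (edited below in `pipeline_sdxs.py`), the natural replacement route is `DiffusionPipeline` with remote code; the sketch below assumes that hookup and a conventional text-to-image call signature, neither of which is confirmed by this diff:

```python
# Sketch under assumptions: the custom_pipeline loading and the __call__
# parameters are inferred from the removed example, not from this commit.
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "AiArtLab/sdxs",
    custom_pipeline="AiArtLab/sdxs",  # picks up pipeline_sdxs.py from the repo
    torch_dtype=torch.float16,
    trust_remote_code=True,
).to("cuda")

result = pipe(
    prompt="girl",
    negative_prompt="bad quality",
    height=640,
    width=576,
    num_inference_steps=40,
    guidance_scale=5.5,
)
result.images[0].save("sdxs_sample.jpg")  # assumes a standard .images output
```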
pipeline_sdxs.py
CHANGED
@@ -54,13 +54,9 @@ class SdxsPipeline(DiffusionPipeline):
             ).to(device)
 
             # Get the embeddings
-            outputs = self.text_encoder(text_inputs.input_ids, text_inputs.attention_mask)
-
-
-
-            # Add a dimension for batch processing
-            if pos_embeddings.ndim == 2:
-                pos_embeddings = pos_embeddings.unsqueeze(1)
+            outputs = self.text_encoder(text_inputs.input_ids, text_inputs.attention_mask, output_hidden_states=True)
+            pos_embeddings = outputs.hidden_states[-1].to(device, dtype=dtype)
+
         else:
             # Create empty embeddings when there is no positive prompt
             # (useful for some unconditional-generation scenarios)
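What this hunk changes, in isolation: conditioning moves from a pooled sentence vector, which needed `unsqueeze(1)` to become a length-1 sequence, to the encoder's full last hidden layer of shape `(batch, seq_len, hidden)`. A standalone sketch of the new path; the checkpoint name is a stand-in (the commit never names the model behind `self.text_encoder`):

```python
# Illustrative only: "Qwen/Qwen3-0.6B" is a stand-in text encoder, chosen
# because the README mentions Qwen0.6b; the actual checkpoint may differ.
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
encoder = AutoModel.from_pretrained("Qwen/Qwen3-0.6B").eval()
if tokenizer.pad_token is None:  # guard for tokenizers without a pad token
    tokenizer.pad_token = tokenizer.eos_token

inputs = tokenizer(
    "a photo of a cat", return_tensors="pt",
    padding="max_length", max_length=150, truncation=True,
)
with torch.no_grad():
    outputs = encoder(
        inputs.input_ids, inputs.attention_mask, output_hidden_states=True
    )

# Token-level features (batch, seq_len, hidden); no unsqueeze needed,
# unlike the old pooled (batch, hidden) embeddings.
embeddings = outputs.hidden_states[-1]
print(embeddings.shape)
```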
@@ -85,15 +81,12 @@ class SdxsPipeline(DiffusionPipeline):
 
             neg_inputs = self.tokenizer(
                 negative_prompt, return_tensors="pt", padding="max_length",
-                max_length=
+                max_length=150, truncation=True
             ).to(device)
-
-
-
-            neg_embeddings =
-
-            if neg_embeddings.ndim == 2:
-                neg_embeddings = neg_embeddings.unsqueeze(1)
+
+            # Get the embeddings
+            neg_outputs = self.text_encoder(neg_inputs.input_ids, neg_inputs.attention_mask, output_hidden_states=True)
+            neg_embeddings = neg_outputs.hidden_states[-1].to(device, dtype=dtype)
 
             # Combine for classifier-free guidance
             text_embeddings = torch.cat([neg_embeddings, pos_embeddings], dim=0)
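Context for the `torch.cat` above: stacking negative and positive embeddings along the batch dimension lets a single U-Net forward pass serve both halves of classifier-free guidance, which the removed README example combined as `uncond + scale * (cond - uncond)`. A toy sketch of the shape bookkeeping (all dimensions illustrative):

```python
# Toy shapes only; hidden size and latent resolution are made up.
import torch

neg = torch.randn(1, 150, 1024)  # hidden_states[-1] for the negative prompt
pos = torch.randn(1, 150, 1024)  # hidden_states[-1] for the positive prompt
text_embeddings = torch.cat([neg, pos], dim=0)  # (2, 150, 1024)

# Downstream (next hunk), the batched UNet output is split back apart:
noise_pred = torch.randn(2, 16, 80, 72)  # stand-in for unet(...).sample
uncond, cond = noise_pred.chunk(2)
guidance_scale = 5.5
guided = uncond + guidance_scale * (cond - uncond)
print(guided.shape)  # (1, 16, 80, 72)
```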
@@ -159,8 +152,6 @@ class SdxsPipeline(DiffusionPipeline):
                 latent_input = torch.cat([latents] * 2)
             else:
                 latent_input = latents
-
-            latent_input = self.scheduler.scale_model_input(latent_input, t)
 
             # Predict the noise
             noise_pred = self.unet(latent_input, t, text_embeddings).sample
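A note on the deleted `scale_model_input` call (an observation, not part of the commit): with `DDPMScheduler`, which the removed README example loaded, `scale_model_input` is an identity, so dropping it changes nothing there; it would matter for sigma-scaling schedulers such as `EulerDiscreteScheduler`. A quick check:

```python
# Shows why the deletion is harmless for DDPM but not for every scheduler.
import torch
from diffusers import DDPMScheduler, EulerDiscreteScheduler

sample = torch.randn(1, 16, 80, 72)

ddpm = DDPMScheduler()
ddpm.set_timesteps(10)
print(torch.equal(ddpm.scale_model_input(sample, ddpm.timesteps[0]), sample))
# True: DDPM returns the sample unchanged

euler = EulerDiscreteScheduler()
euler.set_timesteps(10)
print(torch.equal(euler.scale_model_input(sample, euler.timesteps[0]), sample))
# False: Euler rescales by 1 / sqrt(sigma^2 + 1)
```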
promo.png
CHANGED

Git LFS Details
result_grid.jpg
ADDED

Git LFS Details
test.ipynb
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:eac847382ebd4e35a4e3d1fe49fe3330d5f40f41df61c7de6e23e9b08ed2f804
+size 4953274