promo
- 1.png +0 -0
- README.md +13 -2
- Untitled.ipynb +110 -39
- low.png +0 -0
- promo.png +2 -2
- test.ipynb +0 -0
- transformer/diffusion_pytorch_model.fp16.safetensors +1 -1
- waifu.png +0 -0
1.png
CHANGED
README.md
CHANGED

@@ -16,12 +16,22 @@ waifu is a free text-to-image model that can efficiently generate images in 80 languages
 (2) [**Linear DiT**](https://github.com/NVlabs/Sana): we use a 1.6B DiT transformer with linear attention. \
 (3) [**MEXMA-SigLIP**](https://huggingface.co/visheratin/mexma-siglip): MEXMA-SigLIP combines the [MEXMA](https://huggingface.co/facebook/MEXMA) multilingual text encoder with an image encoder from the [SigLIP](https://huggingface.co/timm/ViT-SO400M-14-SigLIP-384) model, which gives us a high-performance CLIP model for 80 languages. \
 (4) Other: we use the Flow-Euler sampler, the Adafactor-Fused optimizer, and bf16 precision for training, and combine efficient caption labeling (MoonDream, CogVLM, human, GPT) with danbooru tags to accelerate convergence.
 
+## Pros
+- Small model that can be trained on a common GPU; fast training process.
+- Supports multiple languages and demonstrates good prompt adherence.
+- Uses the best available 16-channel VAE (Variational Autoencoder).
+
+## Cons
+- Trained on only 2 million images (low-budget model, approximately $3,000).
+- Training dataset consists primarily of anime and illustrations (only about 1% realistic images).
+- Low resolution only for now (512px).
 
 ## Example
 
 ```py
-# install diffusers from source
+# 1st, install the latest diffusers from source
 pip install git+https://github.com/huggingface/diffusers
 ```

@@ -60,7 +70,8 @@ for img in image:
 
 ## Donations
 
-We are a small GPU poor group of enthusiasts (current train budget ~$
+We are a small, GPU-poor group of enthusiasts (current training budget ~$3k).
+
 
 Please contact us if you can provide GPUs for training.
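For readers of this diff, the Example section touched above amounts to installing diffusers from source and iterating over the pipeline's output (the `for img in image:` loop visible in the second hunk's header). A minimal sketch of that usage pattern follows; `"REPO_ID"` is a placeholder and `DiffusionPipeline` is an assumption, since the diff shows only the install line and the loop:

```py
# Minimal sketch of the README's usage pattern; "REPO_ID" is a placeholder and
# DiffusionPipeline is an assumption -- this diff shows only the install line
# and the `for img in image:` loop.
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("REPO_ID", torch_dtype=torch.bfloat16).to("cuda")

image = pipe("a watercolor girl under cherry blossoms", num_inference_steps=20).images
for i, img in enumerate(image):
    img.save(f"waifu_{i}.png")
```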
Untitled.ipynb
CHANGED

@@ -292,7 +292,7 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": 10,
 "id": "5d85c3c7-3b5b-40c8-bc2b-6b4d344287d9",
 "metadata": {},
 "outputs": [

@@ -300,8 +300,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-" device='cuda:0', dtype=torch.bfloat16)\n"
+"torch.Size([1, 1152])\n"
 ]
 }
 ],

@@ -328,10 +327,10 @@
 "\n",
 "# Get the image embeddings\n",
 "with torch.inference_mode():\n",
-"    image_embeddings = model.encode_images(img, normalize=
+"    image_embeddings = model.encode_images(img, normalize=False)\n",
 "\n",
 "# Print the embeddings\n",
-"print(image_embeddings)"
+"print(image_embeddings.shape)"
 ]
 },
 {

@@ -617,61 +616,133 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": 6,
 "id": "6fc2606b-cf1c-488a-a8fc-d98a4abcc8c0",
 "metadata": {},
 "outputs": [
 {
-… (old output lines, truncated beyond recovery in the page extraction)
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/tmp/ipykernel_19418/3674156061.py:18: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
+" with torch.no_grad(), torch.cuda.amp.autocast():\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"torch.Size([1, 1152])\n",
+"tensor([[-0.0550, 0.1304, 0.1885, ..., -0.1434, -0.4676, 0.1461]])\n",
+"Label probabilities: [('a dog', 0.0), ('a cat', 0.0), ('a donut', 0.0), ('a beignet', 0.517)]\n"
+]
+},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/tmp/ipykernel_19418/3674156061.py:31: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
+" with torch.no_grad(), torch.cuda.amp.autocast():\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"All patches shape: torch.Size([1, 1152])\n"
+]
 }
 ],
 "source": [
-"from transformers import AutoProcessor, AutoModel\n",
-"from PIL import Image\n",
-"import requests\n",
 "import torch\n",
+"import torch.nn.functional as F\n",
+"from urllib.request import urlopen\n",
+"from PIL import Image\n",
 "from open_clip import create_model_from_pretrained, get_tokenizer  # works on open-clip-torch>=2.23.0, timm>=0.9.8\n",
 "\n",
-"#processor = AutoProcessor.from_pretrained(\"timm/ViT-SO400M-14-SigLIP-384\")\n",
-"model,
+"model, preprocess = create_model_from_pretrained('hf-hub:timm/ViT-SO400M-14-SigLIP-384')\n",
+"tokenizer = get_tokenizer('hf-hub:timm/ViT-SO400M-14-SigLIP-384')\n",
 "\n",
-"image = Image.open(requests.get(url, stream=True).raw)\n",
+"image = Image.open(urlopen(\n",
+"    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'\n",
+"))\n",
+"image = preprocess(image).unsqueeze(0)\n",
 "\n",
+"labels_list = [\"a dog\", \"a cat\", \"a donut\", \"a beignet\"]\n",
+"text = tokenizer(labels_list, context_length=model.context_length)\n",
 "\n",
-"with torch.no_grad():\n",
-"    outputs = model.get_image_features(**inputs)\n",
+"with torch.no_grad(), torch.cuda.amp.autocast():\n",
+"    image_features = model.encode_image(image)\n",
+"    print(image_features.shape)\n",
+"    print(image_features)\n",
+"    text_features = model.encode_text(text)\n",
+"    image_features = F.normalize(image_features, dim=-1)\n",
+"    text_features = F.normalize(text_features, dim=-1)\n",
 "\n",
+"    text_probs = torch.sigmoid(image_features @ text_features.T * model.logit_scale.exp() + model.logit_bias)\n",
+"\n",
+"zipped_list = list(zip(labels_list, [round(p.item(), 3) for p in text_probs[0]]))\n",
+"print(\"Label probabilities: \", zipped_list)\n",
+"\n",
+"with torch.no_grad(), torch.cuda.amp.autocast():\n",
+"    # Get the hidden states of all patches\n",
+"    outputs = model.visual(image)  # [batch_size, num_patches + 1, hidden_dim]\n",
+"    print(\"All patches shape:\", outputs.shape)  # e.g. [1, 256, 1152]\n",
+"    #all_patch_embeddings = outputs[:, 1:, :]  # drop the [CLS] token\n",
+"    #print(\"All patches shape:\", all_patch_embeddings.shape)  # e.g. [1, 256, 1152]\n"
 ]
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": 7,
 "id": "e7b5d910-de0e-4f41-8d4e-7e4501aa33f4",
 "metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/tmp/ipykernel_19418/2526917774.py:1: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
+" with torch.no_grad(), torch.cuda.amp.autocast():\n"
+]
+},
+{
+"ename": "AttributeError",
+"evalue": "'TimmModel' object has no attribute 'patch_embed'",
+"output_type": "error",
+"traceback": [
+"AttributeError: 'TimmModel' object has no attribute 'patch_embed' (raised at x = model.visual.patch_embed(image); ANSI-escaped traceback omitted)"
+]
+}
+],
+"source": [
+"with torch.no_grad(), torch.cuda.amp.autocast():\n",
+"    # Extract patch and positional embeddings\n",
+"    x = model.visual.patch_embed(image)  # [1, num_patches, 1152]\n",
+"    x = model.visual.pos_drop(x + model.visual.pos_embed)\n",
+"\n",
+"    # Pass through the transformer blocks\n",
+"    for blk in model.visual.blocks:\n",
+"        x = blk(x)\n",
+"\n",
+"    # Apply LayerNorm (if present)\n",
+"    if hasattr(model.visual, \"norm\"):\n",
+"        x = model.visual.norm(x)\n",
+"\n",
+"    # x now contains all the patch tokens\n",
+"    print(\"All patches shape:\", x.shape)\n",
+"    # Example output: torch.Size([1, 756, 1152])"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "29ecd610-7121-4c39-80cf-5021b80f6431",
+"metadata": {},
 "outputs": [],
 "source": []
 }
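The AttributeError in the last executed cell comes from treating `model.visual` as a raw timm ViT: in open_clip, this SigLIP vision tower is a `TimmModel` wrapper, and the underlying timm `VisionTransformer` (which owns `patch_embed`, `blocks`, `norm`) sits at `model.visual.trunk`. A hedged sketch of the per-patch extraction the cell was attempting; the token count is an expectation for a 384px, patch-14 model, not an output taken from this notebook:

```py
# Sketch of a fix for the AttributeError above: go through the timm trunk
# inside open_clip's TimmModel wrapper instead of model.visual directly.
# Shapes in comments are expectations, not outputs from this notebook.
import torch
from open_clip import create_model_from_pretrained

model, preprocess = create_model_from_pretrained('hf-hub:timm/ViT-SO400M-14-SigLIP-384')
model.eval()

image = torch.randn(1, 3, 384, 384)  # stand-in for a preprocessed image batch

with torch.no_grad():
    trunk = model.visual.trunk              # the underlying timm VisionTransformer
    tokens = trunk.forward_features(image)  # unpooled per-patch tokens
    print("All patches shape:", tokens.shape)  # expected [1, 729, 1152]: 27x27 patches, no CLS token in SigLIP
```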
low.png
ADDED
promo.png
CHANGED
Git LFS Details
test.ipynb
CHANGED
The diff for this file is too large to render. See the raw diff.
transformer/diffusion_pytorch_model.fp16.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:58fcdcbbb4ebdb16298ce420ee5277b89d52be4d0432ad93eea8ddbce4b3cf86
 size 3203093344
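The weights file itself lives in Git LFS, so this diff only changes the pointer; the `oid` is simply the SHA-256 of the actual blob. A downloaded copy can therefore be checked against the new pointer (the local path is assumed):

```py
# Verify a downloaded blob against the LFS pointer above (local path assumed).
import hashlib

h = hashlib.sha256()
with open("diffusion_pytorch_model.fp16.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
        h.update(chunk)

print(h.hexdigest() == "58fcdcbbb4ebdb16298ce420ee5277b89d52be4d0432ad93eea8ddbce4b3cf86")
```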
waifu.png
CHANGED