Upload folder using huggingface_hub
- .gitattributes +2 -35
- .gitignore +197 -0
- README.md +65 -3
- ckpts/lots/lots.bin +3 -0
- ckpts/lots/pair_former_config.json +1 -0
- pyproject.toml +17 -0
- requirements.txt +13 -0
- run_inference.sh +11 -0
- run_train.sh +17 -0
- scripts/lots/convert_lots_weights.py +30 -0
- scripts/lots/inference_lots.py +140 -0
- scripts/lots/train_lots.py +536 -0
- scripts/sketchy/sketchy.ipynb +230 -0
- setup.py +3 -0
- src/lots/__init__.py +0 -0
- src/lots/cross_attn.py +408 -0
- src/lots/lots_pipeline.py +227 -0
- src/lots/pair_former.py +226 -0
- src/lots/projectors.py +49 -0
- src/sketchy/__init__.py +0 -0
- src/sketchy/sketchy_dataset.py +226 -0
- src/utils/__init__.py +0 -0
- src/utils/dinov2_utils.py +55 -0
- src/utils/script_utils.py +100 -0
- static/LOTS.png +3 -0
.gitattributes
CHANGED
@@ -1,35 +1,2 @@
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+ckpts/lots/lots.bin filter=lfs diff=lfs merge=lfs -text
+static/LOTS.png filter=lfs diff=lfs merge=lfs -text
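The two LFS-tracked files above (the LOTS checkpoint and the teaser image) can also be fetched individually from the Hub. A minimal sketch, assuming `huggingface_hub` is installed; the repository id is left as a placeholder:

```python
# Minimal sketch: download only the LFS-tracked checkpoint declared in .gitattributes above.
# The repo_id below is a placeholder for this model repository on the Hugging Face Hub.
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(
    repo_id="<owner>/<this-repo>",   # placeholder, substitute the actual repo id
    filename="ckpts/lots/lots.bin",  # LFS-tracked file added in this commit
)
print(ckpt_path)  # local path inside the Hugging Face cache
```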
.gitignore
ADDED
@@ -0,0 +1,197 @@
# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode
# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml

# ruff
.ruff_cache/

# LSP config files
pyrightconfig.json

### VisualStudioCode ###
.vscode/
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets

# Local History for Visual Studio Code
.history/

# Built Visual Studio Code Extensions
*.vsix

### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide

### Custom ###

outputs
README.md
CHANGED
@@ -1,3 +1,65 @@
+# LOTS of Fashion! Multi-Conditioning for Image Generation via Sketch-Text Pairing #
+
+[Code](https://github.com/intelligolabs/lots)
+[Project Page](https://intelligolabs.github.io/lots)
+
+[Sketchy Dataset](https://huggingface.co/datasets/federicogirella/sketchy)
+
+![LOTS teaser](static/LOTS.png)
+
+This is the official implementation of the **LOTS** adapter from the paper *"LOTS of Fashion! Multi-Conditioning for Image Generation via Sketch-Text Pairing"*, presented as an **Oral at ICCV 2025** in Honolulu.
+
+To access the **Sketchy** dataset, refer to [the HuggingFace repository](https://huggingface.co/datasets/federicogirella/sketchy).
+
+## Road Map ##
+
+- [x] Code release
+- [x] Weights release
+- [ ] Platform release
+
+## Repository Structure ##
+1. `ckpts` folder
+   * Contains the pre-trained weights of the LOTS adapter.
+
+2. `scripts` folder
+   * Contains all the scripts for training and inference with LOTS on Sketchy.
+
+3. `src` folder
+   * Contains all the source code for the classes, models, and dataloaders used in the scripts.
+
+## Installation ##
+
+We advise creating a Conda environment as follows:
+* `conda create -n lots python=3.12`
+* `conda activate lots`
+* `pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121`
+* `pip install -r requirements.txt`
+* `pip install -e .`
+
+Unzip the pre-trained weights and config:
+```
+cd ckpts
+unzip lots.zip
+cd ..
+```
+
+
+## **Training** ##
+We provide the script to train LOTS on our Sketchy dataset in `scripts/lots/train_lots.py`.
+For an example of usage, check `run_train.sh`, which contains the default parameters used in our experiments.
+
+## **Inference** ##
+You can test our pre-trained model with the inference script in `scripts/lots/inference_lots.py`.
+For an example, check `run_inference.sh`.
+This script generates an image for each item in the test split of Sketchy and saves the results in a structured output folder, with each item identified by its unique ID.
+
+## Citation
+If you find our work useful, please cite it:
+```
+@inproceedings{girella2025lots,
+  author    = {Girella, Federico and Talon, Davide and Lie, Ziyue and Ruan, Zanxi and Wang, Yiming and Cristani, Marco},
+  title     = {LOTS of Fashion! Multi-Conditioning for Image Generation via Sketch-Text Pairing},
+  booktitle = {Proceedings of the International Conference on Computer Vision},
+  year      = {2025},
+}
+```
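A minimal sanity-check sketch for the installation steps above. The module paths come from the `src/` layout added in this commit, and the printed torch version is only indicative (2.5.1 is the pin in the README install command):

```python
# Quick import check after `pip install -e .`; module paths below come from src/ in this commit.
import torch
from lots.lots_pipeline import LOTSPipeline          # noqa: F401 - LOTS adapter pipeline
from sketchy.sketchy_dataset import SketchyDataset   # noqa: F401 - Sketchy dataloader

print(torch.__version__)          # expected 2.5.1 per the README install command
print(torch.cuda.is_available())  # True when the cu121 wheels match the local driver
```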
ckpts/lots/lots.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f6db0781dd119e34c9b8fbad48572eef423347be9a8a57b7297d173be899e075
size 2105167979
ckpts/lots/pair_former_config.json
ADDED
@@ -0,0 +1 @@
{"in_channels": 2048, "fusion_strategy": "deferred", "num_layers": 2, "num_attention_heads": 8, "inner_dim": 2048, "dropout": 0.0, "norm_num_groups": 32, "activation_fn": "geglu", "masking_strategy": "compression", "num_cls_tokens": 32}
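A small sketch of how this config can be used to rebuild the module. It assumes the `PairFormer` constructor accepts exactly these keys, which matches the keyword arguments it is called with in `scripts/lots/train_lots.py` below:

```python
# Sketch: rebuild the PairFormer from the JSON config stored next to the checkpoint.
# Assumes the keys map one-to-one onto the constructor arguments used in scripts/lots/train_lots.py.
import json
from lots.pair_former import PairFormer

with open("ckpts/lots/pair_former_config.json") as f:
    cfg = json.load(f)

pair_former = PairFormer(**cfg)  # in_channels=2048, num_cls_tokens=32, fusion_strategy="deferred", ...
```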
pyproject.toml
ADDED
@@ -0,0 +1,17 @@
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[project]
name = "lots"
authors = [
    {name = "Federico Girella", email = "federico.girella@univr.it"},
]
description = "Package for LOTS experiments"
readme = "README.md"
version = "1.0.0"
requires-python = ">=3.12"
dependencies = []

[tool.setuptools.packages.find]
where = ["src"]
requirements.txt
ADDED
@@ -0,0 +1,13 @@
accelerate==1.3.0
diffusers==0.33.1
fashionpedia==1.1
huggingface-hub==0.28.1
matplotlib==3.10.0
notebook==7.3.2
numpy==2.1.2
opencv-python==4.11.0.86
pillow==11.0.0
pycocotools==2.0.8
tokenizers==0.21.0
tqdm==4.67.1
transformers==4.48.3
run_inference.sh
ADDED
@@ -0,0 +1,11 @@
RUN_NAME="test_run"

python scripts/lots/inference_lots.py \
    --base_model_path="stabilityai/stable-diffusion-xl-base-1.0" \
    --device="cuda" \
    --seed=21 \
    --dinov2_model="vits14" \
    --ckpt_path="ckpts/lots/lots.bin" \
    --dataset_root="data/sketchy" \
    --out_dir="outputs/inference/$RUN_NAME" \
    --resolution=512
run_train.sh
ADDED
@@ -0,0 +1,17 @@
RUN_NAME="test_run"

accelerate launch --mixed_precision "bf16" --num_processes 4 --multi_gpu --gpu_ids='all' \
    scripts/lots/train_lots.py \
    --pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" \
    --dataset_root="data/sketchy" \
    --output_dir="outputs/checkpoints/$RUN_NAME" \
    --resolution=512 \
    --learning_rate=1e-5 \
    --num_train_epochs=80 \
    --dataloader_num_workers=8 \
    --save_steps=10000 \
    --train_batch_size=8 \
    --dinov2_model="vits14" \
    --num_cls_tokens=32 \
    --fusion_strategy="deferred" \
    --gradient_accumulation_steps=8
scripts/lots/convert_lots_weights.py
ADDED
@@ -0,0 +1,30 @@
import torch
import os

def convert_lots_weights(ckpt):
    sd = torch.load(ckpt, map_location="cpu")
    image_proj_sd = {}
    cross_attn = {}
    text_proj_sd = {}
    pair_former_sd = {}
    for k in sd:
        if k.startswith("unet"):
            pass
        elif k.startswith("image_proj_model"):
            image_proj_sd[k.replace("image_proj_model.", "")] = sd[k]
        elif k.startswith("text_proj_model"):
            text_proj_sd[k.replace("text_proj_model.", "")] = sd[k]
        elif k.startswith("cross_attn_modules"):
            cross_attn[k.replace("cross_attn_modules.", "")] = sd[k]
        elif k.startswith("pair_former_model"):
            pair_former_sd[k.replace("pair_former_model.", "")] = sd[k]
    assert len(text_proj_sd) > 0, "text projection weights are empty"
    assert len(cross_attn) > 0, "cross-attn modules weights are empty"
    assert len(image_proj_sd) > 0, "image projection weights are empty"
    assert len(pair_former_sd) > 0, "pair former weights are empty"
    return {"image_proj": image_proj_sd, "cross_attn": cross_attn, "text_proj": text_proj_sd, "pair_former": pair_former_sd}

if __name__ == "__main__":
    ckpt = "/path/to/training/pytorch_model.bin"
    state_dict = convert_lots_weights(ckpt)
    torch.save(state_dict, ckpt.replace(os.path.basename(ckpt), "lots.bin"))
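An illustrative, self-contained sketch of what the converter above does: it splits a flat training state dict into the four component dicts stored in `lots.bin`. The prefixes are the ones checked in the script; the tensors and file names here are dummies:

```python
# Illustrative only: build a tiny fake training checkpoint with the prefixes the converter
# looks for, then verify it is split into the four component dicts saved as lots.bin.
import torch
from convert_lots_weights import convert_lots_weights  # same import style as inference_lots.py

dummy = {
    "unet.down_blocks.0.weight": torch.zeros(1),              # skipped (the UNet stays frozen)
    "image_proj_model.proj.weight": torch.zeros(2, 2),
    "text_proj_model.proj.weight": torch.zeros(2, 2),
    "cross_attn_modules.0.to_k_lots.weight": torch.zeros(2, 2),
    "pair_former_model.layers.0.weight": torch.zeros(2, 2),
}
torch.save(dummy, "dummy_pytorch_model.bin")

converted = convert_lots_weights("dummy_pytorch_model.bin")
print(sorted(converted.keys()))  # ['cross_attn', 'image_proj', 'pair_former', 'text_proj']
```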
scripts/lots/inference_lots.py
ADDED
@@ -0,0 +1,140 @@
import torch
from diffusers import StableDiffusionXLPipeline
import os
from lots.lots_pipeline import LOTSPipeline
from utils.dinov2_utils import get_dinov2_model
from tqdm import tqdm
from utils.script_utils import set_seed
import argparse
import os
from convert_lots_weights import convert_lots_weights
from sketchy.sketchy_dataset import SketchyDataset

def get_args():
    parser = argparse.ArgumentParser(description="Inference script for CLIPAdapter")
    parser.add_argument("--base_model_path", type=str, default="stabilityai/stable-diffusion-xl-base-1.0", help="Path to the base model")
    parser.add_argument("--device", type=str, default="cuda", help="Device to run the model on")
    parser.add_argument("--seed", type=int, default=21, help="Seed for reproducibility")
    parser.add_argument("--dinov2_model", type=str, default="vits14",
                        choices=["vits14", "vitb14", "vitl14", "vitg14"],
                        help="DINOv2 model type to use")
    parser.add_argument("--ckpt_path", type=str, required=True, help="Path to the checkpoint.bin")
    parser.add_argument("--dataset_root", type=str, required=True, help="Path to the validation dataset root")
    parser.add_argument("--out_dir", type=str, required=True, help="Path to the output directory")
    parser.add_argument("--with_shoes", action="store_true", help="Keep shoes in the dataset")
    parser.add_argument("--resolution", type=int, default=512, help="Resolution for the generated images")
    args = parser.parse_args()
    return args

if __name__ == "__main__":
    args = get_args()
    base_model_path = args.base_model_path
    device = args.device
    SEED = args.seed
    ckpt_path = args.ckpt_path
    val_dataset_root = args.dataset_root
    out_dir = args.out_dir
    with_shoes = args.with_shoes



    image_encoder = get_dinov2_model(args.dinov2_model)


    # load SDXL pipeline
    pipe = StableDiffusionXLPipeline.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,
        add_watermarker=False,
    )

    # check that the bin exists and is properly converted
    if not os.path.exists(ckpt_path):
        print('Converting weights')
        state_dict = convert_lots_weights(ckpt_path.replace(os.path.basename(ckpt_path), "pytorch_model.bin"))
        torch.save(state_dict, ckpt_path)

    lots_pipe = LOTSPipeline(
        pipe,
        image_encoder=image_encoder,
        model_type=args.dinov2_model,
        lots_ckpt=ckpt_path,
        device=device,
        num_tokens=32,
    )


    set_seed(SEED)
    os.makedirs(out_dir, exist_ok=True)

    os.makedirs(out_dir, exist_ok=True)
    img_dir = os.path.join(out_dir, "image_dir")
    os.makedirs(img_dir, exist_ok=True)
    global_sketch_dir = os.path.join(out_dir, "global_sketch_dir")
    os.makedirs(global_sketch_dir, exist_ok=True)
    local_sketches_dir = os.path.join(out_dir, "local_sketches_dir")
    os.makedirs(local_sketches_dir, exist_ok=True)
    global_descriptions_dir = os.path.join(out_dir, "global_description_dir")
    os.makedirs(global_descriptions_dir, exist_ok=True)
    local_descriptions_dir = os.path.join(out_dir, "local_descriptions_dir")
    os.makedirs(local_descriptions_dir, exist_ok=True)

    run_name = ckpt_path.split("/")[-3] + "-" + ckpt_path.split("/")[-2].split("-")[-1]

    val_dataset = SketchyDataset(
        dataset_root=val_dataset_root,
        split="test",
        load_img = True,
        load_global_sketch=True,
        load_local_sketch=True,
        compose_global_sketch=True,
        img_size=args.resolution,
        img_transforms=None,
        global_sketch_transforms=None,
        local_sketch_transforms=None,
        text_tokenizers=None,
        with_shoes=with_shoes,
        concat_locals=True,
    )

    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        num_workers=0,
        drop_last=False,
        shuffle=False,
        collate_fn=val_dataset.collate_fn,
    )

    prompt = "High quality photo of a model, artistic, 4k"
    with open(os.path.join(out_dir, "prompt.txt"), "w") as f:
        f.write(prompt)

    for idx, batch in tqdm(enumerate(val_dataloader), desc="Generating images", total=len(val_dataloader)):
        image = batch["image"][0]
        # apply transformations
        global_sketch = batch["global_sketch"][0]
        ann_ids = batch["local_descriptions_ann_ids"][0]
        input_sketches = batch["local_sketches"][0]
        # batch the sketches
        global_desc = batch["global_description"][0]
        local_descriptions = batch["local_descriptions"][0]
        image_id = batch["image_id"][0]

        gen_images = lots_pipe.generate(prompt=prompt, pil_images=input_sketches, descriptions=local_descriptions, num_samples=1, num_inference_steps=30, resolution=args.resolution, scale=0.8)
        gen_image = gen_images[0]

        # save data
        with open(os.path.join(global_descriptions_dir, f"{image_id}.txt"), "w") as f:
            f.write(global_desc)
        # save the partial descriptions
        with open(os.path.join(local_descriptions_dir, f"{image_id}.txt"), "w") as f:
            f.write('\n'.join(local_descriptions))
        # save the sketch
        os.makedirs(os.path.join(local_sketches_dir, f"{image_id}"), exist_ok=True)
        for s, sid in zip(input_sketches, ann_ids):
            s.save(os.path.join(local_sketches_dir, f"{image_id}", f"{sid}.png"))
        global_sketch.save(os.path.join(global_sketch_dir, f"{image_id}.png"))
        output_path = os.path.join(img_dir, f"{image_id}.png")
        gen_image.save(output_path)
    print(f"DONE")
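A single-item generation sketch, assuming `lots_pipe` has been built exactly as in the script above. The keyword arguments mirror the `lots_pipe.generate(...)` call in that loop; the sketch path and description below are placeholders, not assets from this repository:

```python
# Sketch of one generation call, mirroring the lots_pipe.generate(...) usage above.
# The input file and description are placeholders.
from PIL import Image

local_sketches = [Image.open("sleeve_sketch.png").convert("RGB")]     # one local sketch
local_descriptions = ["a long puffed sleeve with floral embroidery"]  # its paired local description

images = lots_pipe.generate(
    prompt="High quality photo of a model, artistic, 4k",  # same global prompt as the script
    pil_images=local_sketches,
    descriptions=local_descriptions,
    num_samples=1,
    num_inference_steps=30,
    resolution=512,
    scale=0.8,
)
images[0].save("generated.png")
```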
scripts/lots/train_lots.py
ADDED
|
@@ -0,0 +1,536 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## partially adapted from https://github.dev/tencent-ailab/IP-Adapter/tree/main
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import random
|
| 5 |
+
import argparse
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import itertools
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
import torch.nn.functional as F
|
| 11 |
+
from torchvision import transforms
|
| 12 |
+
from transformers import AutoImageProcessor
|
| 13 |
+
|
| 14 |
+
from accelerate import Accelerator
|
| 15 |
+
from accelerate.utils import ProjectConfiguration
|
| 16 |
+
from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel
|
| 17 |
+
from transformers import CLIPTextModel, CLIPTokenizer, CLIPTextModelWithProjection
|
| 18 |
+
import os
|
| 19 |
+
|
| 20 |
+
from utils.dinov2_utils import get_dinov2_model, get_feature_dim, extract_features, get_pooling_dim
|
| 21 |
+
from utils.script_utils import is_torch2_available
|
| 22 |
+
|
| 23 |
+
if is_torch2_available():
|
| 24 |
+
from lots.cross_attn import AttnProcessor2_0 as AttnProcessor
|
| 25 |
+
from lots.cross_attn import LOTSAttnProcessor2_0 as LOTSAttnProcessor
|
| 26 |
+
else:
|
| 27 |
+
from lots.cross_attn import AttnProcessor
|
| 28 |
+
from lots.cross_attn import LOTSAttnProcessor as LOTSAttnProcessor
|
| 29 |
+
|
| 30 |
+
from convert_lots_weights import convert_lots_weights
|
| 31 |
+
from lots.projectors import TokenProjector, SequenceProjModel
|
| 32 |
+
from lots.pair_former import PairFormer
|
| 33 |
+
from sketchy.sketchy_dataset import SketchyDataset
|
| 34 |
+
|
| 35 |
+
def parse_args():
|
| 36 |
+
parser = argparse.ArgumentParser(description="Simple example of a training script.")
|
| 37 |
+
parser.add_argument(
|
| 38 |
+
"--pretrained_model_name_or_path",
|
| 39 |
+
type=str,
|
| 40 |
+
default=None,
|
| 41 |
+
required=True,
|
| 42 |
+
help="Path to pretrained model or model identifier from huggingface.co/models.",
|
| 43 |
+
)
|
| 44 |
+
parser.add_argument(
|
| 45 |
+
"--dataset_root",
|
| 46 |
+
type=str,
|
| 47 |
+
default="",
|
| 48 |
+
required=True,
|
| 49 |
+
help="Training data root path",
|
| 50 |
+
)
|
| 51 |
+
parser.add_argument(
|
| 52 |
+
"--output_dir",
|
| 53 |
+
type=str,
|
| 54 |
+
default="lots_adapter",
|
| 55 |
+
help="The output directory where the model predictions and checkpoints will be written.",
|
| 56 |
+
)
|
| 57 |
+
parser.add_argument(
|
| 58 |
+
"--resolution",
|
| 59 |
+
type=int,
|
| 60 |
+
default=512,
|
| 61 |
+
help=(
|
| 62 |
+
"The resolution for input images"
|
| 63 |
+
),
|
| 64 |
+
)
|
| 65 |
+
parser.add_argument(
|
| 66 |
+
"--learning_rate",
|
| 67 |
+
type=float,
|
| 68 |
+
default=1e-5,
|
| 69 |
+
help="Learning rate to use.",
|
| 70 |
+
)
|
| 71 |
+
parser.add_argument("--weight_decay", type=float, default=1e-2, help="Weight decay to use.")
|
| 72 |
+
parser.add_argument("--num_train_epochs", type=int, default=80)
|
| 73 |
+
parser.add_argument(
|
| 74 |
+
"--train_batch_size", type=int, default=8, help="Batch size (per device) for the training dataloader."
|
| 75 |
+
)
|
| 76 |
+
parser.add_argument("--noise_offset", type=float, default=None, help="noise offset")
|
| 77 |
+
parser.add_argument(
|
| 78 |
+
"--dataloader_num_workers",
|
| 79 |
+
type=int,
|
| 80 |
+
default=0,
|
| 81 |
+
help=(
|
| 82 |
+
"Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
|
| 83 |
+
),
|
| 84 |
+
)
|
| 85 |
+
parser.add_argument(
|
| 86 |
+
"--save_steps",
|
| 87 |
+
type=int,
|
| 88 |
+
default=10000,
|
| 89 |
+
help=(
|
| 90 |
+
"Save a checkpoint of the training state every X updates"
|
| 91 |
+
),
|
| 92 |
+
)
|
| 93 |
+
parser.add_argument(
|
| 94 |
+
"--mixed_precision",
|
| 95 |
+
type=str,
|
| 96 |
+
default=None,
|
| 97 |
+
choices=["no", "fp16", "bf16"],
|
| 98 |
+
help=(
|
| 99 |
+
"Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
|
| 100 |
+
" 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
|
| 101 |
+
" flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
|
| 102 |
+
),
|
| 103 |
+
)
|
| 104 |
+
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
| 105 |
+
|
| 106 |
+
parser.add_argument(
|
| 107 |
+
"--dinov2_model",
|
| 108 |
+
type=str,
|
| 109 |
+
default="vits14",
|
| 110 |
+
choices=["vits14", "vitb14", "vitl14", "vitg14"],
|
| 111 |
+
help="DINOv2 model type to use",
|
| 112 |
+
)
|
| 113 |
+
parser.add_argument("--with_shoes", action="store_true", help="Use shoes in the annotations")
|
| 114 |
+
|
| 115 |
+
parser.add_argument("--num_cls_tokens", type=int, default=32, help="Number of class tokens")
|
| 116 |
+
parser.add_argument("--fusion_strategy", type=str, default="deferred", help="Fusion strategy to use", choices=["mean", "deferred"])
|
| 117 |
+
parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.")
|
| 118 |
+
|
| 119 |
+
args = parser.parse_args()
|
| 120 |
+
env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
|
| 121 |
+
if env_local_rank != -1 and env_local_rank != args.local_rank:
|
| 122 |
+
args.local_rank = env_local_rank
|
| 123 |
+
|
| 124 |
+
return args
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
class LOTSTrainingPipeline(torch.nn.Module):
|
| 128 |
+
"""LOTS"""
|
| 129 |
+
def __init__(self, unet, image_proj_model, text_proj_model, pair_former_model, cross_attn_modules, ckpt_path=None):
|
| 130 |
+
super().__init__()
|
| 131 |
+
self.unet = unet
|
| 132 |
+
self.image_proj_model = image_proj_model
|
| 133 |
+
self.text_proj_model = text_proj_model
|
| 134 |
+
self.pair_former_model = pair_former_model
|
| 135 |
+
self.cross_attn_modules = cross_attn_modules
|
| 136 |
+
|
| 137 |
+
if ckpt_path is not None:
|
| 138 |
+
self.load_from_checkpoint(ckpt_path)
|
| 139 |
+
|
| 140 |
+
def forward(self, noisy_latents, timesteps, encoder_hidden_states, unet_added_cond_kwargs, image_embeds, image_masks, partial_text_embeds, partial_text_masks):
|
| 141 |
+
pair_img_tokens = self.image_proj_model(image_embeds)
|
| 142 |
+
pair_txt_tokens = self.text_proj_model(partial_text_embeds)
|
| 143 |
+
# pair fusion with mask
|
| 144 |
+
compressed_pairs = self.pair_former_model(image_embeds=pair_img_tokens, text_embeds=pair_txt_tokens, image_masks=image_masks, text_masks=partial_text_masks)
|
| 145 |
+
# fusion output has shape: B, N*L, C where L is a variable number of tokens
|
| 146 |
+
# create the cross_attn_mask for the unet
|
| 147 |
+
# the mask needs to be a tensor (batch, seq_len) where True means keep, False means discard
|
| 148 |
+
tokens_per_item = self.pair_former_model.num_cls_tokens
|
| 149 |
+
num_items = pair_img_tokens.shape[1]
|
| 150 |
+
pair_cross_attn_mask = torch.zeros((compressed_pairs.shape[0], tokens_per_item*num_items), dtype=torch.bool, device=compressed_pairs.device)
|
| 151 |
+
for i, mask in enumerate(image_masks):
|
| 152 |
+
pair_cross_attn_mask[i, :sum(mask) * tokens_per_item ] = True
|
| 153 |
+
|
| 154 |
+
# encoder_hidden_states will be fed to unet.
|
| 155 |
+
# The processors will handle the first part of the sequence (global text) with the pre-trained weights,
|
| 156 |
+
# and the pairs with the additional cross-attn modules
|
| 157 |
+
encoder_hidden_states = torch.cat([encoder_hidden_states, compressed_pairs], dim=1)
|
| 158 |
+
|
| 159 |
+
# Predict the noise residual
|
| 160 |
+
noise_pred = self.unet(noisy_latents, timesteps, encoder_hidden_states, added_cond_kwargs=unet_added_cond_kwargs, encoder_attention_mask=pair_cross_attn_mask).sample
|
| 161 |
+
return noise_pred
|
| 162 |
+
|
| 163 |
+
def load_from_checkpoint(self, ckpt_path: str):
|
| 164 |
+
# Calculate original checksums
|
| 165 |
+
orig_img_sum = torch.sum(torch.stack([torch.sum(p) for p in self.image_proj_model.parameters()]))
|
| 166 |
+
orig_text_sum = torch.sum(torch.stack([torch.sum(p) for p in self.text_proj_model.parameters()]))
|
| 167 |
+
orig_pair_former_sum = torch.sum(torch.stack([torch.sum(p) for p in self.pair_former_model.parameters()]))
|
| 168 |
+
orig_cross_attn_sum = torch.sum(torch.stack([torch.sum(p) for p in self.cross_attn_modules.parameters()]))
|
| 169 |
+
|
| 170 |
+
state_dict = torch.load(ckpt_path, map_location="cpu")
|
| 171 |
+
|
| 172 |
+
# Load state dict for projection models, pair former, and cross-attn modules
|
| 173 |
+
self.image_proj_model.load_state_dict(state_dict["image_proj"], strict=True)
|
| 174 |
+
self.text_proj_model.load_state_dict(state_dict["text_proj"], strict=True)
|
| 175 |
+
self.pair_former_model.load_state_dict(state_dict["pair_former"], strict=True)
|
| 176 |
+
self.cross_attn_modules.load_state_dict(state_dict["cross_attn"], strict=True)
|
| 177 |
+
|
| 178 |
+
# Calculate new checksums
|
| 179 |
+
new_img_sum = torch.sum(torch.stack([torch.sum(p) for p in self.image_proj_model.parameters()]))
|
| 180 |
+
new_text_sum = torch.sum(torch.stack([torch.sum(p) for p in self.text_proj_model.parameters()]))
|
| 181 |
+
new_pair_former_sum = torch.sum(torch.stack([torch.sum(p) for p in self.pair_former_model.parameters()]))
|
| 182 |
+
new_cross_attn_sum = torch.sum(torch.stack([torch.sum(p) for p in self.cross_attn_modules.parameters()]))
|
| 183 |
+
|
| 184 |
+
# Verify if the weights have changed
|
| 185 |
+
assert orig_img_sum != new_img_sum, "Weights of image_proj_model did not change!"
|
| 186 |
+
assert orig_text_sum != new_text_sum, "Weights of text_proj_model did not change!"
|
| 187 |
+
assert orig_pair_former_sum != new_pair_former_sum, "Weights of pair_former_model did not change!"
|
| 188 |
+
assert orig_cross_attn_sum != new_cross_attn_sum, "Weights of cross_attn_modules did not change!"
|
| 189 |
+
|
| 190 |
+
print(f"Successfully loaded weights from checkpoint {ckpt_path}")
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def create_batch_tensor(batch, image_drop_prob=0.0, image_size=512):
|
| 194 |
+
# data is returned as a dict of lists
|
| 195 |
+
batch_size = len(batch["image"])
|
| 196 |
+
# find the item in data with the maximum number of sketches
|
| 197 |
+
max_num_sketches = max([len(example) for example in batch["local_sketches"]])
|
| 198 |
+
# do padding to items to put all data in a tensor
|
| 199 |
+
batch["local_sketch_masks"] = []
|
| 200 |
+
batch["local_text_masks"] = []
|
| 201 |
+
batch["drop_image_embeds"] = []
|
| 202 |
+
batch["crop_coords_top_left"] = []
|
| 203 |
+
batch["target_size"] = []
|
| 204 |
+
batch["original_size"] = []
|
| 205 |
+
for idx in range(batch_size):
|
| 206 |
+
# pad local sketches
|
| 207 |
+
num_sketches = len(batch["local_sketches"][idx])
|
| 208 |
+
batch['local_sketch_masks'].append([True for _ in range(num_sketches)]) # True means it's not padding
|
| 209 |
+
batch['local_text_masks'].append([True for _ in range(len(batch["local_descriptions_ids"][idx]))]) # True means it's not padding
|
| 210 |
+
if num_sketches < max_num_sketches:
|
| 211 |
+
batch["local_sketches"][idx] += [torch.zeros_like(batch["local_sketches"][idx][0]) for _ in range(max_num_sketches - num_sketches)]
|
| 212 |
+
# add the padding mask
|
| 213 |
+
batch["local_sketch_masks"][idx] += [False for _ in range(max_num_sketches - num_sketches)]
|
| 214 |
+
|
| 215 |
+
batch["local_sketches"][idx] = torch.cat(batch["local_sketches"][idx], dim=0)
|
| 216 |
+
|
| 217 |
+
# pad local text
|
| 218 |
+
num_local_texts = len(batch["local_descriptions_ids"][idx])
|
| 219 |
+
if num_local_texts < max_num_sketches:
|
| 220 |
+
batch["local_descriptions_ids"][idx] += [torch.zeros_like(batch["local_descriptions_ids"][idx][0]) for _ in range(max_num_sketches - num_local_texts)]
|
| 221 |
+
batch["local_text_masks"][idx] += [False for _ in range(max_num_sketches - num_local_texts)]
|
| 222 |
+
|
| 223 |
+
batch["local_descriptions_ids"][idx] = torch.cat(batch["local_descriptions_ids"][idx], dim=0) # TODO: check dim
|
| 224 |
+
|
| 225 |
+
# pad local text 2
|
| 226 |
+
num_local_texts_2 = len(batch["local_descriptions_ids_2"][idx])
|
| 227 |
+
if num_local_texts_2 < max_num_sketches:
|
| 228 |
+
batch["local_descriptions_ids_2"][idx] += [torch.zeros_like(batch["local_descriptions_ids_2"][idx][0]) for _ in range(max_num_sketches - num_local_texts_2)]
|
| 229 |
+
batch["local_descriptions_ids_2"][idx] = torch.cat(batch["local_descriptions_ids_2"][idx], dim=0) # TODO: check dim
|
| 230 |
+
|
| 231 |
+
# decide whether to drop the image embed
|
| 232 |
+
rand_num = random.random()
|
| 233 |
+
if rand_num < image_drop_prob:
|
| 234 |
+
batch['drop_image_embeds'].append(1)
|
| 235 |
+
else:
|
| 236 |
+
batch['drop_image_embeds'].append(0)
|
| 237 |
+
|
| 238 |
+
# add crop_coords_top_left, original, and target_size
|
| 239 |
+
batch['crop_coords_top_left'].append(torch.tensor([0, 0]))
|
| 240 |
+
batch['original_size'].append(torch.tensor([image_size, image_size]))
|
| 241 |
+
batch['target_size'].append(torch.tensor([image_size, image_size]))
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
batch["local_descriptions_ids"] = torch.stack(batch["local_descriptions_ids"], dim=0)
|
| 245 |
+
batch["local_descriptions_ids_2"] = torch.stack(batch["local_descriptions_ids_2"], dim=0)
|
| 246 |
+
batch["local_sketches"] = torch.stack(batch["local_sketches"], dim=0)
|
| 247 |
+
batch["original_size"] = torch.stack(batch["original_size"], dim=0)
|
| 248 |
+
batch["crop_coords_top_left"] = torch.stack(batch["crop_coords_top_left"], dim=0)
|
| 249 |
+
batch["target_size"] = torch.stack(batch["target_size"], dim=0)
|
| 250 |
+
return batch
|
| 251 |
+
|
| 252 |
+
def main():
|
| 253 |
+
args = parse_args()
|
| 254 |
+
logging_dir = Path(args.output_dir, "logs")
|
| 255 |
+
|
| 256 |
+
accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
|
| 257 |
+
|
| 258 |
+
accelerator = Accelerator(
|
| 259 |
+
mixed_precision=args.mixed_precision,
|
| 260 |
+
project_config=accelerator_project_config,
|
| 261 |
+
gradient_accumulation_steps=args.gradient_accumulation_steps,
|
| 262 |
+
)
|
| 263 |
+
|
| 264 |
+
if accelerator.is_main_process:
|
| 265 |
+
if args.output_dir is not None:
|
| 266 |
+
os.makedirs(args.output_dir, exist_ok=True)
|
| 267 |
+
|
| 268 |
+
# Load scheduler, tokenizer and models.
|
| 269 |
+
noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
|
| 270 |
+
tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
|
| 271 |
+
text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
|
| 272 |
+
tokenizer_2 = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer_2")
|
| 273 |
+
text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder_2")
|
| 274 |
+
vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
|
| 275 |
+
unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
|
| 276 |
+
image_encoder = get_dinov2_model(args.dinov2_model)
|
| 277 |
+
feature_dim = get_feature_dim(args.dinov2_model)
|
| 278 |
+
# freeze parameters of models to save more memory
|
| 279 |
+
unet.requires_grad_(False)
|
| 280 |
+
vae.requires_grad_(False)
|
| 281 |
+
text_encoder.requires_grad_(False)
|
| 282 |
+
text_encoder_2.requires_grad_(False)
|
| 283 |
+
image_encoder.requires_grad_(False)
|
| 284 |
+
|
| 285 |
+
num_tokens = 4
|
| 286 |
+
image_proj_model = TokenProjector(
|
| 287 |
+
cross_attention_dim=unet.config.cross_attention_dim,
|
| 288 |
+
embeddings_dim=feature_dim,
|
| 289 |
+
)
|
| 290 |
+
text_proj_model = SequenceProjModel(
|
| 291 |
+
cross_attention_dim=unet.config.cross_attention_dim,
|
| 292 |
+
embeddings_dim=text_encoder.config.projection_dim + text_encoder_2.config.projection_dim,
|
| 293 |
+
extra_context_tokens=num_tokens,
|
| 294 |
+
)
|
| 295 |
+
num_global_tokens = 77 # clip text tokens
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
pair_former = PairFormer(
|
| 299 |
+
in_channels=unet.config.cross_attention_dim,
|
| 300 |
+
inner_dim=unet.config.cross_attention_dim,
|
| 301 |
+
fusion_strategy=args.fusion_strategy,
|
| 302 |
+
num_layers=2,
|
| 303 |
+
num_attention_heads=8,
|
| 304 |
+
dropout=0.0,
|
| 305 |
+
activation_fn="geglu",
|
| 306 |
+
norm_num_groups=32,
|
| 307 |
+
masking_strategy="compression",
|
| 308 |
+
num_cls_tokens=args.num_cls_tokens
|
| 309 |
+
)
|
| 310 |
+
|
| 311 |
+
# init cross_attention layers
|
| 312 |
+
# credits to IP-Adapter for the procedure
|
| 313 |
+
attn_procs = {}
|
| 314 |
+
unet_sd = unet.state_dict()
|
| 315 |
+
for name in unet.attn_processors.keys():
|
| 316 |
+
cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
|
| 317 |
+
if name.startswith("mid_block"):
|
| 318 |
+
hidden_size = unet.config.block_out_channels[-1]
|
| 319 |
+
elif name.startswith("up_blocks"):
|
| 320 |
+
block_id = int(name[len("up_blocks.")])
|
| 321 |
+
hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
|
| 322 |
+
elif name.startswith("down_blocks"):
|
| 323 |
+
block_id = int(name[len("down_blocks.")])
|
| 324 |
+
hidden_size = unet.config.block_out_channels[block_id]
|
| 325 |
+
if cross_attention_dim is None:
|
| 326 |
+
attn_procs[name] = AttnProcessor()
|
| 327 |
+
else:
|
| 328 |
+
layer_name = name.split(".processor")[0]
|
| 329 |
+
weights = {
|
| 330 |
+
"to_k_lots.weight": unet_sd[layer_name + ".to_k.weight"],
|
| 331 |
+
"to_v_lots.weight": unet_sd[layer_name + ".to_v.weight"],
|
| 332 |
+
}
|
| 333 |
+
attn_procs[name] = LOTSAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, num_global_tokens=num_global_tokens)
|
| 334 |
+
attn_procs[name].load_state_dict(weights)
|
| 335 |
+
unet.set_attn_processor(attn_procs)
|
| 336 |
+
adapter_modules = torch.nn.ModuleList(unet.attn_processors.values())
|
| 337 |
+
|
| 338 |
+
lots_pipeline = LOTSTrainingPipeline(unet, image_proj_model=image_proj_model, text_proj_model=text_proj_model, pair_former_model=pair_former, cross_attn_modules=adapter_modules)
|
| 339 |
+
|
| 340 |
+
weight_dtype = torch.float32
|
| 341 |
+
if accelerator.mixed_precision == "fp16":
|
| 342 |
+
weight_dtype = torch.float16
|
| 343 |
+
elif accelerator.mixed_precision == "bf16":
|
| 344 |
+
weight_dtype = torch.bfloat16
|
| 345 |
+
vae.to(accelerator.device) # use fp32
|
| 346 |
+
text_encoder.to(accelerator.device, dtype=weight_dtype)
|
| 347 |
+
text_encoder_2.to(accelerator.device, dtype=weight_dtype)
|
| 348 |
+
image_encoder.to(accelerator.device, dtype=weight_dtype)
|
| 349 |
+
|
| 350 |
+
params_to_opt = itertools.chain(lots_pipeline.image_proj_model.parameters(), lots_pipeline.text_proj_model.parameters(), lots_pipeline.cross_attn_modules.parameters(), lots_pipeline.pair_former_model.parameters())
|
| 351 |
+
optimizer = torch.optim.AdamW(params_to_opt, lr=args.learning_rate, weight_decay=args.weight_decay)
|
| 352 |
+
|
| 353 |
+
# dataloader
|
| 354 |
+
image_transforms = transforms.Compose([
|
| 355 |
+
transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
|
| 356 |
+
transforms.ToTensor(),
|
| 357 |
+
transforms.Normalize([0.5], [0.5]),
|
| 358 |
+
])
|
| 359 |
+
|
| 360 |
+
sketch_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
|
| 361 |
+
# lambda function to automatically extract pixel values from dino processor
|
| 362 |
+
sketch_transforms = lambda pil_image: sketch_processor(images=pil_image, return_tensors="pt").pixel_values
|
| 363 |
+
|
| 364 |
+
train_dataset = SketchyDataset(args.dataset_root,
|
| 365 |
+
split="train",
|
| 366 |
+
load_img=True,
|
| 367 |
+
load_global_sketch=False,
|
| 368 |
+
load_local_sketch=True,
|
| 369 |
+
img_size=args.resolution,
|
| 370 |
+
img_transforms=image_transforms,
|
| 371 |
+
global_sketch_transforms=None,
|
| 372 |
+
local_sketch_transforms=sketch_transforms,
|
| 373 |
+
text_tokenizers=[tokenizer, tokenizer_2],
|
| 374 |
+
with_shoes=args.with_shoes,
|
| 375 |
+
concat_locals=True, # not needed
|
| 376 |
+
compose_global_sketch=False # not needed
|
| 377 |
+
)
|
| 378 |
+
train_dataloader = torch.utils.data.DataLoader(
|
| 379 |
+
train_dataset,
|
| 380 |
+
shuffle=True,
|
| 381 |
+
collate_fn=train_dataset.collate_fn,
|
| 382 |
+
batch_size=args.train_batch_size,
|
| 383 |
+
num_workers=args.dataloader_num_workers,
|
| 384 |
+
drop_last=True
|
| 385 |
+
)
|
| 386 |
+
|
| 387 |
+
# pre-compute the global description text tokens
|
| 388 |
+
global_desc = "High quality photo of a model, artistic, 4k"
|
| 389 |
+
global_desc_ids1 = tokenizer(global_desc, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt").input_ids
|
| 390 |
+
global_desc_ids2 = tokenizer_2(global_desc, max_length=tokenizer_2.model_max_length, padding="max_length", truncation=True, return_tensors="pt").input_ids
|
| 391 |
+
|
| 392 |
+
# Prepare everything with our `accelerator`.
|
| 393 |
+
lots_pipeline, optimizer, train_dataloader = accelerator.prepare(lots_pipeline, optimizer, train_dataloader)
|
| 394 |
+
|
| 395 |
+
global_step = 0
|
| 396 |
+
for epoch in range(0, args.num_train_epochs):
|
| 397 |
+
for step, batch in enumerate(train_dataloader):
|
| 398 |
+
with accelerator.accumulate(lots_pipeline):
|
| 399 |
+
# handle batching of the inputs with padding
|
| 400 |
+
batch = create_batch_tensor(batch, image_drop_prob=0.05, image_size=args.resolution)
|
| 401 |
+
|
| 402 |
+
# Convert images to latent space
|
| 403 |
+
with torch.no_grad():
|
| 404 |
+
# vae of sdxl should use fp32
|
| 405 |
+
latents = vae.encode(batch["image"].to(accelerator.device, dtype=torch.float32)).latent_dist.sample()
|
| 406 |
+
latents = latents * vae.config.scaling_factor
|
| 407 |
+
latents = latents.to(accelerator.device, dtype=weight_dtype)
|
| 408 |
+
|
| 409 |
+
# Sample noise that we'll add to the latents
|
| 410 |
+
noise = torch.randn_like(latents)
|
| 411 |
+
if args.noise_offset:
|
| 412 |
+
# https://www.crosslabs.org//blog/diffusion-with-offset-noise
|
| 413 |
+
noise += args.noise_offset * torch.randn((latents.shape[0], latents.shape[1], 1, 1)).to(accelerator.device, dtype=weight_dtype)
|
| 414 |
+
|
| 415 |
+
bsz = latents.shape[0]
|
| 416 |
+
# Sample a random timestep for each image
|
| 417 |
+
timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device)
|
| 418 |
+
timesteps = timesteps.long()
|
| 419 |
+
|
| 420 |
+
# Add noise to the latents according to the noise magnitude at each timestep
|
| 421 |
+
# (this is the forward diffusion process)
|
| 422 |
+
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
|
| 423 |
+
|
| 424 |
+
with torch.no_grad():
|
| 425 |
+
image_embeds = []
|
| 426 |
+
for sketches in batch['local_sketches']:
|
| 427 |
+
image_embeds.append(image_encoder(sketches).last_hidden_state)
|
| 428 |
+
image_embeds = torch.stack(image_embeds)
|
| 429 |
+
|
| 430 |
+
image_embeds_ = []
|
| 431 |
+
for image_embed, drop_image_embed in zip(image_embeds, batch["drop_image_embeds"]):
|
| 432 |
+
if drop_image_embed == 1:
|
| 433 |
+
image_embeds_.append(torch.zeros_like(image_embed))
|
| 434 |
+
else:
|
| 435 |
+
image_embeds_.append(image_embed)
|
| 436 |
+
image_embeds = torch.stack(image_embeds_)
|
| 437 |
+
|
| 438 |
+
with torch.no_grad():
|
| 439 |
+
# Use the generic global description. Change this if you also want to train to condition using global description.
|
| 440 |
+
encoder_output = text_encoder(global_desc_ids1.to(accelerator.device), output_hidden_states=True)
|
| 441 |
+
global_text_embeds = encoder_output.hidden_states[-2]
|
| 442 |
+
encoder_output_2 = text_encoder_2(global_desc_ids2.to(accelerator.device), output_hidden_states=True)
|
| 443 |
+
pooled_text_embeds = encoder_output_2[0]
|
| 444 |
+
global_text_embeds_2 = encoder_output_2.hidden_states[-2]
|
| 445 |
+
            global_text_embeds = torch.concat([global_text_embeds, global_text_embeds_2], dim=-1)  # concat
            # repeat for each item in the batch
            global_text_embeds = global_text_embeds.repeat(args.train_batch_size, 1, 1)
            pooled_text_embeds = pooled_text_embeds.repeat(args.train_batch_size, 1)

            # local description embeddings
            local_text_embeds = []
            for text_ids_1 in batch['local_descriptions_ids']:
                local_text_embeds.append(text_encoder(text_ids_1.to(accelerator.device))['pooler_output'])
            local_text_embeds = torch.stack(local_text_embeds)

            partial_text_embeds_ = []
            for text_embed, drop_image_embed in zip(local_text_embeds, batch["drop_image_embeds"]):
                if drop_image_embed == 1:
                    partial_text_embeds_.append(torch.zeros_like(text_embed))
                else:
                    partial_text_embeds_.append(text_embed)
            local_text_embeds = torch.stack(partial_text_embeds_)

            # local description embeds 2
            local_text_embeds_2 = []
            for local_text_ids_2 in batch['local_descriptions_ids_2']:
                local_text_embeds_2.append(text_encoder_2(local_text_ids_2.to(accelerator.device))['text_embeds'])
            local_text_embeds_2 = torch.stack(local_text_embeds_2)
            local_text_embeds_2_ = []
            for text_embed, drop_image_embed in zip(local_text_embeds_2, batch["drop_image_embeds"]):
                if drop_image_embed == 1:
                    local_text_embeds_2_.append(torch.zeros_like(text_embed))
                else:
                    local_text_embeds_2_.append(text_embed)
            local_text_embeds_2 = torch.stack(local_text_embeds_2_)

            # merge partial text embeds in channels
            local_text_embeds = torch.cat([local_text_embeds, local_text_embeds_2], dim=2)

            # add cond
            add_time_ids = [
                batch["original_size"].to(accelerator.device),
                batch["crop_coords_top_left"].to(accelerator.device),
                batch["target_size"].to(accelerator.device),
            ]
            add_time_ids = torch.cat(add_time_ids, dim=1).to(accelerator.device, dtype=weight_dtype)
            unet_added_cond_kwargs = {"text_embeds": pooled_text_embeds, "time_ids": add_time_ids}

            noise_pred = lots_pipeline(noisy_latents, timesteps, global_text_embeds, unet_added_cond_kwargs,
                                       image_embeds=image_embeds,
                                       image_masks=batch['local_sketch_masks'],
                                       partial_text_embeds=local_text_embeds,
                                       partial_text_masks=batch['local_text_masks'])

            loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")

            # Gather the losses across all processes for logging (if we use distributed training).
            avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean().item()

            # Backpropagate
            accelerator.backward(loss)
            # accelerator takes care of gradient accumulation
            optimizer.step()
            optimizer.zero_grad()

            if accelerator.is_main_process:
                print("Epoch {}, step {}, step_loss: {}".format(
                    epoch, step, avg_loss))

            global_step += 1

            if global_step % args.save_steps == 0:
                save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
                accelerator.save_state(save_path, safe_serialization=False)
                if accelerator.is_main_process:
                    # save fusion config
                    pair_former.save_config_json(os.path.join(save_path, 'pair_former_config.json'))
                    state_dict = convert_lots_weights(os.path.join(save_path, 'pytorch_model.bin'))
                    torch.save(state_dict, os.path.join(save_path, 'lots.bin'))
                    # remove old save state
                    os.remove(os.path.join(save_path, 'pytorch_model.bin'))
                    print(f"Saved checkpoint to {save_path}")

    accelerator.wait_for_everyone()
    save_path = os.path.join(args.output_dir, "checkpoint-final")
    accelerator.save_state(save_path, safe_serialization=False)
    if accelerator.is_main_process:
        pair_former.save_config_json(os.path.join(save_path, 'pair_former_config.json'))
        state_dict = convert_lots_weights(os.path.join(save_path, 'pytorch_model.bin'))
        torch.save(state_dict, os.path.join(save_path, 'lots.bin'))
        print(f"Saved checkpoint to {save_path}")
    accelerator.end_training()

if __name__ == "__main__":
    main()
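Note: the `lots.bin` written above is the flat checkpoint later consumed by `LOTSPipeline.load_cross_attn`. A minimal inspection sketch, assuming the four top-level key groups used by the loading code in `src/lots/lots_pipeline.py` (the checkpoint path is a placeholder):

import torch

state_dict = torch.load("<path/to/checkpoint-final/lots.bin>", map_location="cpu")
# expected top-level groups: image_proj, text_proj, pair_former, cross_attn
for group, weights in state_dict.items():
    num_params = sum(v.numel() for v in weights.values())
    print(f"{group}: {len(weights)} tensors, {num_params / 1e6:.1f}M parameters")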
scripts/sketchy/sketchy.ipynb
ADDED
@@ -0,0 +1,230 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "258ea95e",
   "metadata": {},
   "source": [
    "# Sketchy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ec5215d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from sketchy.sketchy_dataset import SketchyDataset\n",
    "from torch.utils.data import DataLoader\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f772acd",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_root = \"<path/to/sketchy/root>\"\n",
    "split = \"train\"\n",
    "img_size = 512\n",
    "load_img = True\n",
    "load_global_sketch = True\n",
    "load_local_sketches = True\n",
    "with_shoes = False\n",
    "concat_locals = True\n",
    "compose_global_sketch = True\n",
    "img_transforms = None\n",
    "global_sketch_transforms = None\n",
    "\n",
    "\n",
    "sketchy_dataset = SketchyDataset(dataset_root=dataset_root,\n",
    "                                 split=split,\n",
    "                                 img_size=img_size,\n",
    "                                 load_img=load_img,\n",
    "                                 load_global_sketch=load_global_sketch,\n",
    "                                 load_local_sketch=load_local_sketches,\n",
    "                                 img_transforms=img_transforms,\n",
    "                                 global_sketch_transforms=global_sketch_transforms,\n",
    "                                 with_shoes=with_shoes,\n",
    "                                 concat_locals=concat_locals,\n",
    "                                 compose_global_sketch=compose_global_sketch,\n",
    "                                 )\n",
    "print(f\"Number of images in {split} split: {len(sketchy_dataset)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d036a36",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a dataloader with the proper collate function\n",
    "dataloader = DataLoader(sketchy_dataset,\n",
    "                        batch_size=8,\n",
    "                        shuffle=False,\n",
    "                        num_workers=0,\n",
    "                        collate_fn=sketchy_dataset.collate_fn)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5baa0a3c",
   "metadata": {},
   "source": [
    "## Visualize the item data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ea93e19",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get a sample from the dataset\n",
    "item = sketchy_dataset[13]\n",
    "print(\"####### ITEM KEYS ########\")\n",
    "for key in item.keys():\n",
    "    print(f\"{key}\")\n",
    "\n",
    "print(\"\\n####### IMAGE ########\")\n",
    "# item['image'] is an image (by default PIL.Image)\n",
    "plt.imshow(item['image'])\n",
    "plt.axis('off')\n",
    "plt.title(\"GT Image of item\")\n",
    "plt.show()\n",
    "\n",
    "\n",
    "print(\"\\n####### LOCAL DESCRIPTIONS ########\")\n",
    "# item['local_descriptions'] is a list of strings. Each string is a description of a single item in the image.\n",
    "# NOTE: the local descriptions, local sketches, and masks are all aligned, meaning that the i-th local description corresponds to the i-th mask and i-th local sketch.\n",
    "num_descriptions = len(item['local_descriptions'])\n",
    "print(f\"Number of local descriptions in item: {num_descriptions}\")\n",
    "for i, desc in enumerate(item['local_descriptions']):\n",
    "    print(f\"Local description {i}: {desc}\")\n",
    "\n",
    "print(\"\\n####### GLOBAL SKETCH ########\")\n",
    "# item['global_sketch'] is an image\n",
    "# visualize the global sketch\n",
    "plt.imshow(item['global_sketch'])\n",
    "plt.axis('off')\n",
    "plt.title(\"Global Sketch\")\n",
    "plt.show()\n",
    "\n",
    "print(\"\\n####### LOCAL SKETCHES ########\")\n",
    "# item['local_sketches'] is a list of images. Each item in the item has a list. In each sublist, there is an image for each local sketch in the item.\n",
    "num_local_sketches = len(item['local_sketches'])\n",
    "assert num_local_sketches == num_descriptions, \"Number of local sketches will always be equal to number of local descriptions\"\n",
    "print(f\"Number of local sketches in item 0: {num_local_sketches}\")\n",
    "# visualize the local sketches\n",
    "MAX_NUM_COLUMNS = 2\n",
    "num_cols = min(num_local_sketches, MAX_NUM_COLUMNS)\n",
    "num_rows = num_local_sketches // num_cols + (num_local_sketches % num_cols > 0)\n",
    "fig, axs = plt.subplots(num_rows, num_cols, figsize=(5, 5))\n",
    "if num_local_sketches > 1:\n",
    "    # flatten the axs for easier indexing\n",
    "    axs = axs.flatten()\n",
    "    for i in range(len(item['local_sketches'])):\n",
    "        axs[i].imshow(item['local_sketches'][i])\n",
    "        axs[i].set_title(f\"Local Sketch {i}\")\n",
    "        axs[i].axis('off')\n",
    "else:\n",
    "    axs.imshow(item['local_sketches'][0])\n",
    "    axs.set_title(f\"Local Sketch 0\")\n",
    "    axs.axis('off')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56d29ae3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# iterate over the dataloader.\n",
    "# NOTE: this changes how the data is structured due to the collate function. This is needed for batching the data.\n",
    "for idx, batch in tqdm(enumerate(dataloader), total=len(dataloader), desc=\"Iterating over batches\"):\n",
    "    continue  # remove this to visualize the first element of a batch\n",
    "\n",
    "    print(\"####### BATCH INFO ########\")\n",
    "    # every batch is a dictionary with the following keys:\n",
    "    for key in batch.keys():\n",
    "        print(f\"{key}\")\n",
    "\n",
    "    print(\"\\n####### IMAGE ########\")\n",
    "    # batch['image'] is a list of images, one for each item in the batch\n",
    "    plt.imshow(batch['image'][0])\n",
    "    plt.axis('off')\n",
    "    plt.title(\"GT Image of item 0 in batch\")\n",
    "    plt.show()\n",
    "\n",
    "    print(\"####### LOCAL DESCRIPTIONS ########\")\n",
    "    # batch['local_descriptions'] is a list of lists of strings. Each item in the batch has a list. In each sublist, there is a description for each item in the image.\n",
    "    num_descriptions = len(batch['local_descriptions'][0])\n",
    "    print(f\"Number of local descriptions in item 0: {num_descriptions}\")\n",
    "    for i, desc in enumerate(batch['local_descriptions'][0]):\n",
    "        print(f\"Local description {i}: {desc}\")\n",
    "\n",
    "    print(\"####### GLOBAL SKETCH ########\")\n",
    "    # batch['global_sketch'] is a list of images, one for each item in the batch\n",
    "    # visualize the global sketch\n",
    "    plt.imshow(batch['global_sketch'][0])\n",
    "    plt.axis('off')\n",
    "    plt.title(\"Global Sketch of item 0 in batch\")\n",
    "    plt.show()\n",
    "\n",
    "    print(\"####### LOCAL SKETCHES ########\")\n",
    "    # batch['local_sketches'] is a list of lists of images. Each item in the batch has a list. In each sublist, there is an image for each local sketch in the item.\n",
    "    num_local_sketches = len(batch['local_sketches'][0])\n",
    "    assert num_local_sketches == num_descriptions, \"Number of local sketches will always be equal to number of local descriptions\"\n",
    "    print(f\"Number of local sketches in item 0: {num_local_sketches}\")\n",
    "    # visualize the local sketches\n",
    "    MAX_NUM_COLUMNS = 2\n",
    "    num_cols = min(num_local_sketches, MAX_NUM_COLUMNS)\n",
    "    num_rows = num_local_sketches // num_cols + (num_local_sketches % num_cols > 0)\n",
    "    fig, axs = plt.subplots(num_rows, num_cols, figsize=(5, 5))\n",
    "    # flatten the axs for easier indexing\n",
    "    if num_local_sketches > 1:\n",
    "        axs = axs.flatten()\n",
    "        for i in range(len(batch['local_sketches'][0])):\n",
    "            axs[i].imshow(batch['local_sketches'][0][i])\n",
    "            axs[i].set_title(f\"Local Sketch {i}\")\n",
    "            axs[i].axis('off')\n",
    "    else:\n",
    "        axs.imshow(batch['local_sketches'][0][0])\n",
    "        axs.set_title(f\"Local Sketch 0\")\n",
    "        axs.axis('off')\n",
    "    break  # remove this to iterate through all batches"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sketch2img",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
setup.py
ADDED
@@ -0,0 +1,3 @@
from setuptools import setup

setup()
src/lots/__init__.py
ADDED
File without changes
src/lots/cross_attn.py
ADDED
@@ -0,0 +1,408 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class LOTSAttnProcessor2_0(torch.nn.Module):
    r"""
    Attention processor for LOTS cross-attention modules for PyTorch 2.0.
    Inspired by IP-Adapter https://github.dev/tencent-ailab/IP-Adapter/tree/main
    Args:
        hidden_size (`int`):
            The hidden size of the attention layer.
        cross_attention_dim (`int`):
            The number of channels in the `encoder_hidden_states`.
        scale (`float`, defaults to 1.0):
            the weight scale of image prompt.
        num_global_tokens (`int`):
            The context length of the global text tokens (not pair information).
    """

    def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_global_tokens=77):
        super().__init__()

        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

        self.hidden_size = hidden_size
        self.cross_attention_dim = cross_attention_dim
        self.scale = scale
        self.num_global_tokens = num_global_tokens

        self.to_k_lots = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
        self.to_v_lots = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)

    def __call__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
        *args,
        **kwargs,
    ):
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )

        # our attention mask in case of padding items in the batch
        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length - self.num_global_tokens, batch_size)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        else:
            # get encoder_hidden_states, lots_pair_states
            encoder_hidden_states, lots_pair_states = (
                encoder_hidden_states[:, :self.num_global_tokens, :],
                encoder_hidden_states[:, self.num_global_tokens:, :],
            )
            if attn.norm_cross:
                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False
        )

        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # for lots cross-attn
        lots_key = self.to_k_lots(lots_pair_states)
        lots_value = self.to_v_lots(lots_pair_states)

        lots_key = lots_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        lots_value = lots_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        lots_pair_states = F.scaled_dot_product_attention(
            query, lots_key, lots_value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )
        with torch.no_grad():
            self.attn_map = (query @ lots_key.transpose(-2, -1)).softmax(dim=-1)
            # use the mask to mask the attention map
            if attention_mask is not None:
                self.masked_attn_map = (query @ lots_key.transpose(-2, -1) + attention_mask).softmax(dim=-1)

        lots_pair_states = lots_pair_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        lots_pair_states = lots_pair_states.to(query.dtype)

        hidden_states = hidden_states + self.scale * lots_pair_states

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states


class LOTSAttnProcessor(nn.Module):
    r"""
    Attention processor for LOTS cross-attention.
    Inspired by IP-Adapter
    Args:
        hidden_size (`int`):
            The hidden size of the attention layer.
        cross_attention_dim (`int`):
            The number of channels in the `encoder_hidden_states`.
        scale (`float`, defaults to 1.0):
            the weight scale of image prompt.
        num_global_tokens (`int`):
            The context length of the global text tokens (not pair information).
    """

    def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_global_tokens=77):
        super().__init__()

        self.hidden_size = hidden_size
        self.cross_attention_dim = cross_attention_dim
        self.scale = scale
        self.num_global_tokens = num_global_tokens

        self.to_k_lots = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
        self.to_v_lots = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)

    def __call__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
        *args,
        **kwargs,
    ):
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        else:
            # get encoder_hidden_states, lots_pair_states
            encoder_hidden_states, lots_pair_states = (
                encoder_hidden_states[:, :self.num_global_tokens, :],
                encoder_hidden_states[:, self.num_global_tokens:, :],
            )
            if attn.norm_cross:
                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        query = attn.head_to_batch_dim(query)
        key = attn.head_to_batch_dim(key)
        value = attn.head_to_batch_dim(value)

        attention_probs = attn.get_attention_scores(query, key, None)
        hidden_states = torch.bmm(attention_probs, value)
        hidden_states = attn.batch_to_head_dim(hidden_states)

        # for lots cross-attn
        lots_key = self.to_k_lots(lots_pair_states)
        lots_value = self.to_v_lots(lots_pair_states)

        lots_key = attn.head_to_batch_dim(lots_key)
        lots_value = attn.head_to_batch_dim(lots_value)

        lots_attention_probs = attn.get_attention_scores(query, lots_key, attention_mask)
        self.attn_map = lots_attention_probs
        lots_pair_states = torch.bmm(lots_attention_probs, lots_value)
        lots_pair_states = attn.batch_to_head_dim(lots_pair_states)

        hidden_states = hidden_states + self.scale * lots_pair_states

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states


# Processors from IP-Adapter https://github.dev/tencent-ailab/IP-Adapter/tree/main
class AttnProcessor(nn.Module):
    r"""
    Default processor for performing attention-related computations.
    """

    def __init__(
        self,
        hidden_size=None,
        cross_attention_dim=None,
    ):
        super().__init__()

    def __call__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
        *args,
        **kwargs,
    ):
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        query = attn.head_to_batch_dim(query)
        key = attn.head_to_batch_dim(key)
        value = attn.head_to_batch_dim(value)

        attention_probs = attn.get_attention_scores(query, key, attention_mask)
        hidden_states = torch.bmm(attention_probs, value)
        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states


class AttnProcessor2_0(torch.nn.Module):
    r"""
    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
    """

    def __init__(
        self,
        hidden_size=None,
        cross_attention_dim=None,
    ):
        super().__init__()
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

    def __call__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
        *args,
        **kwargs,
    ):
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        # TODO: add support for attn.scale when we move to Torch 2.1
        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states
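Note: the LOTS processors assume the conditioning sequence is the global prompt tokens followed by the PairFormer tokens, concatenated along the sequence dimension (as the pipeline does with `torch.cat([prompt_embeds, pair_embeds], dim=1)`) and split again at `num_global_tokens` inside the processor. A minimal sketch with hypothetical shapes (batch of 2, SDXL-style 2048-dim context, 3 pairs of 32 compressed tokens):

import torch

num_global_tokens = 77
prompt_embeds = torch.randn(2, num_global_tokens, 2048)   # global SDXL prompt tokens
pair_embeds = torch.randn(2, 3 * 32, 2048)                # PairFormer output tokens
encoder_hidden_states = torch.cat([prompt_embeds, pair_embeds], dim=1)

# what the processor does internally with its input:
global_states = encoder_hidden_states[:, :num_global_tokens, :]     # routed through attn.to_k / attn.to_v
lots_pair_states = encoder_hidden_states[:, num_global_tokens:, :]  # routed through to_k_lots / to_v_lots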
src/lots/lots_pipeline.py
ADDED
@@ -0,0 +1,227 @@
import json
import os
from typing import List

import torch
from PIL import Image
from transformers import AutoImageProcessor

from lots.projectors import TokenProjector, SequenceProjModel
from lots.pair_former import PairFormer
from utils.dinov2_utils import get_pooling_dim, get_feature_dim
from utils.script_utils import is_torch2_available, get_generator

if is_torch2_available():
    from lots.cross_attn import AttnProcessor2_0 as AttnProcessor
    from lots.cross_attn import LOTSAttnProcessor2_0 as LOTSAttnProcessor
else:
    from lots.cross_attn import AttnProcessor
    from lots.cross_attn import LOTSAttnProcessor


class LOTSPipeline:

    def __init__(self, sd_pipe, lots_ckpt, device, image_encoder=None, num_global_tokens=77, num_tokens=32, model_type='vits14'):
        # TODO: documentation
        self.device = device
        self.image_encoder = image_encoder
        self.lots_ckpt = lots_ckpt
        self.num_global_tokens = num_global_tokens
        self.num_tokens = num_tokens
        self.model_type = model_type

        self.pipe = sd_pipe.to(self.device)
        self.add_cross_attn(num_global_tokens=num_global_tokens)

        self.image_encoder = image_encoder.to(self.device, dtype=torch.float16)
        self.image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")

        # image proj model
        self.image_proj_model, self.text_proj_model, self.pair_former = self.init_proj()
        self.load_cross_attn()

    def add_cross_attn(self, num_global_tokens=77):
        unet = self.pipe.unet
        attn_procs = {}
        for name in unet.attn_processors.keys():
            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
            if name.startswith("mid_block"):
                hidden_size = unet.config.block_out_channels[-1]
            elif name.startswith("up_blocks"):
                block_id = int(name[len("up_blocks.")])
                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
            elif name.startswith("down_blocks"):
                block_id = int(name[len("down_blocks.")])
                hidden_size = unet.config.block_out_channels[block_id]
            if cross_attention_dim is None:
                attn_procs[name] = AttnProcessor()
            else:
                attn_procs[name] = LOTSAttnProcessor(
                    hidden_size=hidden_size,
                    cross_attention_dim=cross_attention_dim,
                    scale=1.0,
                    num_global_tokens=num_global_tokens,
                ).to(self.device, dtype=torch.float16)
        unet.set_attn_processor(attn_procs)

    def init_proj(self):

        base_dim = get_feature_dim(self.model_type)
        embeddings_dim = get_pooling_dim(base_dim, "cls")

        image_proj_model = TokenProjector(
            cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
            embeddings_dim=embeddings_dim,
        ).to(self.device, dtype=torch.float16)

        text_proj_model = SequenceProjModel(
            cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
            embeddings_dim=self.pipe.text_encoder.config.projection_dim + self.pipe.text_encoder_2.config.projection_dim,
            extra_context_tokens=4,
        ).to(self.device, dtype=torch.float16)

        # check if config is available from ckpt folder
        # should be in the same folder as self.lots_ckpt
        config_path = os.path.join(os.path.dirname(self.lots_ckpt), "pair_former_config.json")
        if os.path.exists(config_path):
            with open(config_path, "r") as f:
                fusion_config = json.load(f)
            pair_former_model = PairFormer(**fusion_config).to(self.device, dtype=torch.float16)
        else:
            # use default parameters
            pair_former_model = PairFormer(
                in_channels=self.pipe.unet.config.cross_attention_dim,
                inner_dim=self.pipe.unet.config.cross_attention_dim,
                fusion_strategy="deferred",
                num_layers=2,
                num_attention_heads=8,
                dropout=0.0,
                activation_fn="geglu",
                norm_num_groups=32,
                masking_strategy="compression",
                num_cls_tokens=32,
            ).to(self.device, dtype=torch.float16)
        return image_proj_model, text_proj_model, pair_former_model

    def load_cross_attn(self):
        state_dict = torch.load(self.lots_ckpt, map_location="cpu")
        self.image_proj_model.load_state_dict(state_dict["image_proj"], strict=True)
        self.text_proj_model.load_state_dict(state_dict["text_proj"], strict=True)
        self.pair_former.load_state_dict(state_dict["pair_former"], strict=True)
        # load through reference to unet to avoid issues
        attn_layers = torch.nn.ModuleList(self.pipe.unet.attn_processors.values())
        attn_layers.load_state_dict(state_dict["cross_attn"], strict=True)

    def generate(
        self,
        pil_images,
        descriptions,
        prompt=None,
        negative_prompt=None,
        scale=1.0,
        num_samples=4,
        seed=None,
        num_inference_steps=30,
        resolution=512,
        **kwargs,
    ):
        self.set_scale(scale)

        num_prompts = 1
        num_sketches = len(pil_images)

        if prompt is None:
            prompt = "High quality photo of a model, artistic, 4k"
        if negative_prompt is None:
            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"

        if not isinstance(prompt, List):
            prompt = [prompt] * num_prompts
        if not isinstance(negative_prompt, List):
            negative_prompt = [negative_prompt] * num_prompts

        # TODO: implement multiple images per prompt
        # sketch image embeds
        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(pil_images)

        # text embeds
        text_prompt_embeds, uncond_text_prompt_embeds = self.get_text_embeds(descriptions)

        # fusion embeds
        # create masks for the pair former
        mask = [[True for _ in range(num_sketches)]]  # extra dimension for batching
        pair_embeds = self.pair_former(image_embeds=image_prompt_embeds, text_embeds=text_prompt_embeds, image_masks=mask, text_masks=mask)
        uncond_pair_embeds = self.pair_former(image_embeds=uncond_image_prompt_embeds, text_embeds=uncond_text_prompt_embeds, image_masks=mask, text_masks=mask)

        with torch.inference_mode():
            (
                prompt_embeds,
                negative_prompt_embeds,
                pooled_prompt_embeds,
                negative_pooled_prompt_embeds,
            ) = self.pipe.encode_prompt(
                prompt,
                num_images_per_prompt=num_samples,
                do_classifier_free_guidance=True,
                negative_prompt=negative_prompt,
            )
            prompt_embeds = torch.cat([prompt_embeds, pair_embeds], dim=1)
            negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_pair_embeds], dim=1)

            self.generator = get_generator(seed, self.device)

            images = self.pipe(
                prompt_embeds=prompt_embeds,
                negative_prompt_embeds=negative_prompt_embeds,
                pooled_prompt_embeds=pooled_prompt_embeds,
                negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
                num_inference_steps=num_inference_steps,
                generator=self.generator,
                height=resolution,
                width=resolution,
                **kwargs,
            ).images

        return images

    @torch.inference_mode()
    def get_image_embeds(self, pil_images):
        if isinstance(pil_images, Image.Image):
            pil_images = [pil_images]

        sketches = [self.image_processor(images=pil_image, return_tensors="pt").pixel_values.to(self.device, dtype=torch.float16) for pil_image in pil_images]
        sketches = torch.cat(sketches, dim=0)
        outputs = self.image_encoder(sketches)

        image_embeds = outputs.last_hidden_state.unsqueeze(0)  # add batch dimension

        image_prompt_embeds = self.image_proj_model(image_embeds)
        uncond_image_prompt_embeds = self.image_proj_model(torch.zeros_like(image_embeds))
        return image_prompt_embeds, uncond_image_prompt_embeds

    @torch.inference_mode()
    def get_text_embeds(self, descriptions):
        if descriptions is not None:
            if isinstance(descriptions, str):
                descriptions = [descriptions]
            descriptions_ids = [self.pipe.tokenizer(description, return_tensors="pt", padding="max_length", truncation=True, max_length=self.pipe.tokenizer.model_max_length).input_ids.to(self.device)
                                for description in descriptions]
            text_embeds = [self.pipe.text_encoder(description_ids)['pooler_output'] for description_ids in descriptions_ids]
            descriptions_ids_2 = [self.pipe.tokenizer_2(description, return_tensors="pt", padding="max_length", truncation=True, max_length=self.pipe.tokenizer_2.model_max_length).input_ids.to(self.device)
                                  for description in descriptions]
            text_embeds_2 = [self.pipe.text_encoder_2(description_ids_2)['text_embeds'] for description_ids_2 in descriptions_ids_2]
            text_embeds = torch.cat(text_embeds, dim=0)
            text_embeds_2 = torch.cat(text_embeds_2, dim=0)
            text_embeds = torch.cat([text_embeds, text_embeds_2], dim=1).unsqueeze(0)  # add batch dimension

        text_prompt_embeds = self.text_proj_model(text_embeds)
        uncond_text_prompt_embeds = self.text_proj_model(torch.zeros_like(text_embeds))
        return text_prompt_embeds, uncond_text_prompt_embeds

    def set_scale(self, scale):
        for attn_processor in self.pipe.unet.attn_processors.values():
            if isinstance(attn_processor, LOTSAttnProcessor):
                attn_processor.scale = scale
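Note: a minimal usage sketch of `LOTSPipeline` (this is not the provided inference script; the SDXL checkpoint id, DINOv2 variant, and file paths below are placeholders/assumptions):

import torch
from PIL import Image
from diffusers import StableDiffusionXLPipeline
from transformers import Dinov2Model
from lots.lots_pipeline import LOTSPipeline

# assumed base models; swap in the ones used for training
sd_pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)
image_encoder = Dinov2Model.from_pretrained("facebook/dinov2-small")

lots = LOTSPipeline(sd_pipe, "<path/to/lots.bin>", device="cuda", image_encoder=image_encoder)

# one local sketch per local description, in the same order
sketches = [Image.open("<sleeve_sketch.png>"), Image.open("<collar_sketch.png>")]
descriptions = ["a puffy red sleeve", "a high collar with buttons"]
images = lots.generate(sketches, descriptions, num_samples=1, seed=42)
images[0].save("result.png")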
src/lots/pair_former.py
ADDED
@@ -0,0 +1,226 @@
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from diffusers.models.attention import BasicTransformerBlock
|
| 4 |
+
import json
|
| 5 |
+
|
| 6 |
+
class PairFormer(nn.Module):
|
| 7 |
+
# TODO: documentation
|
| 8 |
+
def __init__(self,
|
| 9 |
+
in_channels: int,
|
| 10 |
+
fusion_strategy: str = "deferred",
|
| 11 |
+
num_layers: int = 2,
|
| 12 |
+
num_attention_heads: int = 8,
|
| 13 |
+
inner_dim: int = 2048,
|
| 14 |
+
dropout: float = 0.0,
|
| 15 |
+
norm_num_groups: int = 32,
|
| 16 |
+
activation_fn: str = "geglu",
|
| 17 |
+
masking_strategy="compression",
|
| 18 |
+
num_cls_tokens: int = 30,
|
| 19 |
+
):
|
| 20 |
+
super(PairFormer, self).__init__()
|
| 21 |
+
self.allowed_masking_strategies = ["modality", "pair", "compression", "all"]
|
| 22 |
+
self.mask_type = ["pair", "modality", "compression", "all"]
|
| 23 |
+
self.allowed_fusion_strategy = ["mean", "deferred"]
|
| 24 |
+
assert inner_dim % num_attention_heads == 0, "Inner_dim must be divisible by num_attention_heads"
|
| 25 |
+
assert in_channels % norm_num_groups == 0, "Inner_dim must be divisible by norm_num_groups"
|
| 26 |
+
assert masking_strategy in self.allowed_masking_strategies, "Masking strategy not supported, choose from: {}".format(self.allowed_masking_strategies)
|
| 27 |
+
self.masking_strategy = masking_strategy
|
| 28 |
+
self.attention_head_dim = inner_dim // num_attention_heads
|
| 29 |
+
self.in_channels = in_channels
|
| 30 |
+
self.with_in_projection = in_channels != inner_dim
|
| 31 |
+
self.with_out_projection = in_channels != inner_dim
|
| 32 |
+
self.fusion_strategy = fusion_strategy
|
| 33 |
+
self.num_layers = num_layers
|
| 34 |
+
self.inner_dim = inner_dim
|
| 35 |
+
self.num_cls_tokens = num_cls_tokens
|
| 36 |
+
# save the parameters in a config
|
| 37 |
+
self.config = {
|
| 38 |
+
"in_channels": in_channels,
|
| 39 |
+
"pooling_method": fusion_strategy,
|
| 40 |
+
"num_layers": num_layers,
|
| 41 |
+
"num_attention_heads": num_attention_heads,
|
| 42 |
+
"inner_dim": inner_dim,
|
| 43 |
+
"dropout": dropout,
|
| 44 |
+
"norm_num_groups": norm_num_groups,
|
| 45 |
+
"activation_fn": activation_fn,
|
| 46 |
+
"masking_strategy": masking_strategy,
|
| 47 |
+
"num_cls_tokens": num_cls_tokens
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
|
| 52 |
+
if self.with_in_projection:
|
| 53 |
+
self.in_proj = nn.Linear(in_channels, inner_dim)
|
| 54 |
+
self.transformer_blocks = nn.ModuleList(
|
| 55 |
+
[
|
| 56 |
+
BasicTransformerBlock(
|
| 57 |
+
self.inner_dim,
|
| 58 |
+
num_attention_heads=num_attention_heads,
|
| 59 |
+
attention_head_dim=self.attention_head_dim,
|
| 60 |
+
dropout=dropout,
|
| 61 |
+
activation_fn=activation_fn,
|
| 62 |
+
norm_type="layer_norm",
|
| 63 |
+
num_embeds_ada_norm=None,
|
| 64 |
+
attention_bias=False,
|
| 65 |
+
double_self_attention=True,
|
| 66 |
+
norm_elementwise_affine=True,
|
| 67 |
+
positional_embeddings=None,
|
| 68 |
+
num_positional_embeddings=None,
|
| 69 |
+
)
|
| 70 |
+
for d in range(num_layers)
|
| 71 |
+
]
|
| 72 |
+
)
|
| 73 |
+
if self.with_out_projection:
|
| 74 |
+
self.proj_out = nn.Linear(inner_dim, in_channels)
|
| 75 |
+
|
| 76 |
+
if self.masking_strategy == "compression" or self.masking_strategy == "all":
|
| 77 |
+
# create learnable CLS tokens
|
| 78 |
+
assert num_cls_tokens > 0, "Number of CLS tokens must be provided for masking strategy compression"
|
| 79 |
+
self.cls_tokens = nn.Parameter(torch.randn(1,1, num_cls_tokens, inner_dim)) # B, N, L, C
|
| 80 |
+
|
| 81 |
+
def save_config_json(self, path):
|
| 82 |
+
json.dump(self.config, open(path, "w"))
|
| 83 |
+
|
| 84 |
+
def prepare_attention_mask(self, image_masks, text_masks, LI, LT, masking_strategy="compression"):
|
| 85 |
+
"""
|
| 86 |
+
Args:
|
| 87 |
+
image_masks: list of lists, of shape (B, N)
|
| 88 |
+
text_masks: list of lists of shape (B, N)
|
| 89 |
+
LI: int, number of image tokens
|
| 90 |
+
LT: int, number of text tokens
|
| 91 |
+
"""
|
| 92 |
+
B = len(image_masks)
|
| 93 |
+
N = len(image_masks[0])
|
| 94 |
+
# create the attention mask
|
| 95 |
+
if masking_strategy == "pair":
|
| 96 |
+
"""
|
| 97 |
+
Paired information can only attend to each other. Basically a giant diagonal matrix.
|
| 98 |
+
"""
|
| 99 |
+
# since each pair can only attend to himself, we can collapse the pairs in the batch dimension and have a True mask
|
| 100 |
+
attention_mask = torch.ones(B*N, (LI+LT), (LI+LT), dtype=torch.bool)
|
| 101 |
+
elif masking_strategy == "modality":
|
| 102 |
+
"""
|
| 103 |
+
Each sketch can attend to all other sketches (except padding ones). Same with text.
|
| 104 |
+
Fusion is done on a modality-level, not pair-level.
|
| 105 |
+
"""
|
| 106 |
+
# the attention mask is a grid with 2 repeating rows and columns
|
| 107 |
+
rep_row = torch.ones(((LI+LT), (LI+LT)), dtype=torch.bool)
|
| 108 |
+
# prevent image tokens (first LI) to attend to text tokens (last LT)
|
| 109 |
+
rep_row[:LI, LI:] = False
|
| 110 |
+
# and vice versa
|
| 111 |
+
rep_row[LI:, :LI] = False
|
| 112 |
+
# repeat the column N times
|
| 113 |
+
mask = rep_row.repeat(N, N)
|
| 114 |
+
# repeat the mask for each batch element
|
| 115 |
+
attention_mask = mask.repeat(B, 1, 1)
|
| 116 |
+
# each item has different masks
|
| 117 |
+
for b in range(B):
|
| 118 |
+
for m in range(N):
|
| 119 |
+
# find from which item the padding starts
|
| 120 |
+
if not image_masks[b][m]:
|
| 121 |
+
attention_mask[b, :, m*(LI+LT):] = False
|
| 122 |
+
break
|
| 123 |
+
elif masking_strategy == "compression":
|
| 124 |
+
"""
|
| 125 |
+
Paired information can only attend to each other and the added cls_tokens. Basically a giant diagonal matrix.
|
| 126 |
+
This is the default LOTS behavior.
|
| 127 |
+
"""
|
| 128 |
+
# same as v1, but you have extra self.num_cls_tokens tokens per item
|
| 129 |
+
attention_mask = torch.zeros(B, N*(LI+LT+self.num_cls_tokens), N*(LI+LT+self.num_cls_tokens), dtype=torch.bool)
|
| 130 |
+
# each item has different masks
|
| 131 |
+
for b in range(B):
|
| 132 |
+
for i in range(N):
|
| 133 |
+
# allow the image tokens and text tokens of the same pair to attend to each other
|
| 134 |
+
attention_mask[b, i*(LI+LT+self.num_cls_tokens):(i+1)*(LI+LT+self.num_cls_tokens), i*(LI+LT+self.num_cls_tokens):(i+1)*(LI+LT+self.num_cls_tokens)] = True
|
| 135 |
+
elif masking_strategy == "all":
|
| 136 |
+
"all tokens, including cls, can attend to all other tokens, except padding"
|
| 137 |
+
attention_mask = torch.ones(B, N*(LI+LT+self.num_cls_tokens), N*(LI+LT+self.num_cls_tokens), dtype=torch.bool)
|
| 138 |
+
for b in range(B):
|
| 139 |
+
for m in range(N):
|
| 140 |
+
# find from which item the padding starts
|
| 141 |
+
if not image_masks[b][m]:
|
| 142 |
+
attention_mask[b, :, m*(LI+LT+self.num_cls_tokens):] = False
|
| 143 |
+
break
|
| 144 |
+
else:
|
| 145 |
+
raise NotImplementedError("Masking strategy not implemented")
|
| 146 |
+
return attention_mask
|
| 147 |
+
|
| 148 |
+
    def forward(self, image_embeds, image_masks, text_embeds, text_masks, timestep=None):
        """
        Args:
            image_embeds: torch.Tensor of shape (batch_size, num_pairs, image_seq_len, in_channels)
            image_masks: torch.Tensor of shape (batch_size, num_pairs)
            text_embeds: torch.Tensor of shape (batch_size, num_pairs, text_seq_len, in_channels)
            text_masks: torch.Tensor of shape (batch_size, num_pairs)
        """
        B, N, LI, C = image_embeds.shape
        _, _, LT, _ = text_embeds.shape
        # prepare masks
        attention_masks = []
        for l in range(self.num_layers):
            if self.masking_strategy == "modality":
                attention_masks.append(self.prepare_attention_mask(image_masks, text_masks, LI, LT, masking_strategy="modality").to(image_embeds.device))
            elif self.masking_strategy == "pair":
                attention_masks.append(self.prepare_attention_mask(image_masks, text_masks, LI, LT, masking_strategy="pair").to(image_embeds.device))
            elif self.masking_strategy == "compression":
                attention_masks.append(self.prepare_attention_mask(image_masks, text_masks, LI, LT, masking_strategy="compression").to(image_embeds.device))
            elif self.masking_strategy == "all":
                attention_masks.append(self.prepare_attention_mask(image_masks, text_masks, LI, LT, masking_strategy="all").to(image_embeds.device))
            else:
                raise NotImplementedError("Masking strategy not implemented")

        # concat image and text
        if self.masking_strategy == "compression" or self.masking_strategy == "all":
            # with cls tokens
            batch_cls_tokens = self.cls_tokens.repeat(B, N, 1, 1)
            x = torch.cat([batch_cls_tokens, image_embeds, text_embeds], dim=2)
        else:
            x = torch.cat([image_embeds, text_embeds], dim=2)
        _, _, L, C = x.shape
        if self.masking_strategy == "pair":
            # collapse dim 0 and 1 (pairs as batch items)
            x = x.reshape(B*N, L, C)
        else:
            # collapse dim 1 and 2
            x = x.reshape(B, N*L, C)

        # normalize the channels
        x = x.permute(0, 2, 1)  # B, C, N*L
        x = self.norm(x)
        x = x.permute(0, 2, 1)  # B, N*L, C
        # projection if necessary
        if self.with_in_projection:
            x = self.in_proj(x)
        for attn_mask, block in zip(attention_masks, self.transformer_blocks):
            x = block(hidden_states=x, attention_mask=attn_mask, encoder_attention_mask=attn_mask, timestep=timestep)
        # this returns a B, N*L, C tensor
        if self.with_out_projection:
            x = self.proj_out(x)
        # restore to original dimensions
        x = x.reshape(B, N, L, C)
        # x = x + residual # NOTE: do we want residuals?
        if self.masking_strategy == "compression" or self.masking_strategy == "all":
            x = x[:, :, :self.num_cls_tokens, :]
        # do pooling keeping in mind the masking
        if self.fusion_strategy == "mean":
            pair_embeds = []
            for b in range(B):
                # select only items that are not masked
                selector = torch.ones((N), dtype=torch.bool).to(x.device)
                for i in range(N):
                    if not image_masks[b][i]:
                        selector[i] = False
                item_embeds = x[b, selector, :, :]
                # do the mean pooling
                item_embeds = item_embeds.mean(dim=0, keepdim=False)
                pair_embeds.append(item_embeds)
            pair_embeds = torch.stack(pair_embeds)
            # pair_embeds: B, L, C
        elif self.fusion_strategy == "deferred":
            pair_embeds = x.reshape(B, -1, C)  # B, N*L, C
            # the padding items are masked in the unet cross_attn outside of this module
        else:
            raise NotImplementedError("Pooling method not implemented")
        return pair_embeds
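For orientation, here is a minimal shape-level sketch of how this forward pass is typically driven. Everything below is illustrative: `pair_former` is assumed to be an already-constructed instance of the module defined in this file, and the pair count and token lengths are arbitrary example values.

import torch

# Illustrative shapes only; `pair_former` is assumed to be an already-constructed
# instance of the module defined in this file (hypothetical config, e.g. fusion_strategy="mean").
B, N, LI, LT, C = 2, 3, 257, 77, 1024              # batch, pairs, image tokens, text tokens, channels
image_embeds = torch.randn(B, N, LI, C)
text_embeds = torch.randn(B, N, LT, C)
image_masks = torch.ones(B, N, dtype=torch.bool)   # True = real pair, False = padded pair
text_masks = torch.ones(B, N, dtype=torch.bool)

pair_embeds = pair_former(image_embeds, image_masks, text_embeds, text_masks)
# "mean" fusion returns (B, L, C); "deferred" fusion returns (B, N*L, C)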
src/lots/projectors.py
ADDED
@@ -0,0 +1,49 @@
import torch

class TokenProjector(torch.nn.Module):
    """
    Projection Model
    Takes in input embeddings of shape (BS, L, embeddings_dim) and projects them to (BS, L, cross_attention_dim)
    """

    def __init__(self, embeddings_dim=1024, cross_attention_dim=1024):
        super().__init__()
        self.cross_attention_dim = cross_attention_dim
        self.proj = torch.nn.Linear(embeddings_dim, cross_attention_dim)
        self.norm = torch.nn.LayerNorm(cross_attention_dim)

    def forward(self, token_embeds):
        """
        token_embeds: torch.Tensor of shape (BS, L, embeddings_dim)

        returns: torch.Tensor of shape (BS, L, cross_attention_dim)
        """
        # image embeds in shape (BS, L, C)
        embeds = token_embeds
        projected_tokens = self.proj(embeds)
        projected_tokens = self.norm(projected_tokens)
        return projected_tokens

class SequenceProjModel(torch.nn.Module):
    """
    Projection Model
    Extends a single token to a sequence of tokens
    """

    def __init__(self, cross_attention_dim=1024, embeddings_dim=1024, extra_context_tokens=4):
        super().__init__()

        self.generator = None
        self.cross_attention_dim = cross_attention_dim
        self.extra_context_tokens = extra_context_tokens
        self.proj = torch.nn.Linear(embeddings_dim, self.extra_context_tokens * cross_attention_dim)
        self.norm = torch.nn.LayerNorm(cross_attention_dim)

    def forward(self, token_embeds):
        embeds = token_embeds
        B, L, C = embeds.shape
        extra_context_tokens = self.proj(embeds).reshape(
            B, L, self.extra_context_tokens, self.cross_attention_dim
        )
        extra_context_tokens = self.norm(extra_context_tokens)
        return extra_context_tokens
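A quick sanity-check sketch of the two projectors above; the dimensions and the import path are arbitrary assumptions for illustration.

import torch
from src.lots.projectors import TokenProjector, SequenceProjModel   # import path assumed from this repo layout

token_proj = TokenProjector(embeddings_dim=768, cross_attention_dim=1024)
seq_proj = SequenceProjModel(cross_attention_dim=1024, embeddings_dim=768, extra_context_tokens=4)

tokens = torch.randn(2, 16, 768)       # (BS, L, embeddings_dim)
print(token_proj(tokens).shape)        # torch.Size([2, 16, 1024])
print(seq_proj(tokens).shape)          # torch.Size([2, 16, 4, 1024])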
src/sketchy/__init__.py
ADDED
File without changes
src/sketchy/sketchy_dataset.py
ADDED
@@ -0,0 +1,226 @@
from fashionpedia.fp import Fashionpedia
from PIL import Image, ImageOps
import os
import torch
from torch.utils.data import Dataset

class SketchyDataset(Dataset):

    def __init__(self, dataset_root, split='train',
                 load_img=False,
                 load_global_sketch=False,
                 load_local_sketch=False,
                 img_size=512,
                 img_transforms=None,
                 global_sketch_transforms=None,
                 local_sketch_transforms=None,
                 text_tokenizers=None,
                 with_shoes=False,  # shoes are not included by default
                 concat_locals=True,  # concatenate local descriptions to create the global description
                 compose_global_sketch=True,  # compose the global sketch from the local sketches instead of using the pre-computed one
                 ):
        self.root = dataset_root
        self.split = split
        self.load_img = load_img
        self.load_global_sketch = load_global_sketch
        self.load_local_sketch = load_local_sketch
        self.img_size = img_size
        self.img_transforms = img_transforms
        self.global_sketch_transforms = global_sketch_transforms
        self.local_sketch_transforms = local_sketch_transforms
        self.text_tokenizers = text_tokenizers
        self.concat_locals = concat_locals
        self.with_shoes = with_shoes
        self.compose_global_sketch = compose_global_sketch

        if self.compose_global_sketch:
            assert load_global_sketch and load_local_sketch, "Need to load both global and local sketches to compose the global sketch"

        self.json_path = os.path.join(self.root, f"{self.split}_sketchy.json")

        self.init_dataset(self.json_path)

    def init_dataset(self, json_path):
        self.fp = Fashionpedia(json_path)
        # go through the dataset and remove the shoes
        if not self.with_shoes:
            self.removeShoes()
        # get all images ids
        self.img_ids = list(self.fp.getImgIds())

    def collate_fn(self, batch):
        """ Use this when you are ok with having lists of different sizes in the batch"""
        return_dict = {}
        for key in batch[0].keys():
            if key == 'image':
                images = [d[key] for d in batch]
                if self.img_transforms is not None:
                    images = torch.stack(images)
                return_dict['image'] = images
            else:
                return_dict[key] = [d[key] for d in batch]
        return return_dict

    def __len__(self):
        return len(self.img_ids)

    def __getitem__(self, idx):
        return_dict = {}
        img_id = self.img_ids[idx]
        return_dict['image_id'] = img_id
        img_data = self.fp.loadImgs(img_id)[0]
        return_dict['img_data'] = img_data
        img_path = os.path.join(self.root, self.split, 'images', str(img_id), img_data['file_name'])
        global_sketch_path = os.path.join(self.root, self.split, 'full_sketches', str(img_id), str(img_id) + '.png')
        annotations = self.fp.loadAnns(self.fp.getAnnIds(img_id))
        return_dict['annotations'] = annotations
        return_dict['global_sketch_path'] = global_sketch_path

        return_dict['local_descriptions'] = [ann['description'].strip().lower() for ann in annotations]
        return_dict['local_descriptions_ann_ids'] = [ann['id'] for ann in annotations]
        if self.concat_locals:
            return_dict['global_description'] = ". ".join(return_dict['local_descriptions']).strip().lower()

        return_dict['local_sketches_paths'] = [os.path.join(self.root, self.split, 'partial_sketches', str(img_id), str(ann['id']) + '.png') for ann in annotations]
        if self.load_local_sketch:
            local_sketches = [Image.open(local_sketch_path).resize((self.img_size, self.img_size)) for local_sketch_path in return_dict['local_sketches_paths']]
            if self.compose_global_sketch:

                global_sketch = Image.new("L", (self.img_size, self.img_size), color=0)
                for local_sketch in local_sketches:
                    local_sketch = ImageOps.invert(local_sketch.convert("L"))
                    global_sketch.paste(local_sketch, (0, 0), local_sketch)
                global_sketch = ImageOps.invert(global_sketch)
                global_sketch = global_sketch.convert("RGB")
                if self.global_sketch_transforms is not None:
                    global_sketch = self.global_sketch_transforms(global_sketch)
                return_dict['global_sketch'] = global_sketch
            local_sketches = [local_sketch.convert("RGB") for local_sketch in local_sketches]
            if self.local_sketch_transforms is not None:
                local_sketches = [self.local_sketch_transforms(local_sketch) for local_sketch in local_sketches]
            return_dict['local_sketches'] = local_sketches
        else:
            return_dict['local_sketches'] = return_dict['local_sketches_paths']
        return_dict['image_path'] = img_path
        if self.load_img:
            image = Image.open(img_path).convert("RGB")
            image = image.resize((self.img_size, self.img_size))
            if self.img_transforms is not None:
                image = self.img_transforms(image)
            return_dict['image'] = image
        else:
            return_dict['image'] = img_path

        if not self.compose_global_sketch:
            if self.load_global_sketch:
                global_sketch = Image.open(global_sketch_path).convert("RGB")
                global_sketch = global_sketch.resize((self.img_size, self.img_size))
                if self.global_sketch_transforms is not None:
                    global_sketch = self.global_sketch_transforms(global_sketch)
                return_dict['global_sketch'] = global_sketch
            else:
                return_dict['global_sketch'] = global_sketch_path

        # process text with tokenizers if needed
        if self.text_tokenizers is not None:
            # first global description
            text = return_dict['global_description']
            if len(self.text_tokenizers) == 1:
                text_input_ids = self.text_tokenizers[0](
                    text,
                    max_length=self.text_tokenizers[0].model_max_length,
                    padding="max_length",
                    truncation=True,
                    return_tensors="pt"
                ).input_ids
                return_dict['global_description_ids'] = text_input_ids

                # then local descriptions
                local_descriptions = return_dict['local_descriptions']
                local_text_ids = []
                for text in local_descriptions:
                    text_input_ids = self.text_tokenizers[0](
                        text,
                        max_length=self.text_tokenizers[0].model_max_length,
                        padding="max_length",
                        truncation=True,
                        return_tensors="pt"
                    ).input_ids
                    local_text_ids.append(text_input_ids)
                return_dict['local_descriptions_ids'] = local_text_ids
            else:
                # get text and tokenize
                text_input_ids = self.text_tokenizers[0](
                    text,
                    max_length=self.text_tokenizers[0].model_max_length,
                    padding="max_length",
                    truncation=True,
                    return_tensors="pt"
                ).input_ids

                text_input_ids_2 = self.text_tokenizers[1](
                    text,
                    max_length=self.text_tokenizers[1].model_max_length,
                    padding="max_length",
                    truncation=True,
                    return_tensors="pt"
                ).input_ids
                return_dict['global_description_ids'] = text_input_ids
                return_dict['global_description_ids_2'] = text_input_ids_2

                # then local descriptions
                local_descriptions = return_dict['local_descriptions']
                local_text_ids = []
                for text in local_descriptions:
                    text_input_ids = self.text_tokenizers[0](
                        text,
                        max_length=self.text_tokenizers[0].model_max_length,
                        padding="max_length",
                        truncation=True,
                        return_tensors="pt"
                    ).input_ids
                    local_text_ids.append(text_input_ids)
                return_dict['local_descriptions_ids'] = local_text_ids
                local_text_ids_2 = []
                for text in local_descriptions:
                    text_input_ids_2 = self.text_tokenizers[1](
                        text,
                        max_length=self.text_tokenizers[1].model_max_length,
                        padding="max_length",
                        truncation=True,
                        return_tensors="pt"
                    ).input_ids
                    local_text_ids_2.append(text_input_ids_2)
                return_dict['local_descriptions_ids_2'] = local_text_ids_2
        return return_dict

    def ann2Mask(self, ann):
        mask = self.fp.annToMask(ann)*255
        mask = Image.fromarray(mask)
        mask = ImageOps.contain(mask, (ann['final_width'], ann['final_height']))
        padding = tuple(ann['padding'])
        mask = ImageOps.expand(mask, padding, fill='black')
        mask = mask.resize((self.img_size, self.img_size))
        return mask

    def removeShoes(self):
        # get the annotations from the fp object
        new_annotations = []
        for ann_id, ann in self.fp.anns.items():
            # remove all annotations with category_name "shoe"
            if ann['category_name'] != 'shoe':
                new_annotations.append(ann.copy())
        self.fp.dataset['annotations'] = new_annotations
        # re-create the index
        self.fp.createIndex()
        # get all images ids
        self.img_ids = list(self.fp.getImgIds())
        # remove images that have no annotations
        new_img_data = []
        for img_id, img_data in self.fp.imgs.items():
            anns = self.fp.loadAnns(self.fp.getAnnIds(img_id))
            if len(anns) > 0:
                new_img_data.append(img_data.copy())
        self.fp.dataset['images'] = new_img_data
        # re-create the index
        self.fp.createIndex()
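As a rough sketch of how this dataset is typically instantiated: the dataset root path below is a placeholder, and CLIPTokenizer is only one possible tokenizer choice, not something this file prescribes.

from torch.utils.data import DataLoader
from transformers import CLIPTokenizer
from src.sketchy.sketchy_dataset import SketchyDataset   # import path assumed from this repo layout

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")   # one possible tokenizer choice
dataset = SketchyDataset(
    "/path/to/sketchy",          # placeholder root containing {split}_sketchy.json and the image/sketch folders
    split="train",
    load_img=True,
    load_global_sketch=True,
    load_local_sketch=True,
    text_tokenizers=[tokenizer],
)
loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=dataset.collate_fn)
batch = next(iter(loader))       # dict of per-key lists (images are stacked only if img_transforms is set)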
src/utils/__init__.py
ADDED
File without changes
src/utils/dinov2_utils.py
ADDED
@@ -0,0 +1,55 @@
from transformers import AutoImageProcessor, AutoModel
import torch

def get_dinov2_model(model_type="vits14"):
    """Get DINOv2 model that returns full hidden states"""
    model_map = {
        'vits14': 'facebook/dinov2-small',
        'vitb14': 'facebook/dinov2-base',
        'vitl14': 'facebook/dinov2-large',
        'vitg14': 'facebook/dinov2-giant'
    }

    model = AutoModel.from_pretrained(model_map[model_type])
    return model

def get_feature_dim(model_type):
    """Get feature dimension based on model type"""
    dims = {
        'vits14': 384,
        'vitb14': 768,
        'vitl14': 1024,
        'vitg14': 1536
    }
    return dims[model_type]

def extract_features(image_features, pooling_type='cls'):
    """Extract features using different pooling strategies"""
    # image_features should be last_hidden_states with shape [batch_size, num_patches+1, hidden_dim]
    batch_size = image_features.shape[0]

    if pooling_type == 'cls':
        return image_features[:, 0]  # get CLS token
    elif pooling_type == 'avg':
        return torch.mean(image_features[:, 1:], dim=1)  # average over patches
    elif pooling_type == 'max':
        return torch.max(image_features[:, 1:], dim=1)[0]  # max over patches
    elif pooling_type == 'cls_max':
        cls_token = image_features[:, 0]
        max_pool = torch.max(image_features[:, 1:], dim=1)[0]
        return torch.cat([cls_token, max_pool], dim=-1)
    elif pooling_type == 'cls_avg':
        cls_token = image_features[:, 0]
        avg_pool = torch.mean(image_features[:, 1:], dim=1)
        return torch.cat([cls_token, avg_pool], dim=-1)
    else:
        raise ValueError(f"Unknown pooling type: {pooling_type}")

def get_pooling_dim(base_dim, pooling_type):
    """Returns the final feature dimension according to the pooling type"""
    if pooling_type in ['cls', 'avg', 'max']:
        return base_dim
    elif pooling_type in ['cls_max', 'cls_avg']:
        return base_dim * 2
    else:
        raise ValueError(f"Unknown pooling type: {pooling_type}")
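A hedged end-to-end sketch of these helpers; the dummy image, the 'vits14' choice, and the import path are assumptions for illustration only.

import torch
from PIL import Image
from transformers import AutoImageProcessor
from src.utils.dinov2_utils import (          # import path assumed from this repo layout
    get_dinov2_model, extract_features, get_feature_dim, get_pooling_dim,
)

model = get_dinov2_model("vits14")
processor = AutoImageProcessor.from_pretrained("facebook/dinov2-small")   # matches the 'vits14' entry above

image = Image.new("RGB", (224, 224))           # dummy image just to show the shapes
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    hidden = model(**inputs).last_hidden_state                # [1, num_patches + 1, 384]
feats = extract_features(hidden, pooling_type="cls_avg")       # [1, 768]
assert feats.shape[-1] == get_pooling_dim(get_feature_dim("vits14"), "cls_avg")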
src/utils/script_utils.py
ADDED
@@ -0,0 +1,100 @@
import random
import numpy as np
import torch
from transformers import PretrainedConfig
import torch.nn.functional as F

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def import_model_class_from_model_name_or_path(
    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
):
    text_encoder_config = PretrainedConfig.from_pretrained(
        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
    )
    model_class = text_encoder_config.architectures[0]

    if model_class == "CLIPTextModel":
        from transformers import CLIPTextModel

        return CLIPTextModel
    elif model_class == "CLIPTextModelWithProjection":
        from transformers import CLIPTextModelWithProjection

        return CLIPTextModelWithProjection
    else:
        raise ValueError(f"{model_class} is not supported.")

def encode_prompt(prompt_batch, text_encoders, tokenizers, proportion_empty_prompts=0):
    prompt_embeds_list = []
    captions = []
    if type(prompt_batch) == str:
        prompt_batch = [prompt_batch]
    for caption in prompt_batch:
        if random.random() < proportion_empty_prompts:
            # randomly replace some captions with empty ones
            captions.append("")
        elif isinstance(caption, str):
            # keep the caption
            captions.append(caption)
        elif isinstance(caption, (list, np.ndarray)):
            # This happens when passing multiple captions for the same image
            raise ValueError("Multiple captions were passed in the wrong format.")
        else:
            raise ValueError("Prompt is in the wrong format.")

    with torch.no_grad():
        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
            text_inputs = tokenizer(
                captions,
                padding="max_length",
                max_length=tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            untruncated_ids = tokenizer(captions, padding="longest", return_tensors="pt").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                text_input_ids, untruncated_ids
            ):
                removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
                print(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {tokenizer.model_max_length} tokens: {removed_text}"
                )

            prompt_embeds = text_encoder(
                text_input_ids.to(text_encoder.device),
                output_hidden_states=True,
            )

            # We are only interested in the pooled output of the final text encoder
            pooled_prompt_embeds = prompt_embeds[0]
            prompt_embeds = prompt_embeds.hidden_states[-2]
            bs_embed, seq_len, _ = prompt_embeds.shape
            prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
            prompt_embeds_list.append(prompt_embeds)

    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
    return prompt_embeds, pooled_prompt_embeds

def is_torch2_available():
    return hasattr(F, "scaled_dot_product_attention")

def get_generator(seed, device):

    if seed is not None:
        if isinstance(seed, list):
            generator = [torch.Generator(device).manual_seed(seed_item) for seed_item in seed]
        else:
            generator = torch.Generator(device).manual_seed(seed)
    else:
        generator = None

    return generator
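For completeness, a small usage sketch of the seeding helpers above; the seed values are arbitrary, the import path is assumed, and the "cuda" device is assumed to be available.

from src.utils.script_utils import set_seed, get_generator   # import path assumed from this repo layout

set_seed(42)                                  # seed python, numpy and torch RNGs
gen = get_generator(1234, "cuda")             # a single torch.Generator seeded with 1234
gens = get_generator([1, 2, 3, 4], "cuda")    # one generator per sample
# `gen` / `gens` can then be passed as the `generator` argument of a diffusers pipeline call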
static/LOTS.png
ADDED
Git LFS Details