Commit
·
b03f48b
verified
·
0
Parent(s):
Duplicate from davidelobba/TEMU-VTOFF
Browse files
Co-authored-by: Davide <davidelobba@users.noreply.huggingface.co>
- .gitattributes +36 -0
- README.md +68 -0
- teaser.png +3 -0
- transformer/config.json +22 -0
- transformer/diffusion_pytorch_model.safetensors +3 -0
- transformer_vton/config.json +16 -0
- transformer_vton/diffusion_pytorch_model.safetensors +3 -0
.gitattributes
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
teaser.png filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: cc-by-nc-4.0
|
| 3 |
+
base_model:
|
| 4 |
+
- stabilityai/stable-diffusion-3-medium-diffusers
|
| 5 |
+
pipeline_tag: image-to-image
|
| 6 |
+
tags:
|
| 7 |
+
- image-generation
|
| 8 |
+
- image-to-image
|
| 9 |
+
- virtual-try-on
|
| 10 |
+
- virtual-try-off
|
| 11 |
+
- diffusion
|
| 12 |
+
- dit
|
| 13 |
+
- stable-diffusion-3
|
| 14 |
+
- multimodal
|
| 15 |
+
- fashion
|
| 16 |
+
- pytorch
|
| 17 |
+
language: en
|
| 18 |
+
datasets:
|
| 19 |
+
- dresscode
|
| 20 |
+
- viton-hd
|
| 21 |
+
---
|
| 22 |
+
|
| 23 |
+
<div align="center">
|
| 24 |
+
<h1 align="center">TEMU-VTOFF</h1>
|
| 25 |
+
<h3 align="center">Text-Enhanced MUlti-category Virtual Try-Off</h3>
|
| 26 |
+
</div>
|
| 27 |
+
|
| 28 |
+
<div align="center">
|
| 29 |
+
<picture>
|
| 30 |
+
<source srcset="/davidelobba/TEMU-VTOFF/resolve/main/teaser.png" media="(prefers-color-scheme: dark)">
|
| 31 |
+
<img src="/davidelobba/TEMU-VTOFF/resolve/main/teaser.png" width="75%" alt="TEMU-VTOFF Teaser">
|
| 32 |
+
</source>
|
| 33 |
+
</picture>
|
| 34 |
+
</div>
|
| 35 |
+
|
| 36 |
+
<div align="center">
|
| 37 |
+
|
| 38 |
+
**Inverse Virtual Try-On: Generating Multi-Category Product-Style Images from Clothed Individuals**
|
| 39 |
+
[Davide Lobba](https://scholar.google.com/citations?user=WEMoLPEAAAAJ&hl=en&oi=ao)<sup>1,2,\*</sup>, [Fulvio Sanguigni](https://scholar.google.com/citations?user=tSpzMUEAAAAJ&hl=en)<sup>2,3,\*</sup>, [Bin Ren](https://scholar.google.com/citations?user=Md9maLYAAAAJ&hl=en)<sup>1,2</sup>, [Marcella Cornia](https://scholar.google.com/citations?user=DzgmSJEAAAAJ&hl=en)<sup>3</sup>, [Rita Cucchiara](https://scholar.google.com/citations?user=OM3sZEoAAAAJ&hl=en)<sup>3</sup>, [Nicu Sebe](https://scholar.google.com/citations?user=stFCYOAAAAAJ&hl=en)<sup>1</sup>
|
| 40 |
+
<sup>1</sup>University of Trento, <sup>2</sup>University of Pisa, <sup>3</sup>University of Modena and Reggio Emilia
|
| 41 |
+
<sup>*</sup> Equal contribution
|
| 42 |
+
</div>
|
| 43 |
+
|
| 44 |
+
<div align="center">
|
| 45 |
+
<a href="https://arxiv.org/abs/2505.21062" style="margin: 0 2px;">
|
| 46 |
+
<img src="https://img.shields.io/badge/Paper-Arxiv_2505.21062-darkred.svg" alt="Paper">
|
| 47 |
+
</a>
|
| 48 |
+
<a href="https://temu-vtoff-page.github.io/" style="margin: 0 2px;">
|
| 49 |
+
<img src='https://img.shields.io/badge/Webpage-Project-silver?style=flat&logo=&logoColor=orange' alt='Project Webpage'>
|
| 50 |
+
</a>
|
| 51 |
+
<a href="https://github.com/davidelobba/TEMU-VTOFF" style="margin: 0 2px;">
|
| 52 |
+
<img src="https://img.shields.io/badge/GitHub-Repo-blue.svg?logo=github" alt="GitHub Repository">
|
| 53 |
+
</a>
|
| 54 |
+
<!-- The Hugging Face model badge will be automatically displayed on the model page -->
|
| 55 |
+
</div>
|
| 56 |
+
|
| 57 |
+
## 💡 Model Description
|
| 58 |
+
|
| 59 |
+
**TEMU-VTOFF** is a novel dual-DiT (Diffusion Transformer) architecture designed for the Virtual Try-Off task: generating in-shop images of garments worn by a person. By combining a pretrained feature extractor with a text-enhanced generation module, our method can handle occlusions, multiple garment categories, and ambiguous appearances. It further refines generation fidelity via a feature alignment module based on DINOv2.
|
| 60 |
+
|
| 61 |
+
This model is based on `stabilityai/stable-diffusion-3-medium-diffusers`. The uploaded weights correspond to the finetuned feature extractor and the VTOFF DiT module.
|
| 62 |
+
|
| 63 |
+
## ✨ Key Features
|
| 64 |
+
Our contributions can be summarized as follows:
|
| 65 |
+
- **🎯 Multi-Category Try-Off**. We present a unified framework capable of handling multiple garment types (upper-body, lower-body, and full-body clothes) without requiring category-specific pipelines.
|
| 66 |
+
- **🔗 Multimodal Hybrid Attention**. We introduce a novel attention mechanism that integrates garment textual descriptions into the generative process by linking them with person-specific features. This helps the model synthesize occluded or ambiguous garment regions more accurately.
|
| 67 |
+
- **⚡ Garment Aligner Module**. We design a lightweight aligner that conditions generation on clean garment images, replacing conventional denoising objectives. This leads to more consistent alignment across the whole dataset and more precise retention of visual details.
|
| 68 |
+
- **📊 Extensive Experiments**. Experiments on the Dress Code and VITON-HD datasets demonstrate that TEMU-VTOFF outperforms prior methods in both the quality of generated images and alignment with the target garment, highlighting its strong generalization capabilities.
|
teaser.png
ADDED
|
Git LFS Details
|
transformer/config.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "SD3Transformer2DModel",
|
| 3 |
+
"_diffusers_version": "0.33.0.dev0",
|
| 4 |
+
"_name_or_path": "stabilityai/stable-diffusion-3-medium-diffusers",
|
| 5 |
+
"attention_head_dim": 64,
|
| 6 |
+
"caption_projection_dim": 1536,
|
| 7 |
+
"encoder_depth": 8,
|
| 8 |
+
"in_channels": 16,
|
| 9 |
+
"joint_attention_dim": 4096,
|
| 10 |
+
"num_attention_heads": 24,
|
| 11 |
+
"num_layers": 24,
|
| 12 |
+
"out_channels": 16,
|
| 13 |
+
"patch_size": 2,
|
| 14 |
+
"pooled_projection_dim": 2048,
|
| 15 |
+
"pos_embed_max_size": 192,
|
| 16 |
+
"probing_method": "conv",
|
| 17 |
+
"projector_dim": 2048,
|
| 18 |
+
"sample_size": 128,
|
| 19 |
+
"z_dims": [
|
| 20 |
+
768
|
| 21 |
+
]
|
| 22 |
+
}
|
transformer/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d11d615e99e02ecbe64e063eb705b4a38361a547348d10440441bfe891111e09
|
| 3 |
+
size 4254964176
|
transformer_vton/config.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "SD3Transformer2DModel",
|
| 3 |
+
"_diffusers_version": "0.33.0.dev0",
|
| 4 |
+
"_name_or_path": "stabilityai/stable-diffusion-3-medium-diffusers",
|
| 5 |
+
"attention_head_dim": 64,
|
| 6 |
+
"caption_projection_dim": 1536,
|
| 7 |
+
"in_channels": 33,
|
| 8 |
+
"joint_attention_dim": 4096,
|
| 9 |
+
"num_attention_heads": 24,
|
| 10 |
+
"num_layers": 24,
|
| 11 |
+
"out_channels": 16,
|
| 12 |
+
"patch_size": 2,
|
| 13 |
+
"pooled_projection_dim": 2048,
|
| 14 |
+
"pos_embed_max_size": 192,
|
| 15 |
+
"sample_size": 128
|
| 16 |
+
}
|
transformer_vton/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:108ad85011099576200585917042628a0e2b4ee2af09cfba3723e96a9d9a19a1
|
| 3 |
+
size 3830214056
|