Dan Bochman committed · a92eba4
Parent(s): none (initial commit)

Initial release: FASHN Human Parser - SegFormer-B4 for human parsing
Browse files:
- .gitattributes +35 -0
- README.md +144 -0
- config.json +110 -0
- model.safetensors +3 -0
- preprocessor_config.json +15 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
```
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
```
README.md
ADDED
@@ -0,0 +1,144 @@
---
license: other
license_name: nvidia-segformer
license_link: https://github.com/NVlabs/SegFormer/blob/master/LICENSE
library_name: transformers
pipeline_tag: image-segmentation
tags:
- segformer
- human-parsing
- semantic-segmentation
- fashion
- virtual-try-on
language:
- en
---

# FASHN Human Parser

A SegFormer-B4 model fine-tuned for human parsing with 18 semantic classes, optimized for fashion and virtual try-on applications.

## Model Description

This model segments human images into 18 semantic categories: background, body parts (face, hair, arms, hands, legs, feet, torso), clothing items (top, dress, skirt, pants, belt, scarf), and accessories (bag, hat, glasses, jewelry).

- **Architecture**: SegFormer-B4 (MiT-B4 encoder + lightweight all-MLP decoder)
- **Input Size**: 384 × 576 (width × height)
- **Output**: 18-class semantic segmentation mask
- **Base Model**: [nvidia/mit-b4](https://huggingface.co/nvidia/mit-b4)

## Usage

### Quick Start with Pipeline

```python
from transformers import pipeline

parser = pipeline("image-segmentation", model="fashn-ai/fashn-human-parser")
result = parser("path/to/image.jpg")
# result: list of dicts, one per class found in the image,
# each with "label" and a binary PIL "mask"
```

### Explicit Usage

```python
from transformers import SegformerForSemanticSegmentation, SegformerImageProcessor
from PIL import Image
import torch

# Load model and processor
processor = SegformerImageProcessor.from_pretrained("fashn-ai/fashn-human-parser")
model = SegformerForSemanticSegmentation.from_pretrained("fashn-ai/fashn-human-parser")

# Load and preprocess image
image = Image.open("path/to/image.jpg").convert("RGB")
inputs = processor(images=image, return_tensors="pt")

# Inference
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits  # (1, 18, H/4, W/4)

# Upsample to the original image size and take the per-pixel argmax
upsampled = torch.nn.functional.interpolate(
    logits, size=image.size[::-1], mode="bilinear", align_corners=False
)
predictions = upsampled.argmax(dim=1).squeeze().numpy()
```
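The integer prediction map can be translated back to label names through `model.config.id2label`; a minimal sketch, continuing from the variables above:

```python
import numpy as np

# Report which classes appear in the prediction and how much area each covers
for class_id in np.unique(predictions):
    label = model.config.id2label[int(class_id)]
    coverage = (predictions == class_id).mean() * 100
    print(f"{label}: {coverage:.1f}% of pixels")
```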

### Production Usage (Recommended)

For production applications requiring maximum accuracy, we recommend our Python package, which implements the exact preprocessing used during training:

```bash
pip install fashn-human-parser
```

```python
from fashn_human_parser import FashnHumanParser
from PIL import Image

image = Image.open("path/to/image.jpg")
parser = FashnHumanParser(device="cuda")
segmentation = parser.predict(image)
```

## Label Definitions

| ID | Label |
|----|-------|
| 0 | background |
| 1 | face |
| 2 | hair |
| 3 | top |
| 4 | dress |
| 5 | skirt |
| 6 | pants |
| 7 | belt |
| 8 | bag |
| 9 | hat |
| 10 | scarf |
| 11 | glasses |
| 12 | arms |
| 13 | hands |
| 14 | legs |
| 15 | feet |
| 16 | torso |
| 17 | jewelry |

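As a usage illustration (not part of the original examples), a binary mask for any one label can be cut from the prediction map produced in the explicit-usage example:

```python
import numpy as np
from PIL import Image

HAIR = 2  # label id from the table above

# predictions: (H, W) integer array from the explicit-usage example
hair_mask = (predictions == HAIR).astype(np.uint8) * 255
Image.fromarray(hair_mask, mode="L").save("hair_mask.png")
```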
### Category Mappings

For virtual try-on applications:

| Category | Body Coverage | Relevant Labels |
|----------|---------------|-----------------|
| Tops | Upper body | top, dress, scarf |
| Bottoms | Lower body | skirt, pants, belt |
| One-pieces | Full body | top, dress, scarf, skirt, pants, belt |

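For instance, a combined garment mask for the Tops category can be built by OR-ing its constituent label ids; a sketch, again assuming `predictions` from the earlier example:

```python
import numpy as np

TOPS_LABELS = [3, 4, 10]  # top, dress, scarf (ids from the label table)

# Boolean (H, W) mask covering every pixel assigned to any "Tops" label
tops_mask = np.isin(predictions, TOPS_LABELS)
```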
### Identity Labels

Labels typically preserved during virtual try-on: `face`, `hair`, `jewelry`, `bag`, `glasses`, `hat`

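A hypothetical compositing step might keep those identity regions from the source image while leaving everything else to be regenerated; a sketch assuming `image` and `predictions` from the explicit-usage example:

```python
import numpy as np

IDENTITY_LABELS = [1, 2, 8, 9, 11, 17]  # face, hair, bag, hat, glasses, jewelry

identity_mask = np.isin(predictions, IDENTITY_LABELS)  # (H, W) bool
# Keep identity pixels from the source image, zero out everything else
preserved = np.where(identity_mask[..., None], np.array(image), 0)
```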
## Training

This model was fine-tuned on a proprietary dataset curated and annotated by FASHN AI, designed specifically for virtual try-on applications. The 18-class label schema was developed to capture the semantic regions most relevant for clothing transfer and human body understanding in fashion contexts.

## Limitations

- Optimized for single-person images in which the subject is clearly visible
- Best results on fashion/e-commerce style photography
- Input images are resized to 384x576, so very small subjects may lose detail; see the padding sketch below for one way to avoid aspect-ratio distortion

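One way to mitigate the fixed-resolution resize is to pad the image to the model's 384:576 (2:3) aspect ratio before parsing, so the resize does not distort body proportions. A minimal sketch, not part of the official preprocessing:

```python
from PIL import Image

def pad_to_aspect(image: Image.Image, target_ratio: float = 384 / 576) -> Image.Image:
    """Pad with a neutral border so resizing to 384x576 keeps proportions intact."""
    w, h = image.size
    if w / h < target_ratio:  # image too narrow/tall: pad width
        new_w = int(round(h * target_ratio))
        canvas = Image.new("RGB", (new_w, h), (128, 128, 128))
        canvas.paste(image, ((new_w - w) // 2, 0))
    else:  # image too wide: pad height
        new_h = int(round(w / target_ratio))
        canvas = Image.new("RGB", (w, new_h), (128, 128, 128))
        canvas.paste(image, (0, (new_h - h) // 2))
    return canvas
    # Crop the upsampled prediction back to the original region afterwards.
```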
## Citation

```bibtex
@misc{fashn-human-parser,
  author = {FASHN AI},
  title = {FASHN Human Parser: SegFormer for Fashion Human Parsing},
  year = {2024},
  publisher = {Hugging Face},
  url = {https://huggingface.co/fashn-ai/fashn-human-parser}
}
```

## License

This model inherits the [NVIDIA Source Code License for SegFormer](https://github.com/NVlabs/SegFormer/blob/master/LICENSE). Please review the license terms before use.
config.json
ADDED
@@ -0,0 +1,110 @@
```json
{
  "_name_or_path": "nvidia/mit-b4",
  "architectures": ["SegformerForSemanticSegmentation"],
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout_prob": 0.1,
  "decoder_hidden_size": 768,
  "depths": [3, 8, 27, 3],
  "downsampling_rates": [1, 4, 8, 16],
  "drop_path_rate": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_sizes": [64, 128, 320, 512],
  "id2label": {
    "0": "background",
    "1": "face",
    "2": "hair",
    "3": "top",
    "4": "dress",
    "5": "skirt",
    "6": "pants",
    "7": "belt",
    "8": "bag",
    "9": "hat",
    "10": "scarf",
    "11": "glasses",
    "12": "arms",
    "13": "hands",
    "14": "legs",
    "15": "feet",
    "16": "torso",
    "17": "jewelry"
  },
  "image_size": 224,
  "initializer_range": 0.02,
  "label2id": {
    "arms": 12,
    "background": 0,
    "bag": 8,
    "belt": 7,
    "dress": 4,
    "face": 1,
    "feet": 15,
    "glasses": 11,
    "hair": 2,
    "hands": 13,
    "hat": 9,
    "jewelry": 17,
    "legs": 14,
    "pants": 6,
    "scarf": 10,
    "skirt": 5,
    "top": 3,
    "torso": 16
  },
  "layer_norm_eps": 1e-06,
  "mlp_ratios": [4, 4, 4, 4],
  "model_type": "segformer",
  "num_attention_heads": [1, 2, 5, 8],
  "num_channels": 3,
  "num_encoder_blocks": 4,
  "patch_sizes": [7, 3, 3, 3],
  "reshape_last_stage": true,
  "semantic_loss_ignore_index": 255,
  "sr_ratios": [8, 4, 2, 1],
  "strides": [4, 2, 2, 2],
  "torch_dtype": "float32",
  "transformers_version": "4.42.4"
}
```
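Since `id2label`/`label2id` ship in this config, label ids never need to be hard-coded downstream; a small sketch of reading them at runtime:

```python
from transformers import SegformerConfig

# Load the config above directly from the Hub
config = SegformerConfig.from_pretrained("fashn-ai/fashn-human-parser")
print(config.num_labels)           # 18
print(config.id2label[3])          # "top"
print(config.label2id["jewelry"])  # 17
```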
model.safetensors
ADDED
@@ -0,0 +1,3 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:e43c8c8a9b04f28798f0a4630cf18caa2cdb27a0d454fae43a5716e6f7078244
size 256146352
```
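As a sanity check on the checkpoint size: 256,146,352 bytes ÷ 4 bytes per float32 parameter ≈ 64.0M parameters, in line with the roughly 64M parameters reported for SegFormer-B4.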
preprocessor_config.json
ADDED
@@ -0,0 +1,15 @@
```json
{
  "do_reduce_labels": false,
  "do_rescale": true,
  "do_resize": true,
  "do_normalize": true,
  "image_mean": [0.485, 0.456, 0.406],
  "image_std": [0.229, 0.224, 0.225],
  "image_processor_type": "SegformerImageProcessor",
  "resample": 1,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 576,
    "width": 384
  }
}
```
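Per this config, preprocessing resizes to 384×576 with `resample: 1` (PIL's LANCZOS filter), rescales by 1/255, then normalizes with ImageNet statistics. A minimal sketch of the equivalent manual pipeline, as an illustration of the config rather than a replacement for `SegformerImageProcessor`:

```python
import numpy as np
import torch
from PIL import Image

def preprocess(image: Image.Image) -> torch.Tensor:
    # do_resize: width=384, height=576, resample=1 (PIL LANCZOS)
    image = image.convert("RGB").resize((384, 576), Image.LANCZOS)
    # do_rescale: multiply by rescale_factor = 1/255
    pixels = np.asarray(image, dtype=np.float32) / 255.0
    # do_normalize: subtract image_mean, divide by image_std, per channel
    pixels = (pixels - [0.485, 0.456, 0.406]) / [0.229, 0.224, 0.225]
    # HWC -> NCHW, matching processor(images=..., return_tensors="pt")
    return torch.from_numpy(pixels.astype(np.float32)).permute(2, 0, 1).unsqueeze(0)
```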