Dan Bochman committed on
Commit
a92eba4
·
0 Parent(s):

Initial release: FASHN Human Parser - SegFormer-B4 for human parsing

Browse files
Files changed (5) hide show
  1. .gitattributes +35 -0
  2. README.md +144 -0
  3. config.json +110 -0
  4. model.safetensors +3 -0
  5. preprocessor_config.json +15 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ license_name: nvidia-segformer
4
+ license_link: https://github.com/NVlabs/SegFormer/blob/master/LICENSE
5
+ library_name: transformers
6
+ pipeline_tag: image-segmentation
7
+ tags:
8
+ - segformer
9
+ - human-parsing
10
+ - semantic-segmentation
11
+ - fashion
12
+ - virtual-try-on
13
+ language:
14
+ - en
15
+ ---
16
+
17
+ # FASHN Human Parser
18
+
19
+ A SegFormer-B4 model fine-tuned for human parsing with 18 semantic classes, optimized for fashion and virtual try-on applications.
20
+
21
+ ## Model Description
22
+
23
+ This model segments human images into 18 semantic categories including body parts (face, hair, arms, hands, legs, feet, torso), clothing items (top, dress, skirt, pants, belt, scarf), and accessories (bag, hat, glasses, jewelry).
24
+
25
+ - **Architecture**: SegFormer-B4 (MIT-B4 encoder + MLP decoder)
26
+ - **Input Size**: 384 x 576 (width x height)
27
+ - **Output**: 18-class semantic segmentation mask
28
+ - **Base Model**: [nvidia/mit-b4](https://huggingface.co/nvidia/mit-b4)
29
+
30
+ ## Usage
31
+
32
+ ### Quick Start with Pipeline
33
+
34
+ ```python
35
+ from transformers import pipeline
36
+
37
+ parser = pipeline("image-segmentation", model="fashn-ai/fashn-human-parser")
38
+ result = parser("path/to/image.jpg")
39
+ ```
40
+
41
+ ### Explicit Usage
42
+
43
+ ```python
44
+ from transformers import SegformerForSemanticSegmentation, SegformerImageProcessor
45
+ from PIL import Image
46
+ import torch
47
+
48
+ # Load model and processor
49
+ processor = SegformerImageProcessor.from_pretrained("fashn-ai/fashn-human-parser")
50
+ model = SegformerForSemanticSegmentation.from_pretrained("fashn-ai/fashn-human-parser")
51
+
52
+ # Load and preprocess image
53
+ image = Image.open("path/to/image.jpg")
54
+ inputs = processor(images=image, return_tensors="pt")
55
+
56
+ # Inference
57
+ with torch.no_grad():
58
+ outputs = model(**inputs)
59
+ logits = outputs.logits # (1, 18, H/4, W/4)
60
+
61
+ # Upsample to original size and get predictions
62
+ upsampled = torch.nn.functional.interpolate(
63
+ logits, size=image.size[::-1], mode="bilinear", align_corners=False
64
+ )
65
+ predictions = upsampled.argmax(dim=1).squeeze().numpy()
66
+ ```
67
+
68
+ ### Production Usage (Recommended)
69
+
70
+ For production applications requiring maximum accuracy, we recommend using our Python package which implements the exact preprocessing used during training:
71
+
72
+ ```bash
73
+ pip install fashn-human-parser
74
+ ```
75
+
76
+ ```python
77
+ from fashn_human_parser import FashnHumanParser
78
+
79
+ parser = FashnHumanParser(device="cuda")
80
+ segmentation = parser.predict(image)
81
+ ```
82
+
83
+ ## Label Definitions
84
+
85
+ | ID | Label |
86
+ |----|-------|
87
+ | 0 | background |
88
+ | 1 | face |
89
+ | 2 | hair |
90
+ | 3 | top |
91
+ | 4 | dress |
92
+ | 5 | skirt |
93
+ | 6 | pants |
94
+ | 7 | belt |
95
+ | 8 | bag |
96
+ | 9 | hat |
97
+ | 10 | scarf |
98
+ | 11 | glasses |
99
+ | 12 | arms |
100
+ | 13 | hands |
101
+ | 14 | legs |
102
+ | 15 | feet |
103
+ | 16 | torso |
104
+ | 17 | jewelry |
105
+
106
+ ### Category Mappings
107
+
108
+ For virtual try-on applications:
109
+
110
+ | Category | Body Coverage | Relevant Labels |
111
+ |----------|--------------|-----------------|
112
+ | Tops | Upper body | top, dress, scarf |
113
+ | Bottoms | Lower body | skirt, pants, belt |
114
+ | One-pieces | Full body | top, dress, scarf, skirt, pants, belt |
115
+
116
+ ### Identity Labels
117
+
118
+ Labels typically preserved during virtual try-on: `face`, `hair`, `jewelry`, `bag`, `glasses`, `hat`
119
+
120
+ ## Training
121
+
122
+ This model was fine-tuned on a proprietary dataset curated and annotated by FASHN AI, specifically designed for virtual try-on applications. The 18-class label schema was developed to capture the semantic regions most relevant for clothing transfer and human body understanding in fashion contexts.
123
+
124
+ ## Limitations
125
+
126
+ - Optimized for single-person images with clear visibility
127
+ - Best results on fashion/e-commerce style photography
128
+ - Input images are resized to 384x576; very small subjects may lose detail
129
+
130
+ ## Citation
131
+
132
+ ```bibtex
133
+ @misc{fashn-human-parser,
134
+ author = {FASHN AI},
135
+ title = {FASHN Human Parser: SegFormer for Fashion Human Parsing},
136
+ year = {2024},
137
+ publisher = {Hugging Face},
138
+ url = {https://huggingface.co/fashn-ai/fashn-human-parser}
139
+ }
140
+ ```
141
+
142
+ ## License
143
+
144
+ This model inherits the [NVIDIA Source Code License for SegFormer](https://github.com/NVlabs/SegFormer/blob/master/LICENSE). Please review the license terms before use.
config.json ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "nvidia/mit-b4",
3
+ "architectures": [
4
+ "SegformerForSemanticSegmentation"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "classifier_dropout_prob": 0.1,
8
+ "decoder_hidden_size": 768,
9
+ "depths": [
10
+ 3,
11
+ 8,
12
+ 27,
13
+ 3
14
+ ],
15
+ "downsampling_rates": [
16
+ 1,
17
+ 4,
18
+ 8,
19
+ 16
20
+ ],
21
+ "drop_path_rate": 0.1,
22
+ "hidden_act": "gelu",
23
+ "hidden_dropout_prob": 0.0,
24
+ "hidden_sizes": [
25
+ 64,
26
+ 128,
27
+ 320,
28
+ 512
29
+ ],
30
+ "id2label": {
31
+ "0": "background",
32
+ "1": "face",
33
+ "2": "hair",
34
+ "3": "top",
35
+ "4": "dress",
36
+ "5": "skirt",
37
+ "6": "pants",
38
+ "7": "belt",
39
+ "8": "bag",
40
+ "9": "hat",
41
+ "10": "scarf",
42
+ "11": "glasses",
43
+ "12": "arms",
44
+ "13": "hands",
45
+ "14": "legs",
46
+ "15": "feet",
47
+ "16": "torso",
48
+ "17": "jewelry"
49
+ },
50
+ "image_size": 224,
51
+ "initializer_range": 0.02,
52
+ "label2id": {
53
+ "arms": 12,
54
+ "background": 0,
55
+ "bag": 8,
56
+ "belt": 7,
57
+ "dress": 4,
58
+ "face": 1,
59
+ "feet": 15,
60
+ "glasses": 11,
61
+ "hair": 2,
62
+ "hands": 13,
63
+ "hat": 9,
64
+ "jewelry": 17,
65
+ "legs": 14,
66
+ "pants": 6,
67
+ "scarf": 10,
68
+ "skirt": 5,
69
+ "top": 3,
70
+ "torso": 16
71
+ },
72
+ "layer_norm_eps": 1e-06,
73
+ "mlp_ratios": [
74
+ 4,
75
+ 4,
76
+ 4,
77
+ 4
78
+ ],
79
+ "model_type": "segformer",
80
+ "num_attention_heads": [
81
+ 1,
82
+ 2,
83
+ 5,
84
+ 8
85
+ ],
86
+ "num_channels": 3,
87
+ "num_encoder_blocks": 4,
88
+ "patch_sizes": [
89
+ 7,
90
+ 3,
91
+ 3,
92
+ 3
93
+ ],
94
+ "reshape_last_stage": true,
95
+ "semantic_loss_ignore_index": 255,
96
+ "sr_ratios": [
97
+ 8,
98
+ 4,
99
+ 2,
100
+ 1
101
+ ],
102
+ "strides": [
103
+ 4,
104
+ 2,
105
+ 2,
106
+ 2
107
+ ],
108
+ "torch_dtype": "float32",
109
+ "transformers_version": "4.42.4"
110
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e43c8c8a9b04f28798f0a4630cf18caa2cdb27a0d454fae43a5716e6f7078244
3
+ size 256146352
preprocessor_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_reduce_labels": false,
3
+ "do_rescale": true,
4
+ "do_resize": true,
5
+ "do_normalize": true,
6
+ "image_mean": [0.485, 0.456, 0.406],
7
+ "image_std": [0.229, 0.224, 0.225],
8
+ "image_processor_type": "SegformerImageProcessor",
9
+ "resample": 1,
10
+ "rescale_factor": 0.00392156862745098,
11
+ "size": {
12
+ "height": 576,
13
+ "width": 384
14
+ }
15
+ }