Upload SegformerForSemanticSegmentation
Browse files- config.json +3 -56
- model.py +63 -3
config.json
CHANGED
|
@@ -2,27 +2,10 @@
|
|
| 2 |
"architectures": [
|
| 3 |
"SegformerForSemanticSegmentation"
|
| 4 |
],
|
| 5 |
-
"attention_probs_dropout_prob": 0.0,
|
| 6 |
"auto_map": {
|
|
|
|
| 7 |
"AutoModelForImageSegmentation": "model.SegformerForSemanticSegmentation"
|
| 8 |
},
|
| 9 |
-
"classifier_dropout_prob": 0.1,
|
| 10 |
-
"decoder_hidden_size": 256,
|
| 11 |
-
"depths": [
|
| 12 |
-
2,
|
| 13 |
-
2,
|
| 14 |
-
2,
|
| 15 |
-
2
|
| 16 |
-
],
|
| 17 |
-
"drop_path_rate": 0.1,
|
| 18 |
-
"hidden_act": "gelu",
|
| 19 |
-
"hidden_dropout_prob": 0.0,
|
| 20 |
-
"hidden_sizes": [
|
| 21 |
-
32,
|
| 22 |
-
64,
|
| 23 |
-
160,
|
| 24 |
-
256
|
| 25 |
-
],
|
| 26 |
"id2label": {
|
| 27 |
"0": "skin",
|
| 28 |
"1": "l_brow",
|
|
@@ -43,7 +26,6 @@
|
|
| 43 |
"16": "hair",
|
| 44 |
"17": "hat"
|
| 45 |
},
|
| 46 |
-
"initializer_range": 0.02,
|
| 47 |
"label2id": {
|
| 48 |
"cloth": 15,
|
| 49 |
"ear_r": 8,
|
|
@@ -64,43 +46,8 @@
|
|
| 64 |
"skin": 0,
|
| 65 |
"u_lip": 11
|
| 66 |
},
|
| 67 |
-
"
|
| 68 |
-
"mlp_ratios": [
|
| 69 |
-
4,
|
| 70 |
-
4,
|
| 71 |
-
4,
|
| 72 |
-
4
|
| 73 |
-
],
|
| 74 |
-
"model_type": "segformer",
|
| 75 |
-
"num_attention_heads": [
|
| 76 |
-
1,
|
| 77 |
-
2,
|
| 78 |
-
5,
|
| 79 |
-
8
|
| 80 |
-
],
|
| 81 |
-
"num_channels": 3,
|
| 82 |
"num_classes": 18,
|
| 83 |
-
"num_encoder_blocks": 4,
|
| 84 |
-
"patch_sizes": [
|
| 85 |
-
7,
|
| 86 |
-
3,
|
| 87 |
-
3,
|
| 88 |
-
3
|
| 89 |
-
],
|
| 90 |
-
"reshape_last_stage": true,
|
| 91 |
-
"semantic_loss_ignore_index": 255,
|
| 92 |
-
"sr_ratios": [
|
| 93 |
-
8,
|
| 94 |
-
4,
|
| 95 |
-
2,
|
| 96 |
-
1
|
| 97 |
-
],
|
| 98 |
-
"strides": [
|
| 99 |
-
4,
|
| 100 |
-
2,
|
| 101 |
-
2,
|
| 102 |
-
2
|
| 103 |
-
],
|
| 104 |
"torch_dtype": "float32",
|
| 105 |
-
"transformers_version": "4.
|
| 106 |
}
|
|
|
|
| 2 |
"architectures": [
|
| 3 |
"SegformerForSemanticSegmentation"
|
| 4 |
],
|
|
|
|
| 5 |
"auto_map": {
|
| 6 |
+
"AutoConfig": "model.FaceSegmenterConfig",
|
| 7 |
"AutoModelForImageSegmentation": "model.SegformerForSemanticSegmentation"
|
| 8 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
"id2label": {
|
| 10 |
"0": "skin",
|
| 11 |
"1": "l_brow",
|
|
|
|
| 26 |
"16": "hair",
|
| 27 |
"17": "hat"
|
| 28 |
},
|
|
|
|
| 29 |
"label2id": {
|
| 30 |
"cloth": 15,
|
| 31 |
"ear_r": 8,
|
|
|
|
| 46 |
"skin": 0,
|
| 47 |
"u_lip": 11
|
| 48 |
},
|
| 49 |
+
"model_type": "image-segmentation",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
"num_classes": 18,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
"torch_dtype": "float32",
|
| 52 |
+
"transformers_version": "4.37.0"
|
| 53 |
}
|
model.py
CHANGED
|
@@ -4,6 +4,65 @@ from torch import nn
|
|
| 4 |
from transformers.modeling_outputs import SemanticSegmenterOutput
|
| 5 |
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
def encode_down(c_in: int, c_out: int):
|
| 8 |
return nn.Sequential(
|
| 9 |
nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=3, padding=1),
|
|
@@ -28,7 +87,7 @@ class FaceUNet(nn.Module):
|
|
| 28 |
def __init__(self, num_classes: int):
|
| 29 |
super().__init__()
|
| 30 |
self.num_classes = num_classes
|
| 31 |
-
|
| 32 |
self.down_1 = nn.Conv2d(
|
| 33 |
in_channels=3,
|
| 34 |
out_channels=64,
|
|
@@ -42,6 +101,7 @@ class FaceUNet(nn.Module):
|
|
| 42 |
|
| 43 |
self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
|
| 44 |
|
|
|
|
| 45 |
self.up_1 = decode_up(1024)
|
| 46 |
self.up_c1 = encode_down(1024, 512)
|
| 47 |
self.up_2 = decode_up(512)
|
|
@@ -83,7 +143,7 @@ class FaceUNet(nn.Module):
|
|
| 83 |
|
| 84 |
|
| 85 |
class Segformer(transformers.PreTrainedModel):
|
| 86 |
-
config_class =
|
| 87 |
|
| 88 |
def __init__(self, config):
|
| 89 |
super().__init__(config)
|
|
@@ -95,7 +155,7 @@ class Segformer(transformers.PreTrainedModel):
|
|
| 95 |
|
| 96 |
|
| 97 |
class SegformerForSemanticSegmentation(transformers.PreTrainedModel):
|
| 98 |
-
config_class =
|
| 99 |
|
| 100 |
def __init__(self, config):
|
| 101 |
super().__init__(config)
|
|
|
|
| 4 |
from transformers.modeling_outputs import SemanticSegmenterOutput
|
| 5 |
|
| 6 |
|
| 7 |
+
class FaceSegmenterConfig(transformers.PretrainedConfig):
    """Configuration for the face-segmentation model.

    Carries the 18 face-parsing class labels (skin, brows, eyes, ears,
    nose, mouth, lips, neck, cloth, hair, hat, ...) as ``id2label`` /
    ``label2id`` maps plus ``num_classes``.

    Keyword Args:
        id2label (dict, optional): mapping of class id -> label name.
            Keys are normalized to ``int`` because JSON round-trips (and
            pipeline loading) turn them into strings.
        label2id (dict, optional): inverse mapping of label name -> id.
        num_classes (int, optional): number of classes; defaults to
            ``len(id2label)``.
    """

    model_type = "image-segmentation"

    # Class-level defaults; treated as read-only templates (instances
    # always work on copies, see __init__).
    _id2label = {
        0: "skin",
        1: "l_brow",
        2: "r_brow",
        3: "l_eye",
        4: "r_eye",
        5: "eye_g",
        6: "l_ear",
        7: "r_ear",
        8: "ear_r",
        9: "nose",
        10: "mouth",
        11: "u_lip",
        12: "l_lip",
        13: "neck",
        14: "neck_l",
        15: "cloth",
        16: "hair",
        17: "hat",
    }

    _label2id = {
        "skin": 0,
        "l_brow": 1,
        "r_brow": 2,
        "l_eye": 3,
        "r_eye": 4,
        "eye_g": 5,
        "l_ear": 6,
        "r_ear": 7,
        "ear_r": 8,
        "nose": 9,
        "mouth": 10,
        "u_lip": 11,
        "l_lip": 12,
        "neck": 13,
        "neck_l": 14,
        "cloth": 15,
        "hair": 16,
        "hat": 17,
    }

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # Copy before normalizing: the original code aliased the
        # class-level _id2label when the kwarg was absent and then
        # popped/reinserted keys in place, silently rewriting the shared
        # default dict for every instance.
        id2label = dict(kwargs.get("id2label", self._id2label))

        # For some reason the int keys are getting converted to strings
        # when used in pipelines; normalize them back to ints.
        self.id2label = {int(label_id): label for label_id, label in id2label.items()}

        # Copy here too so instances never share (or mutate) _label2id.
        self.label2id = dict(kwargs.get("label2id", self._label2id))
        self.num_classes = kwargs.get("num_classes", len(self.id2label))
|
| 64 |
+
|
| 65 |
+
|
| 66 |
def encode_down(c_in: int, c_out: int):
|
| 67 |
return nn.Sequential(
|
| 68 |
nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=3, padding=1),
|
|
|
|
| 87 |
def __init__(self, num_classes: int):
|
| 88 |
super().__init__()
|
| 89 |
self.num_classes = num_classes
|
| 90 |
+
# unet
|
| 91 |
self.down_1 = nn.Conv2d(
|
| 92 |
in_channels=3,
|
| 93 |
out_channels=64,
|
|
|
|
| 101 |
|
| 102 |
self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
|
| 103 |
|
| 104 |
+
            # Below, `in_channels` again becomes 1024 as we are concatenating.
|
| 105 |
self.up_1 = decode_up(1024)
|
| 106 |
self.up_c1 = encode_down(1024, 512)
|
| 107 |
self.up_2 = decode_up(512)
|
|
|
|
| 143 |
|
| 144 |
|
| 145 |
class Segformer(transformers.PreTrainedModel):
|
| 146 |
+
config_class = FaceSegmenterConfig
|
| 147 |
|
| 148 |
def __init__(self, config):
|
| 149 |
super().__init__(config)
|
|
|
|
| 155 |
|
| 156 |
|
| 157 |
class SegformerForSemanticSegmentation(transformers.PreTrainedModel):
|
| 158 |
+
config_class = FaceSegmenterConfig
|
| 159 |
|
| 160 |
def __init__(self, config):
|
| 161 |
super().__init__(config)
|