Add custom model and pipeline to make usage easier.

by tcapelle - opened Dec 15, 2024

←

Files changed (6) hide show

config.json CHANGED Viewed

@@ -1,8 +1,22 @@
 {
   "architectures": [
-    "MultiHeadDebertaForSequenceClassification"
   ],
   "attention_probs_dropout_prob": 0.1,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
@@ -11,9 +25,10 @@
   "layer_norm_eps": 1e-07,
   "max_position_embeddings": 512,
   "max_relative_positions": -1,
-  "model_type": "deberta-v2",
   "norm_rel_ebd": "layer_norm",
   "num_attention_heads": 12,
   "num_hidden_layers": 6,
   "pad_token_id": 0,
   "pooler_dropout": 0,
@@ -28,7 +43,7 @@
   "relative_attention": true,
   "share_att_key": true,
   "torch_dtype": "float32",
-  "transformers_version": "4.45.2",
   "type_vocab_size": 0,
   "vocab_size": 128100
 }

 {
+  "_name_or_path": "./celadon",
   "architectures": [
+    "MultiHeadDebertaForSequenceClassificationModel"
   ],
   "attention_probs_dropout_prob": 0.1,
+  "auto_map": {
+    "AutoConfig": "configuration_deberta_multi.MultiHeadDebertaV2Config",
+    "AutoModelForSequenceClassification": "modelling_deberta_multi.MultiHeadDebertaForSequenceClassificationModel"
+  },
+  "custom_pipelines": {
+    "multi-head-text-classification": {
+      "impl": "custom_pipeline.CustomTextClassificationPipeline",
+      "pt": [
+        "AutoModelForSequenceClassification"
+      ],
+      "tf": []
+    }
+  },
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
   "layer_norm_eps": 1e-07,
   "max_position_embeddings": 512,
   "max_relative_positions": -1,
+  "model_type": "multi-head-deberta-for-sequence-classification",
   "norm_rel_ebd": "layer_norm",
   "num_attention_heads": 12,
+  "num_heads": 5,
   "num_hidden_layers": 6,
   "pad_token_id": 0,
   "pooler_dropout": 0,
   "relative_attention": true,
   "share_att_key": true,
   "torch_dtype": "float32",
+  "transformers_version": "4.46.2",
   "type_vocab_size": 0,
   "vocab_size": 128100
 }

configuration_deberta_multi.py ADDED Viewed

+from transformers import DebertaV2Config
+class MultiHeadDebertaV2Config(DebertaV2Config):  # DebertaV2Config subclass that additionally stores a `num_heads` setting
+    model_type = "multi-head-deberta-for-sequence-classification"  # same string as the "model_type" entry in config.json
+    def __init__(self, num_heads=5, **kwargs):  # num_heads defaults to 5; every other keyword is forwarded to DebertaV2Config
+        self.num_heads = num_heads  # presumably the number of classification heads -- TODO confirm against the model file
+        super().__init__(**kwargs)

custom_pipeline.py ADDED Viewed

+print("Loading Multi head pipeline")  # import-time side effect: this banner is printed whenever the module is loaded
+from transformers.pipelines import PIPELINE_REGISTRY
+from transformers import TextClassificationPipeline, AutoTokenizer, AutoModelForSequenceClassification
+class CustomTextClassificationPipeline(TextClassificationPipeline):
+    def __init__(self, model, tokenizer=None, **kwargs):
+        if tokenizer is None:
+            tokenizer = AutoTokenizer.from_pretrained(model.config._name_or_path)
+        super().__init__(model=model, tokenizer=tokenizer, **kwargs)
+    def _sanitize_parameters(self, **kwargs):
+        preprocess_kwargs = {}
+        return preprocess_kwargs, {}, {}
+    def preprocess(self, inputs):
+        return self.tokenizer(inputs, return_tensors='pt', truncation=True, padding=True)
+    def _forward(self, model_inputs):
+        input_ids = model_inputs['input_ids']
+        attention_mask = (input_ids != 0).long()
+        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
+        return outputs
+    def postprocess(self, model_outputs):
+        predictions = model_outputs.logits.argmax(dim=-1).squeeze().tolist()
+        categories = ["Race/Origin", "Gender/Sex", "Religion", "Ability", "Violence", "Other"]
+        return dict(zip(categories, predictions))
+PIPELINE_REGISTRY.register_pipeline(  # import-time side effect: registers the custom pipeline class with PIPELINE_REGISTRY
+    "multi-head-text-classification",  # task name; same key as under "custom_pipelines" in config.json
+    pipeline_class=CustomTextClassificationPipeline,
+    pt_model=AutoModelForSequenceClassification,  # PyTorch auto-class associated with this task
+)

special_tokens_map.json CHANGED Viewed

@@ -1,10 +1,46 @@
 {
-  "bos_token": "[CLS]",
-  "cls_token": "[CLS]",
-  "eos_token": "[SEP]",
-  "mask_token": "[MASK]",
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
   "unk_token": {
     "content": "[UNK]",
     "lstrip": false,

 {
+  "bos_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "unk_token": {
     "content": "[UNK]",
     "lstrip": false,

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json CHANGED Viewed

@@ -47,7 +47,7 @@
   "do_lower_case": false,
   "eos_token": "[SEP]",
   "mask_token": "[MASK]",
-  "model_max_length": 1000000000000000019884624838656,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
   "sp_model_kwargs": {},

   "do_lower_case": false,
   "eos_token": "[SEP]",
   "mask_token": "[MASK]",
+  "model_max_length": 512,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
   "sp_model_kwargs": {},