Upload MultiTaskClassifierPipeline

Browse files

Files changed (6) hide show

classifier.py +182 -0
classifier_pipeline.py +23 -0
config.json +19 -1
special_tokens_map.json +35 -5
tokenizer.json +10 -1
tokenizer_config.json +3 -0

classifier.py ADDED Viewed

	@@ -0,0 +1,182 @@

+import numpy as np
+import torch.nn.functional as F
+from collections import OrderedDict
class Classifier(object):
    """Lay several classification tasks out on one flat label/logit vector.

    ``config`` is a mapping from task kind (``MULTI_CLASS`` / ``MULTI_LABEL``)
    to a list of task items.  The fields this class reads from an item are:

    * ``name``        -- key used in ``indices`` and in the result dicts.
    * ``labels``      -- sequence of ``(key, value)`` pairs; one logit per pair.
    * ``column``      -- (multi-class) row field holding the label key.
    * ``columns``     -- (multi-label) mapping ``column name -> {cell value:
      encoded number}``; the n-th column fills the n-th slot of the task.
    * ``threshold``   -- (multi-label) cut-off compared against raw logits.
    * ``loss_weight`` -- optional multiplier for the task loss (default 1).

    Tasks occupy consecutive, non-overlapping index ranges in the order in
    which they appear when iterating ``config.values()``.
    """

    MULTI_CLASS = 'multi_class'
    MULTI_LABEL = 'multi_label'
    MODEL_CONFIG = 'classifier_config'

    # Class-level defaults only; setup() rebinds all four on every instance.
    id2label = None
    label2id = None
    num_labels = 0
    indices = {}

    def __init__(self, config):
        """Store ``config`` and build the label maps and index ranges."""
        self.config = config
        self.setup()

    def _all_items(self):
        """Return every task item of every kind, in config order."""
        return [item for items in self.config.values() for item in items]

    def _span(self, name):
        """Return the index range of task ``name`` as a slice.

        A slice indexes numpy arrays and torch tensors alike, which a
        ``range`` object is not guaranteed to do.
        """
        cls_range = self.indices[name]
        return slice(cls_range.start, cls_range.stop)

    def setup(self):
        """Build ``id2label``, ``label2id``, ``num_labels`` and ``indices``.

        Raises:
            ValueError: if a label key occurs more than once.  Label ids are
                assigned per key while index ranges are sized per task, so a
                repeated key would silently shift every later label.
        """
        label_keys = [
            key for item in self._all_items() for (key, _value) in item['labels']
        ]
        seen = set()
        repeated = []
        for key in label_keys:
            if key in seen and key not in repeated:
                repeated.append(key)
            seen.add(key)
        if repeated:
            raise ValueError(
                'label keys must be unique across all tasks, got duplicates: %r'
                % (repeated,)
            )
        self.id2label = dict(enumerate(label_keys))
        self.label2id = {key: idx for (idx, key) in enumerate(label_keys)}
        self.num_labels = len(label_keys)
        self._compute_indices()

    def items(self, cls):
        """Return the task items of kind ``cls`` (empty list if none configured)."""
        return self.config.get(cls, [])

    def _compute_indices(self):
        """Assign each task a consecutive ``range`` of positions in the flat vector."""
        self.indices = {}
        range_start = 0
        for item in self._all_items():
            range_end = range_start + len(OrderedDict(item['labels']))
            self.indices[item['name']] = range(range_start, range_end)
            range_start = range_end

    def encode_labels(self, row):
        """Encode one data row as a flat numpy target vector of length ``num_labels``.

        Multi-class tasks set a single 1 at the position of the label key found
        in ``row[item['column']]``.  Multi-label tasks write, for the n-th
        entry of ``item['columns']``, the number that the column's mapping
        assigns to the row's cell value.  Cell values are stripped of
        surrounding whitespace before lookup.

        Raises:
            KeyError: if a column is missing from ``row`` or a cell value is
                not a known label / mapping key.
        """
        label_encodings = np.zeros(self.num_labels)
        for item in self.items(self.MULTI_CLASS):
            offset = self.indices[item['name']].start
            position_of = {
                key: pos for (pos, key) in enumerate(OrderedDict(item['labels']))
            }
            cell = row[item['column']].strip()
            label_encodings[offset + position_of[cell]] = 1
        for item in self.items(self.MULTI_LABEL):
            offset = self.indices[item['name']].start
            columns = item['columns']
            for (cidx, column_name) in enumerate(columns):
                cell = row[column_name].strip()
                label_encodings[offset + cidx] = columns[column_name][cell]
        return label_encodings

    def preds_from_logits(self, logits):
        """Turn a 2-D numpy array of logits into a 0/1 array of the same shape.

        Each multi-class task gets exactly one 1 per row (the arg-max inside
        its range).  Each multi-label task gets a 1 wherever the raw logit is
        ``>= item['threshold']``; no sigmoid is applied here.
        """
        preds = np.zeros_like(logits)
        (rows, _) = preds.shape
        row_ids = np.arange(rows)
        for item in self.items(self.MULTI_CLASS):
            span = self._span(item['name'])
            best_classes = np.argmax(logits[:, span], axis=-1)
            # argmax is relative to the task's slice; shift to absolute columns.
            preds[row_ids, best_classes + span.start] = 1
        for item in self.items(self.MULTI_LABEL):
            span = self._span(item['name'])
            preds[:, span] = (logits[:, span] >= item['threshold']).astype(float)
        return preds

    def compute_losses(self, logits, labels):
        """Return the weighted loss total per task kind.

        Args:
            logits: 2-D torch tensor, one column per label.
            labels: float torch tensor of the same shape (as produced by
                ``encode_labels``); multi-class slices are passed to
                ``cross_entropy`` as class-probability targets.

        Returns:
            dict with keys ``MULTI_CLASS`` and ``MULTI_LABEL``, each a 0-dim
            tensor.  A kind with no configured tasks contributes a zero.
        """
        kind_to_loss_fn = (
            (self.MULTI_CLASS, F.cross_entropy),
            (self.MULTI_LABEL, F.binary_cross_entropy_with_logits),
        )
        losses = {}
        for (kind, loss_fn) in kind_to_loss_fn:
            # Start from a scalar zero on the logits' device/dtype so that any
            # number of tasks (including none) sums to a 0-dim tensor.
            total = logits.new_zeros(())
            for item in self.items(kind):
                span = self._span(item['name'])
                weight = item.get('loss_weight', 1)
                total = total + weight * loss_fn(logits[:, span], labels[:, span])
            losses[kind] = total
        return losses

    def _active_labels(self, item, pred_row):
        """Return the ``(key, value)`` label pairs of ``item`` whose prediction is 1."""
        offset = self.indices[item['name']].start
        pairs = list(OrderedDict(item['labels']).items())
        return [
            pair for (pos, pair) in enumerate(pairs) if pred_row[offset + pos] == 1
        ]

    def get_results(self, logits):
        """Decode logits into one result dict per row.

        For every multi-class task the dict holds
        ``{'key': label_key, 'value': label_value}`` (both ``None`` if nothing
        was selected); for every multi-label task it holds the list of label
        values whose prediction is 1.
        """
        results = []
        for pred_row in self.preds_from_logits(logits):
            result = {}
            for item in self.items(self.MULTI_CLASS):
                active = self._active_labels(item, pred_row)
                (key, value) = active[0] if active else (None, None)
                result[item['name']] = {
                    'key': key,
                    'value': value,
                }
            for item in self.items(self.MULTI_LABEL):
                result[item['name']] = [
                    value for (_key, value) in self._active_labels(item, pred_row)
                ]
            results.append(result)
        return results

    def random_logits(self, num_rows=1):
        """Return uniform random logits in [-2, 2) shaped ``(num_rows, num_labels)``."""
        return np.random.uniform(-2, 2, (num_rows, self.num_labels))

classifier_pipeline.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from transformers import Pipeline
+from .classifier import Classifier
class MultiTaskClassifierPipeline(Pipeline):
    """Pipeline that tokenizes text, runs the model and decodes its logits
    with a ``Classifier`` built from the model config."""

    def _sanitize_parameters(self, **kwargs):
        # No call-time options are recognised: the preprocess, forward and
        # postprocess parameter dicts are all empty.
        return {}, {}, {}

    def preprocess(self, inputs):
        """Tokenize ``inputs`` (max-length padding, truncation) and move the
        encoding to the pipeline's device."""
        encoded = self.tokenizer(
            inputs,
            padding="max_length",
            truncation=True,
            return_tensors=self.framework,
        )
        return encoded.to(self.device)

    def _forward(self, model_inputs):
        """Run the model on the tokenized inputs."""
        return self.model(**model_inputs)

    def postprocess(self, model_outputs):
        """Decode the logits and return the result dict of the first row."""
        task_params = self.model.config.task_specific_params
        classifier = Classifier(task_params[Classifier.MODEL_CONFIG])
        per_row_results = classifier.get_results(model_outputs.logits.numpy())
        return per_row_results[0]

config.json CHANGED Viewed

@@ -1,10 +1,28 @@
 {
-  "_name_or_path": "google-bert/bert-large-uncased",
   "architectures": [
     "BertForSequenceClassification"
   ],
   "attention_probs_dropout_prob": 0.1,
   "classifier_dropout": null,
   "gradient_checkpointing": false,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,

 {
+  "_name_or_path": "ai-research-lab/bert-question-classifier",
   "architectures": [
     "BertForSequenceClassification"
   ],
   "attention_probs_dropout_prob": 0.1,
   "classifier_dropout": null,
+  "custom_pipelines": {
+    "question-classifier": {
+      "default": {
+        "model": {
+          "pt": [
+            "ai-research-lab/bert-question-classifier",
+            "main"
+          ]
+        }
+      },
+      "impl": "classifier_pipeline.MultiTaskClassifierPipeline",
+      "pt": [
+        "AutoModelForSequenceClassification"
+      ],
+      "tf": [],
+      "type": "text"
+    }
+  },
   "gradient_checkpointing": false,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,

special_tokens_map.json CHANGED Viewed

@@ -1,7 +1,37 @@
 {
-  "cls_token": "[CLS]",
-  "mask_token": "[MASK]",
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "unk_token": "[UNK]"
 }

 {
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
 }

tokenizer.json CHANGED Viewed

@@ -6,7 +6,16 @@
     "strategy": "LongestFirst",
     "stride": 0
   },
-  "padding": null,
   "added_tokens": [
     {
       "id": 0,

     "strategy": "LongestFirst",
     "stride": 0
   },
+  "padding": {
+    "strategy": {
+      "Fixed": 512
+    },
+    "direction": "Right",
+    "pad_to_multiple_of": null,
+    "pad_id": 0,
+    "pad_type_id": 0,
+    "pad_token": "[PAD]"
+  },
   "added_tokens": [
     {
       "id": 0,

tokenizer_config.json CHANGED Viewed

@@ -50,8 +50,11 @@
   "model_max_length": 512,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
   "strip_accents": null,
   "tokenize_chinese_chars": true,
   "tokenizer_class": "BertTokenizer",
   "unk_token": "[UNK]"
 }

   "model_max_length": 512,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
+  "stride": 0,
   "strip_accents": null,
   "tokenize_chinese_chars": true,
   "tokenizer_class": "BertTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
   "unk_token": "[UNK]"
 }