Kalaoke committed on
Commit
daa2f46
·
1 Parent(s): d6df1bf

add custom handler and modify pipeline

Browse files
__pycache__/bert_for_sequence_classification.cpython-37.pyc CHANGED
Binary files a/__pycache__/bert_for_sequence_classification.cpython-37.pyc and b/__pycache__/bert_for_sequence_classification.cpython-37.pyc differ
 
__pycache__/bibert_multitask_classification.cpython-37.pyc CHANGED
Binary files a/__pycache__/bibert_multitask_classification.cpython-37.pyc and b/__pycache__/bibert_multitask_classification.cpython-37.pyc differ
 
__pycache__/handler.cpython-37.pyc CHANGED
Binary files a/__pycache__/handler.cpython-37.pyc and b/__pycache__/handler.cpython-37.pyc differ
 
handler.py CHANGED
@@ -1,150 +1,11 @@
1
- from typing import Dict, List, Any, Optional, Tuple, Union
2
  from dataclasses import dataclass
3
  import torch
4
- from torch import nn
5
- from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
6
- import numpy as np
7
- import transformers
8
- from transformers import AutoTokenizer, BertTokenizer
9
- from transformers import Pipeline, pipeline
10
  from transformers.pipelines import PIPELINE_REGISTRY
11
- from transformers import models
12
- from transformers.modeling_outputs import SequenceClassifierOutput
13
- from transformers.models.bert.configuration_bert import BertConfig
14
- from transformers.models.bert.modeling_bert import (
15
- BertPreTrainedModel,
16
- BERT_INPUTS_DOCSTRING,
17
- _TOKENIZER_FOR_DOC,
18
- _CHECKPOINT_FOR_DOC,
19
- BERT_START_DOCSTRING,
20
- _CONFIG_FOR_DOC,
21
- _SEQ_CLASS_EXPECTED_OUTPUT,
22
- _SEQ_CLASS_EXPECTED_LOSS,
23
- BertModel,
24
- )
25
-
26
- from transformers.file_utils import (
27
- add_code_sample_docstrings,
28
- add_start_docstrings_to_model_forward,
29
- add_start_docstrings
30
- )
31
-
32
- @add_start_docstrings(
33
- """
34
- Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
35
- output) e.g. for GLUE tasks.
36
- """,
37
- BERT_START_DOCSTRING,
38
- )
39
- class BertForSequenceClassification(BertPreTrainedModel):
40
- def __init__(self, config, **kwargs):
41
- super().__init__(transformers.PretrainedConfig())
42
- #task_labels_map={"binary_classification": 2, "label_classification": 5}
43
- self.tasks = kwargs.get("tasks_map", {})
44
- self.config = config
45
-
46
- self.bert = BertModel(config)
47
- classifier_dropout = (
48
- config.classifier_dropout
49
- if config.classifier_dropout is not None
50
- else config.hidden_dropout_prob
51
- )
52
- self.dropout = nn.Dropout(classifier_dropout)
53
- ## add task specific output heads
54
- self.classifier1 = nn.Linear(
55
- config.hidden_size, self.tasks[0].num_labels
56
- )
57
- self.classifier2 = nn.Linear(
58
- config.hidden_size, self.tasks[1].num_labels
59
- )
60
-
61
- self.init_weights()
62
-
63
- @add_start_docstrings_to_model_forward(
64
- BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")
65
- )
66
- @add_code_sample_docstrings(
67
- processor_class=_TOKENIZER_FOR_DOC,
68
- checkpoint=_CHECKPOINT_FOR_DOC,
69
- output_type=SequenceClassifierOutput,
70
- config_class=_CONFIG_FOR_DOC,
71
- expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
72
- expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
73
- )
74
- def forward(
75
- self,
76
- input_ids: Optional[torch.Tensor] = None,
77
- attention_mask: Optional[torch.Tensor] = None,
78
- token_type_ids: Optional[torch.Tensor] = None,
79
- position_ids: Optional[torch.Tensor] = None,
80
- head_mask: Optional[torch.Tensor] = None,
81
- inputs_embeds: Optional[torch.Tensor] = None,
82
- labels: Optional[torch.Tensor] = None,
83
- output_attentions: Optional[bool] = None,
84
- output_hidden_states: Optional[bool] = None,
85
- return_dict: Optional[bool] = None,
86
- task_ids=None,
87
- ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
88
- r"""
89
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
90
- Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
91
- config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
92
- If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
93
- """
94
- return_dict = (
95
- return_dict if return_dict is not None else self.config.use_return_dict
96
- )
97
-
98
- outputs = self.bert(
99
- input_ids,
100
- attention_mask=attention_mask,
101
- token_type_ids=token_type_ids,
102
- position_ids=position_ids,
103
- head_mask=head_mask,
104
- inputs_embeds=inputs_embeds,
105
- output_attentions=output_attentions,
106
- output_hidden_states=output_hidden_states,
107
- return_dict=return_dict,
108
- )
109
-
110
- pooled_output = outputs[1]
111
-
112
- pooled_output = self.dropout(pooled_output)
113
-
114
- unique_task_ids_list = torch.unique(task_ids).tolist()
115
- loss_list = []
116
- logits = None
117
- for unique_task_id in unique_task_ids_list:
118
-
119
- loss = None
120
- task_id_filter = task_ids == unique_task_id
121
-
122
- if unique_task_id == 0:
123
- logits = self.classifier1(pooled_output[task_id_filter])
124
- elif unique_task_id == 1:
125
- logits = self.classifier2(pooled_output[task_id_filter])
126
-
127
-
128
- if labels is not None:
129
- loss_fct = CrossEntropyLoss()
130
- loss = loss_fct(logits.view(-1, self.tasks[unique_task_id].num_labels), labels[task_id_filter].view(-1))
131
- loss_list.append(loss)
132
-
133
- # logits are only used for eval. and in case of eval the batch is not multi task
134
- # For training only the loss is used
135
-
136
- if loss_list:
137
- loss = torch.stack(loss_list).mean()
138
- if not return_dict:
139
- output = (logits,) + outputs[2:]
140
- return ((loss,) + output) if loss is not None else output
141
-
142
- return SequenceClassifierOutput(
143
- loss=loss,
144
- logits=logits,
145
- hidden_states=outputs.hidden_states,
146
- attentions=outputs.attentions,
147
- )
148
 
149
  @dataclass
150
  class Task:
@@ -153,59 +14,6 @@ class Task:
153
  type: str
154
  num_labels: int
155
 
156
- def softmax(_outputs):
157
- maxes = np.max(_outputs, axis=-1, keepdims=True)
158
- shifted_exp = np.exp(_outputs - maxes)
159
- return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
160
-
161
- class BiBert_MultiTaskPipeline(Pipeline):
162
-
163
-
164
- def _sanitize_parameters(self, **kwargs):
165
-
166
- preprocess_kwargs = {}
167
- if "task_id" in kwargs:
168
- preprocess_kwargs["task_id"] = kwargs["task_id"]
169
-
170
- forward_kwargs = {}
171
- if "task_id" in kwargs:
172
- forward_kwargs["task_id"] = kwargs["task_id"]
173
-
174
- postprocess_kwargs = {}
175
- if "top_k" in kwargs:
176
- postprocess_kwargs["top_k"] = kwargs["top_k"]
177
- postprocess_kwargs["_legacy"] = False
178
- return preprocess_kwargs, forward_kwargs, postprocess_kwargs
179
-
180
-
181
-
182
- def preprocess(self, inputs, task_id):
183
- return_tensors = self.framework
184
- feature = self.tokenizer(inputs, padding = True, return_tensors=return_tensors).to(self.device)
185
- task_ids = np.full(shape=1,fill_value=task_id, dtype=int)
186
- feature["task_ids"] = torch.IntTensor(task_ids)
187
- return feature
188
-
189
- def _forward(self, model_inputs, task_id):
190
- return self.model(**model_inputs)
191
-
192
- def postprocess(self, model_outputs, top_k=1, _legacy=True):
193
- outputs = model_outputs["logits"][0]
194
- outputs = outputs.numpy()
195
- scores = softmax(outputs)
196
-
197
- if top_k == 1 and _legacy:
198
- return {"label": self.model.config.id2label[scores.argmax().item()], "score": scores.max().item()}
199
-
200
- dict_scores = [
201
- {"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(scores)
202
- ]
203
- if not _legacy:
204
- dict_scores.sort(key=lambda x: x["score"], reverse=True)
205
- if top_k is not None:
206
- dict_scores = dict_scores[:top_k]
207
- return dict_scores
208
-
209
 
210
  class EndpointHandler():
211
  def __init__(self, path=""):
 
1
+ from typing import Dict, List, Any
2
  from dataclasses import dataclass
3
  import torch
4
+ from transformers import AutoTokenizer
5
+ from transformers import pipeline
 
 
 
 
6
  from transformers.pipelines import PIPELINE_REGISTRY
7
+ from bibert_multitask_classification import BiBert_MultiTaskPipeline
8
+ from bert_for_sequence_classification import BertForSequenceClassification
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  @dataclass
11
  class Task:
 
14
  type: str
15
  num_labels: int
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  class EndpointHandler():
19
  def __init__(self, path=""):