| import torch |
| from transformers import Pipeline |
| from transformers import AutoTokenizer |
| from transformers.pipelines import PIPELINE_REGISTRY |
| from transformers import pipeline |
| from transformers import AutoModelForTokenClassification |
| from huggingface_hub import Repository |
| import sys |
| import os |
|
|
|
|
class TokenizeAndAlignLabelsStep():
    """Tokenizes a pre-split word sequence and builds a boolean mask that
    selects the first sub-token of each word (the position word-level
    labels are aligned to)."""

    def tokenize_and_align_labels(self, examples, tokenizer, max_length=128):
        """Tokenize `examples` and attach a `labels_mask` to the output.

        Args:
            examples: sequence of words for one sentence (already split —
                passed with ``is_split_into_words=True``).
            tokenizer: a tokenizer callable whose output exposes
                ``word_ids()`` (e.g. a HuggingFace fast tokenizer).
            max_length: padding/truncation length. Defaults to 128, the
                value previously hard-coded.

        Returns:
            The tokenizer output with an added ``labels_mask`` list of
            booleans: True exactly at the first sub-token of each word;
            special tokens (word id ``None``) and continuation sub-tokens
            are False.
        """
        tokenized_inputs = tokenizer(
            examples, padding='max_length', truncation=True,
            max_length=max_length, is_split_into_words=True)

        labels_mask = []
        previous_word_idx = None
        # A position is "labelled" iff it maps to a word and that word is
        # different from the previous position's word (i.e. first sub-token).
        for word_idx in tokenized_inputs.word_ids():
            labels_mask.append(word_idx is not None and word_idx != previous_word_idx)
            previous_word_idx = word_idx

        tokenized_inputs["labels_mask"] = labels_mask

        return tokenized_inputs
|
|
|
|
|
|
class BERT_CRF_Pipeline(Pipeline):
    """Token-classification pipeline for a BERT-CRF NER model.

    Tokenizes pre-split words, runs the BERT-CRF model, and maps the
    predicted label ids in the first output back to label names via
    ``self.model.config.id2label``.
    """

    def _sanitize_parameters(self, **kwargs):
        # No extra runtime parameters are supported.
        return {}, {}, {}

    def preprocess(self, inputs):
        """Tokenize ``inputs['tokens']`` (pre-split words) for the model.

        The tokenizer is loaded once and cached on the instance; the
        previous implementation reloaded it from the hub on every call.
        """
        tokens = inputs['tokens']

        if not hasattr(self, "_tokenizer"):
            self._tokenizer = AutoTokenizer.from_pretrained(
                "neuralmind/bert-base-portuguese-cased", do_lower_case=False)

        return TokenizeAndAlignLabelsStep().tokenize_and_align_labels(
            examples=tokens, tokenizer=self._tokenizer)

    def _forward(self, tokenizer_results):
        """Run the model on one tokenized example (batch dim added here)."""
        # Pick the device once instead of re-testing CUDA for every tensor.
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

        input_ids = torch.tensor(
            tokenizer_results['input_ids'], dtype=torch.long, device=device).unsqueeze(0)
        token_type_ids = torch.tensor(
            tokenizer_results['token_type_ids'], dtype=torch.long, device=device).unsqueeze(0)
        # Boolean masks: attention over real tokens; labels_mask marks the
        # first sub-token of each word (built in TokenizeAndAlignLabelsStep).
        attention_mask = torch.tensor(
            tokenizer_results['attention_mask'], dtype=torch.bool, device=device).unsqueeze(0)
        labels_mask = torch.tensor(
            tokenizer_results['labels_mask'], dtype=torch.bool, device=device).unsqueeze(0)

        # labels=None: inference only. The custom model accepts labels_mask —
        # presumably to restrict CRF decoding to labelled positions (defined
        # in the remote model code; not visible here).
        return self.model(input_ids=input_ids, token_type_ids=token_type_ids,
                          attention_mask=attention_mask, labels=None, labels_mask=labels_mask)

    def postprocess(self, model_outputs):
        """Replace each predicted label id in the first output sequence with
        its string label and return that sequence."""
        predictions = model_outputs[0]
        for i, label_id in enumerate(predictions):
            predictions[i] = self.model.config.id2label[label_id]

        return predictions
|
|
|
|
|
|
|
|
def main():
    """Register the custom BERT-CRF pipeline, instantiate it from the hub,
    save it locally, and push the serialized pipeline back to the hub.

    Side effects: downloads the model, writes to ``<script dir>/out/pipeline``,
    and pushes to the remote repository (requires hub authentication).
    """
    model_id = "arubenruben/PT-BERT-Large-CRF-HAREM-Default"

    PIPELINE_REGISTRY.register_pipeline(
        "PT-BERT-Large-CRF-HAREM-Default-pipeline",
        pipeline_class=BERT_CRF_Pipeline,
        pt_model=AutoModelForTokenClassification,
    )

    classifier = pipeline(
        "PT-BERT-Large-CRF-HAREM-Default-pipeline",
        model=model_id,
        device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
        trust_remote_code=True)

    out_path = os.path.join(sys.path[0], 'out', 'pipeline')
    # NOTE(review): huggingface_hub.Repository is deprecated in recent
    # releases; consider HfApi.upload_folder when dependencies are bumped.
    repo = Repository(out_path, clone_from=model_id, use_auth_token=True)

    classifier.save_pretrained(out_path)
    repo.push_to_hub()


# The original file defined main() but never called it, so running the
# script did nothing.
if __name__ == "__main__":
    main()