| from typing import List |
|
|
| import torch |
| from datasets import Dataset |
| from torch.utils.data import DataLoader |
| from tqdm import tqdm |
| from transformers import PerceiverTokenizer |
|
|
|
|
| def _map_outputs(predictions): |
| """ |
| Map model outputs to classes. |
| |
| :param predictions: model ouptut batch |
| :return: |
| """ |
|
|
| labels = [ |
| "admiration", |
| "amusement", |
| "anger", |
| "annoyance", |
| "approval", |
| "caring", |
| "confusion", |
| "curiosity", |
| "desire", |
| "disappointment", |
| "disapproval", |
| "disgust", |
| "embarrassment", |
| "excitement", |
| "fear", |
| "gratitude", |
| "grief", |
| "joy", |
| "love", |
| "nervousness", |
| "optimism", |
| "pride", |
| "realization", |
| "relief", |
| "remorse", |
| "sadness", |
| "surprise", |
| "neutral" |
| ] |
| classes = [] |
| for i, example in enumerate(predictions): |
| out_batch = [] |
| for j, category in enumerate(example): |
| out_batch.append(labels[j]) if category > 0.5 else None |
| classes.append(out_batch) |
| return classes |
|
|
|
|
class MultiLabelPipeline:
    """
    Multi label classification pipeline.

    Loads a pickled PyTorch model plus the Perceiver tokenizer, then runs
    batched inference over a dataset and maps the logits to label names.
    """

    def __init__(self, model_path):
        """
        Init MLC pipeline.

        :param model_path: model to use
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # BUG FIX: the original compared a torch.device object to the string
        # 'cuda' (always False), so the non-map_location branch was dead code.
        # Passing map_location unconditionally is correct for both targets.
        # NOTE(review): torch.load unpickles arbitrary code — only load
        # model files from trusted sources.
        self.model = torch.load(model_path, map_location=self.device).eval().to(self.device)
        self.tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver')

    def __call__(self, dataset, batch_size: int = 4):
        """
        Processing pipeline.

        :param dataset: dataset with a 'text' column
        :param batch_size: number of examples per inference batch
        :return: list of per-example label-name lists
        """
        # Tokenize the whole dataset up front; the text column is consumed.
        dataset = dataset.map(lambda row: self.tokenizer(row['text'], padding="max_length", truncation=True),
                              batched=True, remove_columns=['text'], desc='Tokenizing')
        dataset.set_format('torch', columns=['input_ids', 'attention_mask'])
        dataloader = DataLoader(dataset, batch_size=batch_size)

        classes = []
        mem_logs = []

        with tqdm(dataloader, unit='batches') as progression:
            for batch in progression:
                progression.set_description('Inference')

                # Pure inference: no_grad avoids building autograd graphs,
                # cutting memory use and speeding up the forward pass.
                with torch.no_grad():
                    outputs = self.model(inputs=batch['input_ids'].to(self.device),
                                         attention_mask=batch['attention_mask'].to(self.device))

                predictions = outputs.logits.detach().cpu().numpy()
                classes.extend(_map_outputs(predictions))

                # memory_reserved returns 0 when CUDA is unavailable, so this
                # is safe on CPU-only runs; value is reported in GB.
                memory = round(torch.cuda.memory_reserved(self.device) / 1e9, 2)
                mem_logs.append(memory)

                progression.set_postfix(memory=f"{round(sum(mem_logs) / len(mem_logs), 2)}Go")

        return classes
|
|
|
|
def inputs_to_dataset(inputs: List[str]) -> Dataset:
    """
    Convert a list of strings to a dataset object.

    :param inputs: list of strings
    :return: a Dataset with a single 'text' column holding the strings
    """
    # list() copies the sequence without shadowing the builtin `input`,
    # which the original identity comprehension did.
    return Dataset.from_dict({'text': list(inputs)})
|
|