dariadaria committed on
Commit
1aec583
·
1 Parent(s): c0d4e65

batched handler

Browse files
Files changed (1) hide show
  1. handler.py +29 -13
handler.py CHANGED
@@ -1,16 +1,17 @@
1
  from typing import Dict, List, Any
2
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
3
  import torch
 
4
 
5
 
6
  class EndpointHandler:
7
def __init__(self, path=""):
    """Initialize the endpoint: load tokenizer and 3-label classifier from *path*.

    Args:
        path: local directory or hub id holding the pretrained artifacts.
    """
    # Both artifacts come from the same pretrained checkpoint directory.
    self.tokenizer = AutoTokenizer.from_pretrained(path)
    self.model = AutoModelForSequenceClassification.from_pretrained(path, num_labels=3)
10
- def tokenize(text, topic):
11
  return self.tokenizer(
12
- topic,
13
- text,
14
  max_length=384, #512
15
  truncation="only_second",
16
  return_offsets_mapping=False,
@@ -22,16 +23,31 @@ class EndpointHandler:
22
def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Classify a single (topic, text) pair and return the predicted label name.

    data args:
        topic (:obj:`str`)
        text (:obj:`str`)
    Return:
        A :obj:`list` | `dict`: will be serialized and returned
    """
    # NOTE(review): pop(..., data) falls back to the whole payload dict when
    # the key is absent — presumably intentional; confirm against callers.
    topic = data.pop("topic", data)
    text = data.pop("text", data)

    # Tokenize, run the classifier, and map the argmax class id to its name.
    encoded = self.tokenize(text, topic)
    logits = self.model(**encoded).logits
    predicted_class = torch.argmax(logits, dim=-1).item()
    return self.model.config.id2label[predicted_class]
 
1
  from typing import Dict, List, Any
2
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
3
  import torch
4
+ from datasets import Dataset
5
 
6
 
7
  class EndpointHandler:
8
def __init__(self, path=""):
    """Set up the endpoint by loading the pretrained tokenizer and classifier.

    Args:
        path: local directory or hub id holding the pretrained artifacts.
    """
    # Label count comes from the checkpoint config (no num_labels override).
    self.tokenizer = AutoTokenizer.from_pretrained(path)
    self.model = AutoModelForSequenceClassification.from_pretrained(path)
11
+ def tokenize(batch):
12
  return self.tokenizer(
13
+ batch['topic'],
14
+ batch['text'],
15
  max_length=384, #512
16
  truncation="only_second",
17
  return_offsets_mapping=False,
 
23
def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Run batched classification over the cross product of topics and texts.

    Every topic is paired with every text; predictions are returned per pair.

    data args:
        topics (List[str]): topics to pair each text with.
        texts (List[Dict[str, str]]): keys should be "id" and "text".
    Return:
        A :obj:`list` | `dict`: will be serialized and returned
    """
    # NOTE(review): pop(..., data) falls back to the whole payload dict when
    # the key is absent — presumably intentional; confirm against callers.
    topics = data.pop("topics", data)
    texts = data.pop("texts", data)

    # Build parallel columns for the topic x text cross product
    # (same topic-outer / text-inner order as iterating topics then texts).
    pairs = [(topic, text) for topic in topics for text in texts]
    batch = Dataset.from_dict({
        'id': [text['id'] for _, text in pairs],
        'text': [text['text'] for _, text in pairs],
        'topic': [topic for topic, _ in pairs],
    })

    tokenized_inputs = self.tokenize(batch)

    # Pure inference: disable autograd so no gradient buffers are allocated.
    with torch.no_grad():
        output = self.model(**tokenized_inputs)

    # Attach argmax class ids, then map them to label names; drop the raw
    # text and intermediate predictions from the serialized response.
    batch = batch.add_column('predictions', torch.argmax(output.logits, dim=-1).numpy(force=True))
    batch = batch.map(
        lambda b: {'label': [self.model.config.id2label[p] for p in b['predictions']]},
        batched=True,
        remove_columns=['text', 'predictions'],
    )
    return batch.to_dict()