philschmid committed on
Commit
cb8919b
·
1 Parent(s): c010e78

Create new file

Browse files
Files changed (1) hide show
  1. handler.py +46 -0
handler.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Any
2
+ from transformers import AutoTokenizer, LayoutLMForSequenceClassification
3
+ import torch
4
+ import os
5
+
6
+
7
+ os.system("sudo apt install -y tesseract-ocr")
8
+ os.system("pip3 install pytesseract==0.3.9")
9
+
10
+
11
class EndpointHandler():
    """Custom Hugging Face Inference Endpoint handler that runs LayoutLM
    sequence classification on a hard-coded demo input and returns the
    raw logits."""

    def __init__(self, path=""):
        # NOTE(review): both tokenizer and model load the base (untrained)
        # checkpoint regardless of `path` — presumably a fine-tuned model
        # under `path` was intended; confirm against the deployment.
        self.tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
        self.model = LayoutLMForSequenceClassification.from_pretrained("microsoft/layoutlm-base-uncased")  # load the optimized model

    def __call__(self, data: Any) -> Dict[str, Any]:
        """Run one forward pass and return ``{"logits": <torch.Tensor>}``.

        NOTE(review): `data` is currently ignored — the words and bounding
        boxes are fixed demo values; a real handler should parse them from
        `data`. Return annotation fixed: the method returns a dict, not
        ``List[List[Dict[str, float]]]``.
        """
        words = ["Hello", "world"]
        # Bounding boxes in LayoutLM's normalized 0-1000 coordinate space,
        # one box per word.
        normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]

        # Repeat each word's box once per sub-word token it tokenizes into.
        token_boxes = []
        for word, box in zip(words, normalized_word_boxes):
            # BUG FIX: was the bare name `tokenizer` (NameError at runtime);
            # the tokenizer lives on `self`.
            word_tokens = self.tokenizer.tokenize(word)
            token_boxes.extend([box] * len(word_tokens))
        # add bounding boxes of cls + sep tokens
        token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

        # BUG FIX: was the bare name `tokenizer` here as well.
        encoding = self.tokenizer(" ".join(words), return_tensors="pt")
        input_ids = encoding["input_ids"]
        attention_mask = encoding["attention_mask"]
        token_type_ids = encoding["token_type_ids"]
        bbox = torch.tensor([token_boxes])
        # Dummy label; supplying `labels` makes the model also compute a
        # loss, but only the logits are returned below.
        sequence_label = torch.tensor([1])

        outputs = self.model(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=sequence_label,
        )

        # (removed unused local `loss = outputs.loss`)
        logits = outputs.logits
        return {"logits": logits}
46
+