Spaces:

Duplicated from pierreguillou/Inference-APP-Document-Understanding-at-linelevel-LiLT-base-LayoutXLM-base-v1

pierreguillou
/

Inference-APP-Document-Understanding-at-paragraphlevel-v3

Runtime error

App Files Files Community

pierreguillou committed on Apr 2, 2023

Commit

c9b4e80

·

1 Parent(s): 70bfccd

Update files/functions.py

Files changed (1) hide show

files/functions.py +36 -0

files/functions.py CHANGED Viewed

@@ -147,6 +147,42 @@ for lang_t, langcode_t in zip(langs_t,langscode_t):
 langdetect2Tesseract = {v:k for k,v in Tesseract2langdetect.items()}
 # General
 # get text and bounding boxes from an image

 langdetect2Tesseract = {v:k for k,v in Tesseract2langdetect.items()}
+## model / feature extractor / tokenizer
+# get device
+import torch
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+## model LiLT
+import transformers
+from transformers import AutoTokenizer, AutoModelForTokenClassification
+tokenizer_lilt = AutoTokenizer.from_pretrained(model_id_lilt)
+model_lilt = AutoModelForTokenClassification.from_pretrained(model_id_lilt);
+model_lilt.to(device);
+## model LayoutXLM
+from transformers import LayoutLMv2ForTokenClassification # LayoutXLMTokenizerFast,
+model_layoutxlm = LayoutLMv2ForTokenClassification.from_pretrained(model_id_layoutxlm);
+model_layoutxlm.to(device);
+# feature extractor
+from transformers import LayoutLMv2FeatureExtractor
+feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
+# tokenizer
+from transformers import AutoTokenizer
+tokenizer_layoutxlm = AutoTokenizer.from_pretrained(tokenizer_id_layoutxlm)
+# get labels
+id2label_lilt = model_lilt.config.id2label
+label2id_lilt = model_lilt.config.label2id
+num_labels_lilt = len(id2label_lilt)
+id2label_layoutxlm = model_layoutxlm.config.id2label
+label2id_layoutxlm = model_layoutxlm.config.label2id
+num_labels_layoutxlm = len(id2label_layoutxlm)
 # General
 # get text and bounding boxes from an image