Spaces:
Runtime error
Runtime error
| import os | |
# Runtime bootstrap: install pinned PyTorch builds, then detectron2, before
# detectron2 is imported.
#
# History: this started as a workaround for detectron2 not having released
# packages for PyTorch 1.9
# (issue: https://github.com/facebookresearch/detectron2/issues/3158).
# An earlier revision pinned torch 1.8.0+cu101 / torchvision 0.9.0+cu101 with
# the matching pre-built detectron2 wheel; the current pins are
# torch 1.10.0+cu111 / torchvision 0.11+cu111, and detectron2 is installed
# from its Git repository.
# See https://detectron2.readthedocs.io/tutorials/install.html for instructions
import subprocess
import sys


def _pip_install(*pip_args):
    """Run ``pip install <pip_args>`` with the running interpreter.

    The command is passed as an argument list (no shell involved) and is
    executed through ``sys.executable -m pip`` so the packages land in the
    environment this script runs in.

    A failing install is reported on stderr but does not abort the script,
    which keeps the best-effort behaviour of the previous ``os.system`` calls
    while no longer hiding the failure.

    Returns:
        The exit code of the pip process (0 on success).
    """
    completed = subprocess.run(
        [sys.executable, "-m", "pip", "install", *pip_args],
        check=False,
    )
    if completed.returncode != 0:
        print(
            f"pip install {' '.join(pip_args)} "
            f"failed with exit code {completed.returncode}",
            file=sys.stderr,
        )
    return completed.returncode


_pip_install(
    "-q",
    "torch==1.10.0+cu111",
    "torchvision==0.11+cu111",
    "-f",
    "https://download.pytorch.org/whl/torch_stable.html",
)
_pip_install("git+https://github.com/facebookresearch/detectron2.git")

# These imports must stay below the installs above.
import detectron2
from detectron2.utils.logger import setup_logger

setup_logger()
| import gradio as gr | |
| import re | |
| import string | |
| import torch | |
| from operator import itemgetter | |
| import collections | |
| import pypdf | |
| from pypdf import PdfReader | |
| from pypdf.errors import PdfReadError | |
| import pypdfium2 as pdfium | |
| import langdetect | |
| from langdetect import detect_langs | |
| import pandas as pd | |
| import numpy as np | |
| import random | |
| import tempfile | |
| import itertools | |
| from matplotlib import font_manager | |
| from PIL import Image, ImageDraw, ImageFont | |
| import cv2 | |
| import pathlib | |
| from pathlib import Path | |
| import shutil | |
# Tesseract
# Startup diagnostics: print the output of each shell command below
# (Debian version file, /etc/issue, and an apt search for tesseract).
for diagnostic_command in (
    'cat /etc/debian_version',
    'cat /etc/issue',
    'apt search tesseract',
):
    print(os.popen(diagnostic_command).read())
| import pytesseract | |
## Key parameters

# categories colors: one color name per layout category label
label2color = dict(
    (
        ('Caption', 'brown'),
        ('Footnote', 'orange'),
        ('Formula', 'gray'),
        ('List-item', 'yellow'),
        ('Page-footer', 'red'),
        ('Page-header', 'red'),
        ('Picture', 'violet'),
        ('Section-header', 'orange'),
        ('Table', 'green'),
        ('Text', 'blue'),
        ('Title', 'pink'),
    )
)

# bounding boxes start and end of a sequence
cls_box = [0] * 4
sep_box = [1000] * 4
# model
model_id = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"

# tokenizer
tokenizer_id = "xlm-roberta-base"

# (tokenization) The maximum length of a feature (sequence).
# It is inferred from the number embedded in the model id (e.g. "...-ml512");
# 384 is checked before 512, as before.
for _supported_length in (384, 512):
    if str(_supported_length) in model_id:
        max_length = _supported_length
        break
else:
    # Fail fast: merely printing a message would leave `max_length` undefined
    # and surface later as an unrelated NameError.
    raise ValueError("Error with max_length of chunks!")

# (tokenization) overlap
doc_stride = 128  # The authorized overlap between two parts of the context when splitting is needed.

# max PDF page images that will be displayed
max_imgboxes = 2
# get files
examples_dir = 'files/'
Path(examples_dir).mkdir(parents=True, exist_ok=True)

from huggingface_hub import hf_hub_download

files = [
    "example.pdf",
    "blank.pdf",
    "blank.png",
    "languages_iso.csv",
    "languages_tesseract.csv",
    "wo_content.png",
]

# Download every listed file from the Space repository and copy the path
# returned by hf_hub_download into the local examples directory.
for asset_name in files:
    shutil.copy(
        hf_hub_download(
            repo_id="pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v2",
            filename=f"files/{asset_name}",
            repo_type="space",
        ),
        examples_dir,
    )

# path to files
image_wo_content = f"{examples_dir}wo_content.png"  # image without content
pdf_blank = f"{examples_dir}blank.pdf"  # blank PDF
image_blank = f"{examples_dir}blank.png"  # blank image
## get langdetect2Tesseract dictionary
# Builds two module-level lookup tables from the two language CSV files:
#   Tesseract2langdetect: LangCode from languages_tesseract.csv
#                         -> LangCode from languages_iso.csv
#   langdetect2Tesseract: the inverse mapping
# Rows are joined on the normalized "Language" column; Tesseract languages
# with no match in the ISO file are skipped.

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _normalize_language_name(name):
    """Return *name* lower-cased, stripped, and with punctuation removed."""
    return name.lower().strip().translate(_PUNCTUATION_TABLE)


t = "files/languages_tesseract.csv"
l = "files/languages_iso.csv"
df_t = pd.read_csv(t)
df_l = pd.read_csv(l)

langs_t = [_normalize_language_name(lang_t) for lang_t in df_t["Language"].to_list()]
langs_l = [_normalize_language_name(lang_l) for lang_l in df_l["Language"].to_list()]
langscode_t = df_t["LangCode"].to_list()
langscode_l = df_l["LangCode"].to_list()

# Normalized language name -> code from the ISO file. `setdefault` keeps the
# first occurrence of a duplicated name, which is what `list.index` returned.
_iso_code_by_name = {}
for _iso_name, _iso_code in zip(langs_l, langscode_l):
    _iso_code_by_name.setdefault(_iso_name, _iso_code)

# The Tesseract file's "Chinese - Simplified" row is matched against the
# plain "chinese" entry of the ISO file.
_chinese_simplified = _normalize_language_name("Chinese - Simplified")

Tesseract2langdetect = {}
for lang_t, langcode_t in zip(langs_t, langscode_t):
    if lang_t == _chinese_simplified:
        lang_t = "chinese"
    # Only a missing language name is skipped; any other error now surfaces
    # instead of being swallowed by a bare `except`.
    if lang_t in _iso_code_by_name:
        Tesseract2langdetect[langcode_t] = _iso_code_by_name[lang_t]

# Inverse mapping. If several Tesseract codes map to the same ISO code, the
# one encountered last wins.
langdetect2Tesseract = {v: k for k, v in Tesseract2langdetect.items()}
## model / feature extractor / tokenizer

# get device
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import transformers
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    LayoutLMv2FeatureExtractor,
    LayoutLMv2ForTokenClassification,  # LayoutXLMTokenizerFast,
)

# Checkpoint identifiers used below. They were referenced without ever being
# defined, which made the script fail with a NameError at the first
# `from_pretrained` call.
# The LayoutXLM ids reuse the values configured in the key-parameters section.
model_id_layoutxlm = model_id
tokenizer_id_layoutxlm = tokenizer_id
# NOTE(review): the LiLT checkpoint id is not configured anywhere in this file.
# This is the paragraph-level LiLT counterpart of `model_id` published by the
# same author; confirm it is the intended checkpoint.
model_id_lilt = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"

## model LiLT
tokenizer_lilt = AutoTokenizer.from_pretrained(model_id_lilt)
model_lilt = AutoModelForTokenClassification.from_pretrained(model_id_lilt)
model_lilt.to(device)

## model LayoutXLM
model_layoutxlm = LayoutLMv2ForTokenClassification.from_pretrained(model_id_layoutxlm)
model_layoutxlm.to(device)

# feature extractor (apply_ocr=False: the extractor's built-in OCR is disabled)
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)

# tokenizer
tokenizer_layoutxlm = AutoTokenizer.from_pretrained(tokenizer_id_layoutxlm)
# get labels
# For each model, expose the id<->label mappings stored in its config and the
# number of labels.

# LiLT
id2label_lilt, label2id_lilt = (
    model_lilt.config.id2label,
    model_lilt.config.label2id,
)
num_labels_lilt = len(id2label_lilt)

# LayoutXLM
id2label_layoutxlm, label2id_layoutxlm = (
    model_layoutxlm.config.id2label,
    model_layoutxlm.config.label2id,
)
num_labels_layoutxlm = len(id2label_layoutxlm)