Inference-comparison-APP-Document-Understanding

Runtime error

App Files Files Community

Inference-comparison-APP-Document-Understanding / files /functions.py

pierreguillou

Update files/functions.py

a2cda1e over 2 years ago

raw

history blame

5.41 kB

	import os

	# Runtime installation of PyTorch (pinned to 1.10.0+cu111) and of detectron2 (built from its git repository).
	# History of the workaround: detectron2 had not released packages for pytorch 1.9
	# (issue: https://github.com/facebookresearch/detectron2/issues/3158), so these commands were used before:
	#   pip install torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
	#   pip install -q detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html
	# See https://detectron2.readthedocs.io/tutorials/install.html for instructions
	for _pip_command in (
	    'pip install -q torch==1.10.0+cu111 torchvision==0.11+cu111 -f https://download.pytorch.org/whl/torch_stable.html',
	    'pip install git+https://github.com/facebookresearch/detectron2.git',
	):
	    # the exit status is deliberately left unchecked, exactly as before
	    os.system(_pip_command)

	import detectron2
	from detectron2.utils.logger import setup_logger
	# set up detectron2's logger (called without arguments, so with its defaults)
	setup_logger()

	import gradio as gr
	import re
	import string
	import torch

	from operator import itemgetter
	import collections

	import pypdf
	from pypdf import PdfReader
	from pypdf.errors import PdfReadError

	import pypdfium2 as pdfium
	import langdetect
	from langdetect import detect_langs

	import pandas as pd
	import numpy as np
	import random
	import tempfile
	import itertools

	from matplotlib import font_manager
	from PIL import Image, ImageDraw, ImageFont
	import cv2

	import pathlib
	from pathlib import Path
	import shutil

	# Tesseract
	# Start-up diagnostics: print the Debian version, the issue banner and the
	# tesseract packages that apt knows about. The text is only printed.
	for _diagnostic_cmd in ('cat /etc/debian_version', 'cat /etc/issue', 'apt search tesseract'):
	    # close every pipe explicitly instead of leaking it until garbage collection
	    with os.popen(_diagnostic_cmd) as _pipe:
	        print(_pipe.read())
	import pytesseract

	## Key parameters

	# color name assigned to each category label (insertion order is kept)
	label2color = dict([
	    ('Caption', 'brown'),
	    ('Footnote', 'orange'),
	    ('Formula', 'gray'),
	    ('List-item', 'yellow'),
	    ('Page-footer', 'red'),
	    ('Page-header', 'red'),
	    ('Picture', 'violet'),
	    ('Section-header', 'orange'),
	    ('Table', 'green'),
	    ('Text', 'blue'),
	    ('Title', 'pink'),
	])

	# bounding boxes start and end of a sequence
	cls_box = [0, 0, 0, 0]
	sep_box = [1000, 1000, 1000, 1000]

	# model
	model_id = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"

	# tokenizer
	tokenizer_id = "xlm-roberta-base"

	# Per-model identifiers: the model-loading statements of this file read
	# model_id_lilt, model_id_layoutxlm and tokenizer_id_layoutxlm, which were
	# never defined (NameError at start-up). model_id / tokenizer_id are kept
	# so that existing users of those names keep working.
	model_id_layoutxlm = model_id
	tokenizer_id_layoutxlm = tokenizer_id
	# NOTE(review): the LiLT checkpoint id is not given anywhere in this file; this is
	# assumed to be the paragraph-level ml512 LiLT counterpart of model_id - confirm.
	model_id_lilt = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"

	# (tokenization) The maximum length of a feature (sequence),
	# read from the number embedded in the model id
	if "384" in model_id:
	    max_length = 384
	elif "512" in model_id:
	    max_length = 512
	else:
	    # fail here with a clear message instead of leaving max_length undefined
	    raise ValueError("Error with max_length of chunks!")

	# (tokenization) overlap
	doc_stride = 128 # The authorized overlap between two parts of the context when splitting is needed.

	# max PDF page images that will be displayed
	max_imgboxes = 2

	# get files: fetch the auxiliary files from the Hugging Face Space and copy them into examples_dir
	from huggingface_hub import hf_hub_download

	examples_dir = 'files/'
	Path(examples_dir).mkdir(parents=True, exist_ok=True)

	files = [
	    "example.pdf",
	    "blank.pdf",
	    "blank.png",
	    "languages_iso.csv",
	    "languages_tesseract.csv",
	    "wo_content.png",
	]
	for file_name in files:
	    # hf_hub_download returns a local path to the file; copy it from there into examples_dir
	    path_to_file = hf_hub_download(
	        repo_id="pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v2",
	        filename=f"files/{file_name}",
	        repo_type="space",
	    )
	    shutil.copy(path_to_file, examples_dir)

	# path to files
	image_wo_content = f"{examples_dir}wo_content.png" # image without content
	pdf_blank = f"{examples_dir}blank.pdf" # blank PDF
	image_blank = f"{examples_dir}blank.png" # blank image

	## get langdetect2Tesseract dictionary
	# Builds two module-level lookup tables from the language CSV files:
	#   Tesseract2langdetect: "LangCode" of languages_tesseract.csv -> "LangCode" of languages_iso.csv
	#   langdetect2Tesseract: the inverse mapping
	# Rows of the two files are matched on their normalized "Language" name.
	t = "files/languages_tesseract.csv"
	l = "files/languages_iso.csv"

	df_t = pd.read_csv(t)
	df_l = pd.read_csv(l)

	# translation table built once: deletes every character of string.punctuation
	_punctuation_table = str.maketrans('', '', string.punctuation)

	def _normalize_language(name):
	    """Return *name* lower-cased, stripped and with punctuation removed."""
	    return name.lower().strip().translate(_punctuation_table)

	langs_t = [_normalize_language(lang_t) for lang_t in df_t["Language"].to_list()]
	langs_l = [_normalize_language(lang_l) for lang_l in df_l["Language"].to_list()]
	langscode_t = df_t["LangCode"].to_list()
	langscode_l = df_l["LangCode"].to_list()

	# "Chinese - Simplified" (Tesseract list) is looked up as "chinese" in the ISO list
	_chinese_simplified = _normalize_language("Chinese - Simplified")

	Tesseract2langdetect = {}
	for lang_t, langcode_t in zip(langs_t, langscode_t):
	    if lang_t == _chinese_simplified:
	        lang_t = "chinese"
	    try:
	        # first row of the ISO list carrying the same normalized name
	        index = langs_l.index(lang_t)
	    except ValueError:
	        # this language has no entry in the ISO list: leave it out
	        continue
	    langcode_l = langscode_l[index]
	    Tesseract2langdetect[langcode_t] = langcode_l

	# inverse mapping; when several Tesseract codes share one code, the last one wins
	langdetect2Tesseract = {v: k for k, v in Tesseract2langdetect.items()}

	## model / feature extractor / tokenizer

	# get device: use CUDA when torch reports it as available, otherwise the CPU
	import torch
	_device_name = "cuda" if torch.cuda.is_available() else "cpu"
	device = torch.device(_device_name)

	## model LiLT
	import transformers
	from transformers import AutoTokenizer, AutoModelForTokenClassification

	# tokenizer and token-classification model are loaded from the same checkpoint id
	tokenizer_lilt = AutoTokenizer.from_pretrained(model_id_lilt)
	# .to() returns the module itself, so the model is loaded and moved to `device` in one statement
	model_lilt = AutoModelForTokenClassification.from_pretrained(model_id_lilt).to(device)

	## model LayoutXLM
	from transformers import LayoutLMv2ForTokenClassification # LayoutXLMTokenizerFast,
	from transformers import LayoutLMv2FeatureExtractor
	from transformers import AutoTokenizer

	# .to() returns the module itself, so the model is loaded and moved to `device` in one statement
	model_layoutxlm = LayoutLMv2ForTokenClassification.from_pretrained(model_id_layoutxlm).to(device)

	# feature extractor (created with its built-in OCR switched off)
	feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)

	# tokenizer
	tokenizer_layoutxlm = AutoTokenizer.from_pretrained(tokenizer_id_layoutxlm)

	# get labels: id <-> label maps and label count, read from each model's config
	_config_lilt = model_lilt.config
	id2label_lilt, label2id_lilt = _config_lilt.id2label, _config_lilt.label2id
	num_labels_lilt = len(id2label_lilt)

	_config_layoutxlm = model_layoutxlm.config
	id2label_layoutxlm, label2id_layoutxlm = _config_layoutxlm.id2label, _config_layoutxlm.label2id
	num_labels_layoutxlm = len(id2label_layoutxlm)