Spaces:
Build error
Build error
Commit
·
cc37d53
1
Parent(s):
25ec3a7
Upload 2 files
Browse files- app.py +314 -0
- requirements.txt +10 -0
app.py
ADDED
|
@@ -0,0 +1,314 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import shutil
|
| 5 |
+
import time
|
| 6 |
+
from collections import Counter
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
import fitz
|
| 10 |
+
import numpy as np
|
| 11 |
+
import pandas as pd
|
| 12 |
+
import plotly.express as px
|
| 13 |
+
import streamlit as st
|
| 14 |
+
import torch
|
| 15 |
+
import torch.nn.functional as F
|
| 16 |
+
from easyocr import Reader
|
| 17 |
+
from PIL import Image
|
| 18 |
+
from tqdm import tqdm
|
| 19 |
+
from transformers import (LayoutLMv3FeatureExtractor,
|
| 20 |
+
LayoutLMv3ForSequenceClassification,
|
| 21 |
+
LayoutLMv3Processor, LayoutLMv3TokenizerFast)
|
| 22 |
+
|
| 23 |
+
# Inference device: the first CUDA GPU when torch can see one, otherwise CPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# Hub id the tokenizer is loaded from in create_processor().
# NOTE: the identifier is misspelled ("HODEL"); it is kept as-is because
# create_processor() references it by this name.
MICROSOFT_HODEL_NAME = "microsoft/layoutlmv3-base"
# Hub id of the sequence-classification checkpoint loaded in create_model().
MODEL_NAME = "arthur-lima/layoutlmv3-triagem-documentos"
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def create_bounding_box(bbox_data, width_scale: float = 1.0, height_scale: float = 1.0):
    """Collapse a polygon into an axis-aligned ``[left, top, right, bottom]`` box.

    Args:
        bbox_data: Iterable of ``(x, y)`` points.
        width_scale: Factor applied to the x extremes before truncating to
            ``int``. Defaults to 1.0 (no horizontal scaling).
        height_scale: Factor applied to the y extremes before truncating to
            ``int``. Defaults to 1.0 (no vertical scaling).

    Returns:
        ``[left, top, right, bottom]`` as a list of ints.

    Raises:
        ValueError: If ``bbox_data`` contains no points (``min`` of an empty
            sequence).
    """
    # Materialize once so one-shot iterables can be traversed for both axes.
    points = list(bbox_data)
    xs = [x for x, _ in points]
    ys = [y for _, y in points]
    return [
        int(min(xs) * width_scale),
        int(min(ys) * height_scale),
        int(max(xs) * width_scale),
        int(max(ys) * height_scale),
    ]
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@st.experimental_singleton
def create_ocr_reader():
    """Build the EasyOCR reader used by the app.

    The function is wrapped in ``st.experimental_singleton``, so Streamlit
    hands back the cached reader instead of rebuilding it on later calls.

    Returns:
        An EasyOCR ``Reader`` for Portuguese and English, created with
        ``gpu=True``.
    """
    languages = ["pt", "en"]
    return Reader(languages, gpu=True)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@st.experimental_singleton
def create_processor():
    """Build the LayoutLMv3 processor used to encode pages for the model.

    The feature extractor is created with ``apply_ocr=False`` because words
    and boxes are supplied by the caller; the tokenizer is loaded from
    ``MICROSOFT_HODEL_NAME``.

    Returns:
        A ``LayoutLMv3Processor`` combining the feature extractor and the
        tokenizer.
    """
    return LayoutLMv3Processor(
        LayoutLMv3FeatureExtractor(apply_ocr=False),
        LayoutLMv3TokenizerFast.from_pretrained(MICROSOFT_HODEL_NAME),
    )
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
@st.experimental_singleton
def create_model(revision="main"):
    """Load the document classifier and prepare it for inference.

    Args:
        revision: Revision of ``MODEL_NAME`` passed to ``from_pretrained``.

    Returns:
        The ``LayoutLMv3ForSequenceClassification`` model, switched to eval
        mode and moved to ``DEVICE``.
    """
    classifier = LayoutLMv3ForSequenceClassification.from_pretrained(
        MODEL_NAME, revision=revision
    )
    classifier.eval()
    return classifier.to(DEVICE)
|
| 59 |
+
|
| 60 |
+
def pdf2jpg(src: Path, dest_path: Path = None, dpi=100, limit=None):
    """Render every page of a PDF file to its own JPG image.

    Each page is written to ``<base>-<page.number>.jpg``, where ``<base>`` is
    chosen from ``dest_path``:

    * ``None``: next to ``src``, using the stem of ``src``;
    * a path without suffix: treated as a folder, using the stem of ``src``;
    * a path with a suffix: that path with its suffix removed.

    Args:
        src: Path of the PDF to convert.
        dest_path: Optional destination folder or file path (see above).
        dpi: Rendering resolution; the zoom factor is ``dpi / 72``.
        limit: Accepted for compatibility; currently not used.

    Returns:
        ``True`` when all pages were rendered, ``False`` when any exception
        was raised while opening or rendering (the error is printed).
    """
    # Work out the base name the per-page suffix is appended to.
    if dest_path is None:
        base = src.parent / src.stem
    elif dest_path.suffix == "":
        base = dest_path / src.stem
    else:
        base = dest_path.parent / dest_path.stem

    zoom = dpi / 72  # zoom factor relative to the standard 72 dpi
    magnify = fitz.Matrix(zoom, zoom)  # same magnification on both axes
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(src) as doc:
            for page in doc:
                pix = page.get_pixmap(matrix=magnify)  # render page to an image
                pix.save(Path(f"{base}-{page.number}.jpg"))
        return True
    except Exception as e:
        # Best-effort conversion: report the problem and signal failure.
        print(f"\nProblemas na conversão para JPG do arquivo PDF {src}: " + str(e))
        return False
|
| 88 |
+
|
| 89 |
+
def classifyPDF(
    pdfpath: Path, model, processor, reader: Reader = None, dpi=100
) -> tuple:
    """Classify a PDF by rendering its pages to JPG and classifying each page.

    The PDF is rendered with ``pdf2jpg`` into a scratch folder (``temp``, or
    ``temp_classification`` when ``temp`` already exists). For every page the
    OCR words and their pixel boxes are dumped to a ``.json`` file next to the
    page image, and the page is classified with ``predict``. The scratch
    folder is left on disk when the function returns.

    Args:
        pdfpath: Path of the PDF to classify.
        model: Classifier forwarded to ``predict``.
        processor: Processor forwarded to ``predict``.
        reader: EasyOCR reader; a new ``Reader(["pt", "en"])`` is created
            when ``None``.
        dpi: Rendering resolution forwarded to ``pdf2jpg``.

    Returns:
        ``(predicted_class, probabilities)``: the most common page class, and
        a numpy array holding the per-class mean of the page probabilities
        scaled to unit Euclidean length (so it does not sum to 1).

    Raises:
        RuntimeError: If rendering produced no page image.
    """

    def _pixel_box(bbox_data, width_scale: float = 1, height_scale: float = 1):
        # Axis-aligned [left, top, right, bottom] box around the OCR polygon.
        xs = [x for x, _ in bbox_data]
        ys = [y for _, y in bbox_data]
        return [
            int(min(xs) * width_scale),
            int(min(ys) * height_scale),
            int(max(xs) * width_scale),
            int(max(ys) * height_scale),
        ]

    # Create an empty scratch folder for the rendered pages.
    tmp = Path("temp")
    if os.path.exists(tmp):
        tmp = Path("temp_classification")
    shutil.rmtree(tmp, ignore_errors=True)
    os.mkdir(tmp)
    image_path = tmp / Path(pdfpath.name).with_suffix(".jpg")
    pdf2jpg(pdfpath, image_path, dpi)
    if reader is None:
        reader = Reader(["pt", "en"])
    # NOTE(review): fixed delay kept from the original; its purpose is not
    # evident from the code - confirm before removing.
    time.sleep(0.5)

    # Snapshot the page list now: the JSON sidecars written below land in the
    # same folder and must not be counted as pages.
    page_paths = sorted(tmp / name for name in os.listdir(tmp))
    if not page_paths:
        raise RuntimeError(f"No page image could be rendered from {pdfpath}")

    page_classes = []
    page_probabilities = []
    for page_path in tqdm(page_paths):
        with open(page_path, "rb") as f:
            image_bytes = f.read()

        # Dump the OCR words and pixel boxes next to the page image.
        # NOTE(review): predict() runs reader.readtext on the same bytes
        # again, so every page is OCR'd twice.
        ocr_result = reader.readtext(image_bytes, batch_size=1)
        ocr_page = [
            {"word": word, "bounding_box": _pixel_box(bbox)}
            for bbox, word, _ in ocr_result
        ]
        with page_path.with_suffix(".json").open("w") as f:
            json.dump(ocr_page, f)

        # Classify the page; the image file is closed as soon as it is used.
        with Image.open(page_path) as image:
            page_class, page_probs = predict(
                image, image_bytes, reader, processor, model
            )
        page_classes.append(page_class)
        page_probabilities.append(page_probs)

    # The document class is the most common page class.
    predicted_class = Counter(page_classes).most_common(1)[0][0]
    # Average over the number of pages actually classified.
    probabilities = np.mean(np.array(page_probabilities), axis=0)

    # NOTE(review): the vector is scaled to unit L2 norm, whereas predict()
    # returns plain softmax probabilities - confirm this is intended.
    probabilities = probabilities / np.sqrt(np.sum(probabilities**2))
    return predicted_class, probabilities
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def predict(
    image: Image.Image,
    image_bytes: bytes,
    reader: Reader,
    processor: LayoutLMv3Processor,
    model: LayoutLMv3ForSequenceClassification,
):
    """Classify a single page image.

    The page is OCR'd, the OCR boxes are rescaled from pixels to a 0-1000
    grid, and image, words and boxes are encoded and fed to the model.

    Args:
        image: The page as a PIL image (used for its size and as model input).
        image_bytes: The encoded bytes of the same page, passed to the OCR
            reader.
        reader: EasyOCR reader used to extract words and polygons.
        processor: LayoutLMv3 processor that builds the model inputs.
        model: LayoutLMv3 sequence classifier.

    Returns:
        ``(predicted_class, probabilities)``: the index of the largest logit
        as an ``int`` and the softmax probabilities as a flat list of floats.
    """
    ocr_result = reader.readtext(image_bytes)

    # Factors that map pixel coordinates onto the 0-1000 grid.
    width, height = image.size
    width_scale = 1000 / width
    height_scale = 1000 / height

    words = []
    boxes = []
    for bbox, word, _ in ocr_result:
        box = create_bounding_box(bbox, width_scale, height_scale)
        # Keep every coordinate inside 0-1000 in case an OCR polygon extends
        # past the image borders.
        boxes.append([min(max(coord, 0), 1000) for coord in box])
        words.append(word)

    encoding = processor(
        image,
        words,
        boxes=boxes,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    with torch.inference_mode():
        output = model(
            input_ids=encoding["input_ids"].to(DEVICE),
            attention_mask=encoding["attention_mask"].to(DEVICE),
            bbox=encoding["bbox"].to(DEVICE),
            pixel_values=encoding["pixel_values"].to(DEVICE),
        )

    logits = output.logits
    predicted_class = logits.argmax()
    # Turn the logits into probabilities.
    probabilities = F.softmax(logits, dim=-1).flatten().tolist()
    return predicted_class.detach().item(), probabilities
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
# Shared OCR reader, processor and classifier (built by the factory functions).
reader = create_ocr_reader()
processor = create_processor()
model = create_model(revision="e34c270")

# Logo
c1, c2, c3 = st.columns([2.7, 5, 1])
c2.image("resources/previsa_cinza.png", width=250)

# Upload boxes
col1, col2 = st.columns(2)
uploads = []
with col1:
    for upload_label in (
        "Upload: Notas Fiscais de Entrada",
        "Upload: Notas Fiscais de Saída",
        "Upload: Notas Fiscais de Retenção",
        "Upload: Notas Fiscais de Serviços",
    ):
        uploads.append(st.file_uploader(upload_label, ["jpg", "pdf"]))
with col2:
    for upload_label in (
        "Upload: Documentos Aluguel",
        "Upload: Documentos Contábeis",
        "Upload: Documentos Tributos",
        "Upload: Documentos MEI",
        "Upload: Extrato Bancário",
    ):
        uploads.append(st.file_uploader(upload_label, ["jpg", "pdf"]))

# Every box used to overwrite the same variable, so only the last box was
# ever processed. Pick the file from the last box that has one: the last box
# still wins when it is filled, and the other boxes are no longer ignored.
uploaded_file = next((f for f in reversed(uploads) if f is not None), None)
|
| 246 |
+
|
| 247 |
+
def plot_confianca(probabilities, model):
    """Draw a bar chart of the confidence assigned to each document type.

    Args:
        probabilities: One confidence value per label, in the order of
            ``model.config.id2label.values()``.
        model: Model whose ``config.id2label`` provides the label names.
    """
    with st.spinner("Criando gráficos de confiança..."):
        labels = list(model.config.id2label.values())
        df_predictions = pd.DataFrame(
            {"Tipo Documento": labels, "Confiança": probabilities}
        )
        fig = px.bar(df_predictions, x="Tipo Documento", y="Confiança")
        fig.update_layout(plot_bgcolor="#FFFFFF")
        fig.update_traces(marker_color="#fcaf17")
        st.plotly_chart(fig, use_container_width=True)
|
| 262 |
+
|
| 263 |
+
# Processing
if uploaded_file is not None:
    c1, c2, c3 = st.columns([2.4, 5, 1])

    try:
        # Try to decode the upload as a PDF first.
        if os.path.exists("temp"):
            shutil.rmtree("temp", ignore_errors=True)
        os.mkdir("temp")
        pdfPath = Path("temp/temp.pdf")
        # The context manager closes the document once it has been saved and
        # previewed.
        with fitz.Document(stream=uploaded_file.getvalue()) as doc:
            doc.save(pdfPath)

            # Show the first page only.
            for page in doc:
                pix = page.get_pixmap()
                pix.save("temp/icon-page-1.jpg")
                c2.image("temp/icon-page-1.jpg", "Página do documento", width=300)
                break

        # Run the prediction on the saved PDF.
        with st.spinner("Fazendo previsão..."):
            predicted_class, probabilities = classifyPDF(pdfPath, model, processor, reader)
    except fitz.fitz.FileDataError:
        # Not a PDF: load the upload as an image instead.
        image_bytes = uploaded_file.getvalue()
        bytes_data = io.BytesIO(image_bytes)
        image = Image.open(bytes_data)

        # Show the image.
        c2.image(image, "Página do documento", width=300)

        # Run the prediction on the image.
        with st.spinner("Fazendo previsão..."):
            predicted_class, probabilities = predict(
                image, image_bytes, reader, processor, model
            )
    finally:
        # Remove the scratch folders; missing folders are ignored.
        shutil.rmtree("temp", ignore_errors=True)
        shutil.rmtree("temp_classification", ignore_errors=True)

    # Show the result on the page.
    predicted_label = model.config.id2label[predicted_class]
    st.markdown(f"Tipo do documento previsto: **{predicted_label}**")

    plot_confianca(probabilities, model)
|
| 313 |
+
|
| 314 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
PyMuPDF==1.21.1
|
| 2 |
+
numpy==1.24.2
|
| 3 |
+
streamlit==1.15.2
|
| 4 |
+
transformers==4.25.1
|
| 5 |
+
pandas==2.0.0
|
| 6 |
+
plotly-express==0.4.1
|
| 7 |
+
python-dotenv==1.0.0
|
| 8 |
+
Pillow==9.4.0
|
| 9 |
+
torch==2.0.0
|
| 10 |
+
easyocr==1.6.2
|