arthur-lima committed on
Commit
cc37d53
·
1 Parent(s): 25ec3a7

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +314 -0
  2. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import json
3
+ import os
4
+ import shutil
5
+ import time
6
+ from collections import Counter
7
+ from pathlib import Path
8
+
9
+ import fitz
10
+ import numpy as np
11
+ import pandas as pd
12
+ import plotly.express as px
13
+ import streamlit as st
14
+ import torch
15
+ import torch.nn.functional as F
16
+ from easyocr import Reader
17
+ from PIL import Image
18
+ from tqdm import tqdm
19
+ from transformers import (LayoutLMv3FeatureExtractor,
20
+ LayoutLMv3ForSequenceClassification,
21
+ LayoutLMv3Processor, LayoutLMv3TokenizerFast)
22
+
23
# Torch device used for the classifier: the first CUDA GPU when one is
# available, otherwise the CPU. Set DEVICE = "cpu" here to force CPU inference.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# Hub id used to load the LayoutLMv3 tokenizer.
# NOTE(review): "HODEL" is a typo for "MODEL"; the name is kept because other
# code in this file references it.
MICROSOFT_HODEL_NAME = "microsoft/layoutlmv3-base"
# Hub id of the sequence-classification checkpoint loaded by create_model().
MODEL_NAME = "arthur-lima/layoutlmv3-triagem-documentos"
27
+
28
+
29
def create_bounding_box(bbox_data, width_scale: float = 1.0, height_scale: float = 1.0):
    """Convert a polygon into a scaled, axis-aligned bounding box.

    Args:
        bbox_data: Iterable of ``(x, y)`` points (e.g. the four corners of a
            quadrilateral).
        width_scale: Factor applied to the x coordinates. Defaults to 1.0
            (no scaling).
        height_scale: Factor applied to the y coordinates. Defaults to 1.0
            (no scaling).

    Returns:
        ``[left, top, right, bottom]`` as integers; each scaled value is
        truncated with ``int()``.

    Raises:
        ValueError: If ``bbox_data`` is empty or a point is not a pair.
    """
    # Transpose the list of points into one tuple of xs and one tuple of ys.
    xs, ys = zip(*bbox_data)
    return [
        int(min(xs) * width_scale),
        int(min(ys) * height_scale),
        int(max(xs) * width_scale),
        int(max(ys) * height_scale),
    ]
40
+
41
+
42
@st.experimental_singleton
def create_ocr_reader():
    """Build the EasyOCR reader for Portuguese and English.

    GPU usage follows the module-level ``DEVICE`` constant, so switching
    ``DEVICE`` to ``"cpu"`` moves both the classifier and the OCR engine off
    the GPU instead of requiring two separate edits.

    Returns:
        An ``easyocr.Reader`` instance; the ``st.experimental_singleton``
        decorator keeps a single shared instance.
    """
    return Reader(["pt", "en"], gpu=DEVICE.startswith("cuda"))
46
+
47
+
48
@st.experimental_singleton
def create_processor():
    """Build the LayoutLMv3 processor used to encode pages for the model.

    The feature extractor is created with ``apply_ocr=False``, so callers
    must supply the words and boxes themselves. The tokenizer is loaded from
    ``MICROSOFT_HODEL_NAME``.

    Returns:
        A ``LayoutLMv3Processor`` combining the feature extractor and the
        fast tokenizer.
    """
    return LayoutLMv3Processor(
        LayoutLMv3FeatureExtractor(apply_ocr=False),
        LayoutLMv3TokenizerFast.from_pretrained(MICROSOFT_HODEL_NAME),
    )
53
+
54
+
55
@st.experimental_singleton
def create_model(revision="main"):
    """Load the document classifier and prepare it for inference.

    Args:
        revision: Revision of ``MODEL_NAME`` to load, forwarded to
            ``from_pretrained``.

    Returns:
        The ``LayoutLMv3ForSequenceClassification`` model, switched to
        evaluation mode and moved to ``DEVICE``.
    """
    classifier = LayoutLMv3ForSequenceClassification.from_pretrained(
        MODEL_NAME, revision=revision
    )
    classifier.eval()
    return classifier.to(DEVICE)
59
+
60
def pdf2jpg(src: Path, dest_path: Path = None, dpi=100, limit=None):
    """Render every page of a PDF file to a JPG image.

    One image is written per page, named ``<base>-<page number>.jpg`` where
    the page number is zero-based.

    Args:
        src: Path of the PDF file to convert.
        dest_path: Where to write the images.
            * ``None``: next to ``src``, using the stem of ``src`` as base name.
            * A path without suffix: treated as a folder; the stem of ``src``
              is used as base name inside it.
            * A path with a suffix: its parent is the folder and its stem is
              the base name.
        dpi: Rendering resolution; converted to a zoom factor relative to the
            72 dpi PDF default.
        limit: Optional maximum number of pages to render. ``None`` (the
            default) renders every page.
            NOTE(review): the original accepted this argument but ignored it;
            "maximum number of pages" is the presumed intent -- confirm.

    Returns:
        ``True`` when all requested pages were written, ``False`` if any
        error occurred (the error is printed, not raised).
    """
    # Resolve the base path (folder + file stem) for the generated images.
    if dest_path is None:
        dest = src.parent / src.stem
    elif dest_path.suffix == "":
        dest = dest_path / src.stem
    else:
        dest = dest_path.parent / dest_path.stem

    zoom = dpi / 72  # zoom factor, standard: 72 dpi
    magnify = fitz.Matrix(zoom, zoom)  # same magnification in x and y
    try:
        # The context manager closes the document even when rendering fails.
        with fitz.open(src) as doc:
            for page in doc:
                if limit is not None and page.number >= limit:
                    break
                pix = page.get_pixmap(matrix=magnify)  # render page to an image
                pix.save(Path(str(dest) + f"-{page.number}.jpg"))
        return True
    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure to
        # the caller through the return value.
        print(f"\nProblemas na conversão para JPG do arquivo PDF {src}: " + str(e))
        return False
88
+
89
def classifyPDF(
    pdfpath: Path, model, processor, reader: Reader = None, dpi=100
) -> "tuple[int, np.ndarray]":
    """Classify a PDF document by classifying each of its pages.

    The PDF is rendered to one JPG per page in a scratch folder (``temp``, or
    ``temp_classification`` when ``temp`` already exists). Each page is run
    through ``predict``; the document class is the most frequent page class
    and the returned probabilities are the per-page probabilities averaged
    over the pages and then scaled to unit L2 norm.

    Side effects: the scratch folder is (re)created and left on disk with the
    page images and one ``.json`` file per page holding the raw OCR words and
    boxes. Cleaning it up is left to the caller.

    Args:
        pdfpath: Path of the PDF file to classify.
        model: Classifier forwarded to ``predict``.
        processor: Processor forwarded to ``predict``.
        reader: EasyOCR reader; a Portuguese/English one is created when
            ``None``.
        dpi: Resolution used to render the pages.

    Returns:
        ``(predicted_class, probabilities)``: the class id and a NumPy array
        with one L2-normalised score per class.

    Raises:
        RuntimeError: If no page image could be produced from the PDF.
    """
    # Use "temp" unless it already exists (the caller may have put the PDF
    # there), in which case work in a separate folder.
    tmp = Path("temp")
    if tmp.exists():
        tmp = Path("temp_classification")
    shutil.rmtree(tmp, ignore_errors=True)
    os.mkdir(tmp)

    pdf2jpg(pdfpath, tmp / Path(pdfpath.name).with_suffix(".jpg"), dpi)
    if reader is None:
        reader = Reader(["pt", "en"])
    # NOTE(review): pause kept from the original; presumably it waits for the
    # rendered images to land on disk -- confirm whether it is still needed.
    time.sleep(0.5)

    # Snapshot the page images BEFORE any .json sidecar is written, so the
    # page count used for averaging is not inflated by those extra files.
    page_images = sorted(tmp.glob("*.jpg"))
    if not page_images:
        raise RuntimeError(f"No page image could be rendered from {pdfpath}")

    page_classes = []
    prob_sum = None
    for image_path in tqdm(page_images, disable=len(page_images) == 1):
        image_bytes = image_path.read_bytes()

        # Dump the raw (unscaled) OCR output next to the page image.
        # NOTE(review): predict() runs OCR on the same bytes again, so every
        # page is OCR'd twice; letting predict() accept a precomputed OCR
        # result would halve the OCR cost.
        ocr_page = [
            {"word": word, "bounding_box": create_bounding_box(bbox, 1, 1)}
            for bbox, word, _ in reader.readtext(image_bytes, batch_size=1)
        ]
        with image_path.with_suffix(".json").open("w") as f:
            json.dump(ocr_page, f)

        # Close the image file as soon as the page has been classified.
        with Image.open(image_path) as image:
            page_class, page_probs = predict(
                image, image_bytes, reader, processor, model
            )
        page_classes.append(page_class)
        page_probs = np.array(page_probs)
        prob_sum = page_probs if prob_sum is None else prob_sum + page_probs

    # Majority vote across pages for the class; mean across pages for scores.
    predicted_class = Counter(page_classes).most_common(1)[0][0]
    probabilities = prob_sum / len(page_images)

    # Scale the score vector to unit L2 norm, as the original did.
    probabilities = probabilities / np.sqrt(np.sum(probabilities**2))
    return predicted_class, probabilities
176
+
177
+
178
def predict(
    image: Image.Image,
    image_bytes: bytes,
    reader: Reader,
    processor: LayoutLMv3Processor,
    model: LayoutLMv3ForSequenceClassification,
):
    """Classify a single page image.

    The page is OCR'd with ``reader``, the word boxes are rescaled to the
    0-1000 coordinate space, and words, boxes and pixels are encoded by
    ``processor`` (truncated/padded to 512 tokens) and fed to ``model``.

    Args:
        image: The page as a PIL image (used for its size and pixels).
        image_bytes: The same page as encoded bytes, passed to the OCR reader.
        reader: EasyOCR reader.
        processor: LayoutLMv3 processor created with ``apply_ocr=False``.
        model: Sequence-classification model, already on ``DEVICE``.

    Returns:
        ``(predicted_class, probabilities)``: the arg-max class id as an
        ``int`` and the softmax probabilities as a flat list of floats.
    """
    ocr_result = reader.readtext(image_bytes)

    # Uploaded images may be grayscale, CMYK or have an alpha channel; the
    # model's image pipeline expects three-channel RGB input.
    image = image.convert("RGB")

    width, height = image.size
    width_scale = 1000 / width
    height_scale = 1000 / height

    words = []
    boxes = []
    for bbox, word, _ in ocr_result:
        box = create_bounding_box(bbox, width_scale, height_scale)
        # OCR boxes can poke slightly outside the page; keep every coordinate
        # inside the 0-1000 range the model's position embeddings accept.
        boxes.append([min(max(coord, 0), 1000) for coord in box])
        words.append(word)

    encoding = processor(
        image,
        words,
        boxes=boxes,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    with torch.inference_mode():
        output = model(
            input_ids=encoding["input_ids"].to(DEVICE),
            attention_mask=encoding["attention_mask"].to(DEVICE),
            bbox=encoding["bbox"].to(DEVICE),
            pixel_values=encoding["pixel_values"].to(DEVICE),
        )

    logits = output.logits
    # Turn the logits into probabilities for the confidence chart.
    probabilities = F.softmax(logits, dim=-1).flatten().tolist()
    return logits.argmax().item(), probabilities
223
+
224
+
225
# Create the OCR reader, the LayoutLMv3 processor and the classifier once at
# script start; the classifier is pinned to a specific revision of the
# checkpoint.
reader = create_ocr_reader()
processor = create_processor()
model = create_model(revision="e34c270")

# Logo: three columns are created and the image goes in the middle one
# (c1 and c3 are left empty).
c1, c2, c3 = st.columns([2.7,5,1])
c2.image("resources/previsa_cinza.png", width=250)
232
+
233
+ # Caixas de Upload
234
# Upload boxes, laid out in two columns. Every uploader's result is collected:
# the original assigned all nine widgets to the same variable, so only the
# last box ("Extrato Bancário") was ever processed and files dropped in any
# other box were silently ignored.
col1, col2 = st.columns(2)
uploads = []
with col1:
    for label in (
        "Upload: Notas Fiscais de Entrada",
        "Upload: Notas Fiscais de Saída",
        "Upload: Notas Fiscais de Retenção",
        "Upload: Notas Fiscais de Serviços",
    ):
        uploads.append(st.file_uploader(label, ["jpg", "pdf"]))
with col2:
    for label in (
        "Upload: Documentos Aluguel",
        "Upload: Documentos Contábeis",
        "Upload: Documentos Tributos",
        "Upload: Documentos MEI",
        "Upload: Extrato Bancário",
    ):
        uploads.append(st.file_uploader(label, ["jpg", "pdf"]))

# The rest of the script handles a single document: take the first box (in
# display order) that currently holds a file, or None when all are empty.
uploaded_file = next((f for f in uploads if f is not None), None)
246
+
247
def plot_confianca(probabilities, model):
    """Draw a bar chart with the confidence assigned to each document type.

    Args:
        probabilities: One score per class, in the same order as the values
            of ``model.config.id2label``.
        model: Model whose ``config.id2label`` provides the class names.
    """
    with st.spinner("Criando gráficos de confiança..."):
        labels = list(model.config.id2label.values())
        chart_data = pd.DataFrame(
            {"Tipo Documento": labels, "Confiança": probabilities}
        )

        fig = px.bar(chart_data, x="Tipo Documento", y="Confiança")
        # White plot background with a single bar colour.
        fig.update_layout({'plot_bgcolor': '#FFFFFF'})
        fig.update_traces(marker_color='#fcaf17')

        st.plotly_chart(fig, use_container_width=True)
262
+
263
+ # Processamento
264
if uploaded_file is not None:
    c1, c2, c3 = st.columns([2.4,5,1])

    try:
        # First try to decode the upload as a PDF; if the bytes are not a
        # valid document the except branch below treats them as an image.
        if os.path.exists("temp"):
            shutil.rmtree("temp", ignore_errors=True)
        os.mkdir("temp")
        pdfPath = Path("temp/temp.pdf")

        # The context manager closes the in-memory document once the PDF has
        # been written to disk and its preview rendered, instead of leaving
        # it open on every script rerun.
        with fitz.Document(stream=uploaded_file.getvalue()) as doc:
            doc.save(pdfPath)

            # Show only the first page as a preview.
            for page in doc:
                pix = page.get_pixmap()
                pix.save("temp/icon-page-1.jpg")
                c2.image("temp/icon-page-1.jpg", "Página do documento", width=300)
                break

        # Run the prediction on the saved PDF.
        with st.spinner("Fazendo previsão..."):
            predicted_class, probabilities = classifyPDF(pdfPath, model, processor, reader)
        print(probabilities)
    except fitz.fitz.FileDataError:
        # Not a PDF: load the uploaded bytes as an image.
        image_bytes = uploaded_file.getvalue()
        bytes_data = io.BytesIO(image_bytes)
        image = Image.open(bytes_data)

        # Show the image.
        c2.image(image, "Página do documento", width=300)

        # Run the prediction directly on the image.
        with st.spinner("Fazendo previsão..."):
            predicted_class, probabilities = predict(
                image, image_bytes, reader, processor, model
            )
    finally:
        # Remove the scratch folders if they still exist.
        if os.path.exists("temp"):
            shutil.rmtree("temp", ignore_errors=True)
        if os.path.exists("temp_classification"):
            shutil.rmtree("temp_classification", ignore_errors=True)

    # Show the result on screen.
    predicted_label = model.config.id2label[predicted_class]
    st.markdown(f"Tipo do documento previsto: **{predicted_label}**")

    plot_confianca(probabilities, model)
313
+
314
+
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ PyMuPDF==1.21.1
2
+ numpy==1.24.2
3
+ streamlit==1.15.2
4
+ transformers==4.25.1
5
+ pandas==2.0.0
6
+ plotly-express==0.4.1
7
+ python-dotenv==1.0.0
8
+ Pillow==9.4.0
9
+ torch==2.0.0
10
+ easyocr==1.6.2