Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,13 +1,61 @@
|
|
| 1 |
from pathlib import Path
|
| 2 |
import gradio as gr
|
| 3 |
import fitz
|
|
|
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
def upload_file(filepath):
|
| 6 |
name = Path(filepath).name
|
| 7 |
# load pdf
|
| 8 |
doc = fitz.open(filepath)
|
|
|
|
| 9 |
# now create the excel file
|
| 10 |
-
return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download
|
| 11 |
|
| 12 |
def download_file():
    """Return the component updates that show the upload button and hide the download button."""
    upload_button = gr.UploadButton(visible=True)
    download_button = gr.DownloadButton(visible=False)
    return [upload_button, download_button]
|
|
|
|
| 1 |
from pathlib import Path
|
| 2 |
import gradio as gr
|
| 3 |
import fitz
|
| 4 |
+
import pandas as pd
|
| 5 |
|
| 6 |
+
# Column titles of the extracted table, in left-to-right order.
HEADERS = [
    'TR [min]', 'Nome', 'Area', 'Fator Capacidade', 'Pratos Teóricos',
    'Sinal-ruído (USP)', 'Resolução', 'Assimetría', 'Altura', 'Pureza',
]
# Half-open horizontal range [x_min, x_max) used to decide which column a
# word belongs to, one entry per header. Some ranges overlap; a word is
# assigned to the first not-yet-filled column whose range contains it.
LIMITS = [
    (17, 50), (50, 130), (130, 184), (184, 240), (240, 311),
    (311, 360), (330, 418), (400, 487), (450, 533), (500, 600),
]
# Number of words occupied by the header row (multi-word titles count once
# per word); this many words are skipped to reach the first data row.
HEADER_WORD_COUNT = sum(len(title.split()) for title in HEADERS)
# Word that marks the end of the table on a page.
END_OF_TABLE_WORD = 'Relatório'
# Index of the only column whose value may span several words.
NAME_COLUMN = 1
# File written by create_excel.
OUTPUT_FILENAME = "tabla.xlsx"


def _within_limits(x, col):
    """Return True when horizontal position *x* falls inside column *col*."""
    low, high = LIMITS[col]
    return low <= x < high


def _find_header(words):
    """Return the index of the first 'TR' word followed by '[min]', or None."""
    for i, (current, following) in enumerate(zip(words, words[1:])):
        if current[4] == 'TR' and following[4] == '[min]':
            return i
    return None


def _read_row(words, idx):
    """Read one table row starting at ``words[idx]``.

    Returns ``(row, next_idx)`` where *row* has exactly one entry per header
    (``None`` for a column with no matching word) and *next_idx* is the index
    of the first word that was not consumed.
    """
    row = []
    for col in range(len(HEADERS)):
        if idx >= len(words) or not _within_limits(words[idx][0], col):
            # Either the page ran out of words or the next word belongs to
            # a later column: this cell is empty.
            row.append(None)
        elif col == NAME_COLUMN:
            # Names may contain spaces, so gather every consecutive word that
            # still lies inside the name column.
            parts = []
            while idx < len(words) and _within_limits(words[idx][0], col):
                parts.append(words[idx][4])
                idx += 1
            row.append(" ".join(parts))
        else:
            row.append(words[idx][4])
            idx += 1
    return row, idx


def extract_table_rows(doc):
    """Collect the table found on every page of *doc* into a dict of columns.

    Args:
        doc: an iterable of pages. Each page must provide
            ``get_text(option="words")`` returning a sequence of word tuples
            whose item 0 is the word's horizontal position and item 4 is the
            word text (the layout PyMuPDF documents for the "words" option).

    Returns:
        A dict mapping each entry of ``HEADERS`` to a list of cell values.
        All lists have the same length; missing cells are ``None``.

    On each page the table starts ``HEADER_WORD_COUNT`` words after the first
    'TR' '[min]' word pair and ends at ``END_OF_TABLE_WORD`` or at the end of
    the page. Pages without that header contribute nothing.
    """
    data = {title: [] for title in HEADERS}
    for page in doc:
        words = page.get_text(option="words")
        header_idx = _find_header(words)
        if header_idx is None:
            continue
        idx = header_idx + HEADER_WORD_COUNT
        while idx < len(words):
            if words[idx][4] == END_OF_TABLE_WORD:
                break
            row_start = idx
            row, idx = _read_row(words, idx)
            if idx == row_start:
                # The word lies outside every column range. Skip it instead
                # of recording an empty row; otherwise the loop would never
                # advance past this word.
                idx += 1
                continue
            for title, cell in zip(HEADERS, row):
                data[title].append(cell)
    return data


def create_excel(doc):
    """Extract the table from *doc* and save it as an Excel workbook.

    Args:
        doc: the opened PDF document (see ``extract_table_rows`` for the
            interface that is actually used).

    Returns:
        The path of the workbook that was written (``OUTPUT_FILENAME``,
        relative to the current working directory).
    """
    df_table = pd.DataFrame.from_dict(extract_table_rows(doc))
    # to_excel returns None, so hand back the path for the caller to serve.
    df_table.to_excel(OUTPUT_FILENAME, index=False)
    return OUTPUT_FILENAME
|
| 51 |
+
|
| 52 |
def upload_file(filepath):
    """Convert the uploaded PDF and swap the upload button for a download button.

    Args:
        filepath: path of the uploaded PDF file.

    Returns:
        A two-item list of component updates: the upload button hidden, and
        the download button made visible with whatever ``create_excel``
        returned as its value.
    """
    # Open the PDF in a context manager so the document is closed once the
    # table has been extracted, even if extraction raises.
    with fitz.open(filepath) as doc:
        # NOTE(review): gr.DownloadButton expects `value` to be a file path
        # or URL - confirm that create_excel returns the path it wrote.
        excel_file = create_excel(doc)
    return [
        gr.UploadButton(visible=False),
        gr.DownloadButton(label="Download tabla.xlsx", value=excel_file, visible=True),
    ]
|
| 59 |
|
| 60 |
def download_file():
    """Return the component updates that show the upload button and hide the download button."""
    upload_button = gr.UploadButton(visible=True)
    download_button = gr.DownloadButton(visible=False)
    return [upload_button, download_button]
|