|
|
|
|
|
# Standard-library imports come first: they are all that is needed to
# bootstrap the third-party dependencies installed below.
import os
import subprocess
import sys
import tempfile

# Install runtime dependencies with the pip that belongs to the running
# interpreter (``sys.executable -m pip``). A bare ``pip`` found on PATH may
# belong to a different Python environment, in which case the imports below
# would still fail after a "successful" install.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "gradio", "pandas", "transformers", "torch"],
    check=True,
)
subprocess.run(
    [sys.executable, "-m", "pip", "install", "git+https://github.com/docling-project/docling-core.git"],
    check=True,
)

# Third-party imports deliberately follow the install step, so a fresh
# environment does not crash on a missing package before pip has run.
import gradio as gr  # noqa: E402
import pandas as pd  # noqa: E402

# Model location is exported before the docling imports below.
# NOTE(review): presumably docling reads DOCLING_MODEL_HOME to locate its
# model files - confirm against the installed version.
os.environ["DOCLING_MODEL_HOME"] = os.path.expanduser("~/.docling/models")

# Make sure the model directory exists.
model_dir = os.path.expanduser("~/.docling/models/tableformer/accurate")
os.makedirs(model_dir, exist_ok=True)

# NOTE(review): verify that these module paths exist in the docling-core
# revision installed above.
from docling_core.pipelines.table import TableExtractionPipeline  # noqa: E402
from docling_core.models import ModelManager  # noqa: E402
|
|
|
|
|
def process_pdf(file):
    """Extract the tables of an uploaded PDF and return them as DataFrames.

    Args:
        file: The uploaded PDF. Either a filesystem path (``str`` or
            ``os.PathLike``) or a binary file-like object exposing ``read()``.
            ``None`` (nothing uploaded) is reported through the status
            message instead of raising.

    Returns:
        A ``(status, tables)`` tuple. ``status`` is a user-facing message.
        ``tables`` is a list of ``(title, DataFrame)`` pairs, or ``None`` when
        nothing was uploaded or no table was detected.

    Side effects:
        Writes every extracted table to ``tabela_<n>.csv`` in the current
        working directory (existing files with the same name are overwritten).
    """
    if file is None:
        return "Nenhum arquivo enviado.", None

    # The upload may arrive as a path or as an open file object, so accept
    # both.
    if isinstance(file, (str, os.PathLike)):
        with open(file, "rb") as src:
            pdf_bytes = src.read()
    else:
        pdf_bytes = file.read()

    # delete=False keeps the file on disk after the handle is closed, so the
    # pipeline can open it by path; it is removed in the ``finally`` below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
        tmp_pdf.write(pdf_bytes)
        pdf_path = tmp_pdf.name

    try:
        manager = ModelManager(auto_download=True)
        pipeline = TableExtractionPipeline(model_manager=manager)
        result = pipeline.run(pdf_path)
        tables = result["tables"]

        if not tables:
            return "Nenhuma tabela detectada.", None

        outputs = []
        for number, table in enumerate(tables, start=1):
            df = pd.DataFrame(table.rows)
            df.to_csv(f"tabela_{number}.csv", index=False)
            outputs.append((f"Tabela {number} (página {table.page_number})", df))

        return "Tabelas extraídas com sucesso!", outputs
    finally:
        # Always remove the temporary copy, including on the early
        # "no tables" return and when the pipeline raises.
        os.unlink(pdf_path)
|
|
|
|
|
def show_tables(file):
    """Run table extraction and build the updates for the output components.

    Args:
        file: The uploaded PDF, forwarded unchanged to ``process_pdf``.

    Returns:
        A 5-tuple ``(status, md1, df1, md2, df2)``: the status message
        followed by one title update and one table update for each of the two
        display slots. The tuple always has the same length, so it matches
        the five output components of the click handler regardless of how
        many tables were found. Slots without a table are hidden; tables
        beyond the second are not displayed.
    """
    # Number of title/table pairs in the UI; must match the components
    # listed in the click handler's ``outputs``.
    table_slots = 2

    status, results = process_pdf(file)

    updates = []
    for title, df in (results or [])[:table_slots]:
        # The output components start hidden, so visibility is set explicitly.
        updates.append(gr.update(value=f"### {title}", visible=True))
        updates.append(gr.update(value=df, visible=True))

    # Hide unused slots so tables from a previous run do not linger.
    while len(updates) < 2 * table_slots:
        updates.append(gr.update(visible=False))

    return (status, *updates)
|
|
|
|
|
# Assemble the UI: an upload row, a status box and two initially hidden
# title/table pairs that receive the values returned by show_tables.
# NOTE(review): the source lost its indentation; the nesting below (Row holds
# only the upload and the button, everything else sits directly in Blocks)
# is reconstructed from the original layout - confirm.
with gr.Blocks() as demo:
    gr.Markdown("# 🧾 TableFormer via Docling")

    with gr.Row():
        file = gr.File(file_types=[".pdf"], label="Envie o PDF do balancete")
        btn = gr.Button("Processar")

    status = gr.Textbox(label="Status")

    # Each pair is a Markdown title followed by its Dataframe, created in
    # the same order as before so the page layout is unchanged.
    output1_md, output1_df = gr.Markdown(visible=False), gr.Dataframe(visible=False)
    output2_md, output2_df = gr.Markdown(visible=False), gr.Dataframe(visible=False)

    btn.click(
        fn=show_tables,
        inputs=file,
        outputs=[
            status,
            output1_md,
            output1_df,
            output2_md,
            output2_df,
        ],
    )

# Start the web server for the app.
demo.launch()