llatos's picture
Upload folder using huggingface_hub
dd217a7 verified
import gradio as gr
import pdfplumber
import pandas as pd
import re
import datetime
def process_pdf(pdf_file):
# Extract the original PDF filename
uploaded_filename = pdf_file.name.split("/")[-1].replace(".pdf", "")
# Generate timestamp
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
# Open the uploaded PDF file
expanded_sample_lines = []
with pdfplumber.open(pdf_file.name) as pdf:
for page in pdf.pages:
text = page.extract_text()
if text:
expanded_sample_lines.extend(text.split("\n"))
# Process extracted text to extract relevant data
structured_data = []
current_cota = None
current_nome_cliente = None
current_vlr_comissao = None
# Patterns for identifying DT VENDA rows (including all transaction types)
valid_transaction_types = ["PAGAMENTO COMISSAO", "CANCELAMENTO DE PLANO", "INCLUSAO DE PLANO"]
for i in range(len(expanded_sample_lines) - 1):
first_line = expanded_sample_lines[i].strip()
second_line = expanded_sample_lines[i + 1].strip()
# Identify COTA and NOME DO CLIENTE (first line)
cota_match = re.match(r"(\d{4}\.\d{4}\.\d)\s*(D|E)?", first_line)
if cota_match:
current_cota = cota_match.group(1) # Extract COTA without 'D' or 'E'
parts = first_line.split(" E ", 1) # Splitting at " E " to separate COTA and Name
if len(parts) == 2:
current_nome_cliente = parts[1].split(" /")[0].strip() # Extract name before slash "/"
continue
# Identify VLR.COMISSAO dynamically from rows starting with 8-digit values (e.g., 20433I08)
if re.match(r"\d{5}[A-Z]\d{2}", first_line): # Matches patterns like 20433I08
first_parts = first_line.split()
# Locate the VLR.COMISSAO dynamically by looking for monetary values that are NOT 0,00 or 50,00
possible_values = [val for val in first_parts if
re.match(r"^\d{1,3}(\.\d{3})*,\d{2}$", val) and val not in ["0,00", "50,00"]]
if possible_values:
current_vlr_comissao = possible_values[-1] # Last monetary value that isn't 0,00 or 50,00
# Swap comma and dot while capturing the value
current_vlr_comissao = current_vlr_comissao.replace(".", "X").replace(",", ".").replace("X", ",")
# Handle negative commission values (e.g., "5.000,00-")
if current_vlr_comissao.endswith("-"):
current_vlr_comissao = "-" + current_vlr_comissao[:-1]
# Identify DT VENDA from transaction types
if any(tx_type in first_line for tx_type in valid_transaction_types) and current_vlr_comissao:
date_matches = re.findall(r"\d{2}/\d{2}/\d{4}", first_line)
dt_venda = date_matches[1] if len(date_matches) > 1 else None # Get second date if available
# Append structured data if all values are present
if current_cota and current_nome_cliente and current_vlr_comissao and dt_venda:
structured_data.append([current_cota, current_nome_cliente, current_vlr_comissao, dt_venda])
# Reset values for the next entry
current_cota = None
current_nome_cliente = None
current_vlr_comissao = None
# Convert extracted data into a DataFrame
df_final = pd.DataFrame(structured_data, columns=["COTA", "NOME DO CLIENTE", "VLR.COMISSAO", "DT VENDA"])
# Ensure correct Brazilian formatting for VLR.COMISSAO
df_final["VLR.COMISSAO"] = df_final["VLR.COMISSAO"].str.replace(",", "X", regex=False).str.replace(".", ",",
regex=False).str.replace(
"X", ".", regex=False)
# Update the NOME DO CLIENTE column with the required pattern
df_final["NOME DO CLIENTE"] = df_final.apply(
lambda row: f'CLIENTE: {row["NOME DO CLIENTE"]} COTA: {row["COTA"][:4]} GRUPO: {row["COTA"][5:]}' if pd.notna(
row["COTA"]) else row["NOME DO CLIENTE"],
axis=1
)
# Generate dynamic output filename
output_excel_path = f"{uploaded_filename}_final_{timestamp}.xlsx"
df_final.to_excel(output_excel_path, index=False)
return df_final, output_excel_path
# Gradio UI
with gr.Blocks() as app:
gr.Markdown("## 📄 PDF to Excel Converter - Brazilian Format")
gr.Markdown(
"Upload a PDF file containing financial data, and receive a properly formatted Excel file for download.")
with gr.Row():
pdf_input = gr.File(label="Upload your PDF file")
process_button = gr.Button("Process PDF")
output_df = gr.Dataframe(label="Extracted Data Preview")
output_file = gr.File(label="Download Excel File")
process_button.click(fn=process_pdf, inputs=[pdf_input], outputs=[output_df, output_file])
# Launch the Gradio app
app.launch(share=True)