Spaces:

llatos
/

unimoConvertPdfToExcel

Paused

File size: 5,010 Bytes

import gradio as gr
import pdfplumber
import pandas as pd
import re
import datetime


def process_pdf(pdf_file):
    # Extract the original PDF filename
    uploaded_filename = pdf_file.name.split("/")[-1].replace(".pdf", "")

    # Generate timestamp
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    # Open the uploaded PDF file
    expanded_sample_lines = []
    with pdfplumber.open(pdf_file.name) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                expanded_sample_lines.extend(text.split("\n"))

    # Process extracted text to extract relevant data
    structured_data = []
    current_cota = None
    current_nome_cliente = None
    current_vlr_comissao = None

    # Patterns for identifying DT VENDA rows (including all transaction types)
    valid_transaction_types = ["PAGAMENTO COMISSAO", "CANCELAMENTO DE PLANO", "INCLUSAO DE PLANO"]

    for i in range(len(expanded_sample_lines) - 1):
        first_line = expanded_sample_lines[i].strip()
        second_line = expanded_sample_lines[i + 1].strip()

        # Identify COTA and NOME DO CLIENTE (first line)
        cota_match = re.match(r"(\d{4}\.\d{4}\.\d)\s*(D|E)?", first_line)
        if cota_match:
            current_cota = cota_match.group(1)  # Extract COTA without 'D' or 'E'
            parts = first_line.split(" E ", 1)  # Splitting at " E " to separate COTA and Name
            if len(parts) == 2:
                current_nome_cliente = parts[1].split(" /")[0].strip()  # Extract name before slash "/"
            continue

        # Identify VLR.COMISSAO dynamically from rows starting with 8-digit values (e.g., 20433I08)
        if re.match(r"\d{5}[A-Z]\d{2}", first_line):  # Matches patterns like 20433I08
            first_parts = first_line.split()

            # Locate the VLR.COMISSAO dynamically by looking for monetary values that are NOT 0,00 or 50,00
            possible_values = [val for val in first_parts if
                               re.match(r"^\d{1,3}(\.\d{3})*,\d{2}$", val) and val not in ["0,00", "50,00"]]

            if possible_values:
                current_vlr_comissao = possible_values[-1]  # Last monetary value that isn't 0,00 or 50,00
                # Swap comma and dot while capturing the value
                current_vlr_comissao = current_vlr_comissao.replace(".", "X").replace(",", ".").replace("X", ",")

                # Handle negative commission values (e.g., "5.000,00-")
                if current_vlr_comissao.endswith("-"):
                    current_vlr_comissao = "-" + current_vlr_comissao[:-1]

        # Identify DT VENDA from transaction types
        if any(tx_type in first_line for tx_type in valid_transaction_types) and current_vlr_comissao:
            date_matches = re.findall(r"\d{2}/\d{2}/\d{4}", first_line)
            dt_venda = date_matches[1] if len(date_matches) > 1 else None  # Get second date if available

            # Append structured data if all values are present
            if current_cota and current_nome_cliente and current_vlr_comissao and dt_venda:
                structured_data.append([current_cota, current_nome_cliente, current_vlr_comissao, dt_venda])

            # Reset values for the next entry
            current_cota = None
            current_nome_cliente = None
            current_vlr_comissao = None

    # Convert extracted data into a DataFrame
    df_final = pd.DataFrame(structured_data, columns=["COTA", "NOME DO CLIENTE", "VLR.COMISSAO", "DT VENDA"])

    # Ensure correct Brazilian formatting for VLR.COMISSAO
    df_final["VLR.COMISSAO"] = df_final["VLR.COMISSAO"].str.replace(",", "X", regex=False).str.replace(".", ",",
                                                                                                       regex=False).str.replace(
        "X", ".", regex=False)

    # Update the NOME DO CLIENTE column with the required pattern
    df_final["NOME DO CLIENTE"] = df_final.apply(
        lambda row: f'CLIENTE: {row["NOME DO CLIENTE"]} COTA: {row["COTA"][:4]} GRUPO: {row["COTA"][5:]}' if pd.notna(
            row["COTA"]) else row["NOME DO CLIENTE"],
        axis=1
    )

    # Generate dynamic output filename
    output_excel_path = f"{uploaded_filename}_final_{timestamp}.xlsx"

    df_final.to_excel(output_excel_path, index=False)

    return df_final, output_excel_path


# Gradio UI
with gr.Blocks() as app:
    gr.Markdown("## 📄 PDF to Excel Converter - Brazilian Format")
    gr.Markdown(
        "Upload a PDF file containing financial data, and receive a properly formatted Excel file for download.")

    with gr.Row():
        pdf_input = gr.File(label="Upload your PDF file")
        process_button = gr.Button("Process PDF")

    output_df = gr.Dataframe(label="Extracted Data Preview")
    output_file = gr.File(label="Download Excel File")

    process_button.click(fn=process_pdf, inputs=[pdf_input], outputs=[output_df, output_file])

# Launch the Gradio app
app.launch(share=True)