File size: 5,010 Bytes
998035e
 
 
 
dd217a7
 
998035e
 
dd217a7
 
 
 
 
 
998035e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd217a7
 
 
998035e
 
 
 
 
dd217a7
998035e
 
 
dd217a7
 
 
 
 
 
998035e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd217a7
 
 
 
998035e
2764e8c
 
dd217a7
 
 
2764e8c
 
dd217a7
 
 
998035e
 
2764e8c
998035e
dd217a7
998035e
 
 
dd217a7
 
998035e
 
 
 
 
2764e8c
998035e
 
2764e8c
998035e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import gradio as gr
import pdfplumber
import pandas as pd
import re
import datetime


def process_pdf(pdf_file):
    # Extract the original PDF filename
    uploaded_filename = pdf_file.name.split("/")[-1].replace(".pdf", "")

    # Generate timestamp
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    # Open the uploaded PDF file
    expanded_sample_lines = []
    with pdfplumber.open(pdf_file.name) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                expanded_sample_lines.extend(text.split("\n"))

    # Process extracted text to extract relevant data
    structured_data = []
    current_cota = None
    current_nome_cliente = None
    current_vlr_comissao = None

    # Patterns for identifying DT VENDA rows (including all transaction types)
    valid_transaction_types = ["PAGAMENTO COMISSAO", "CANCELAMENTO DE PLANO", "INCLUSAO DE PLANO"]

    for i in range(len(expanded_sample_lines) - 1):
        first_line = expanded_sample_lines[i].strip()
        second_line = expanded_sample_lines[i + 1].strip()

        # Identify COTA and NOME DO CLIENTE (first line)
        cota_match = re.match(r"(\d{4}\.\d{4}\.\d)\s*(D|E)?", first_line)
        if cota_match:
            current_cota = cota_match.group(1)  # Extract COTA without 'D' or 'E'
            parts = first_line.split(" E ", 1)  # Splitting at " E " to separate COTA and Name
            if len(parts) == 2:
                current_nome_cliente = parts[1].split(" /")[0].strip()  # Extract name before slash "/"
            continue

        # Identify VLR.COMISSAO dynamically from rows starting with 8-digit values (e.g., 20433I08)
        if re.match(r"\d{5}[A-Z]\d{2}", first_line):  # Matches patterns like 20433I08
            first_parts = first_line.split()

            # Locate the VLR.COMISSAO dynamically by looking for monetary values that are NOT 0,00 or 50,00
            possible_values = [val for val in first_parts if
                               re.match(r"^\d{1,3}(\.\d{3})*,\d{2}$", val) and val not in ["0,00", "50,00"]]

            if possible_values:
                current_vlr_comissao = possible_values[-1]  # Last monetary value that isn't 0,00 or 50,00
                # Swap comma and dot while capturing the value
                current_vlr_comissao = current_vlr_comissao.replace(".", "X").replace(",", ".").replace("X", ",")

                # Handle negative commission values (e.g., "5.000,00-")
                if current_vlr_comissao.endswith("-"):
                    current_vlr_comissao = "-" + current_vlr_comissao[:-1]

        # Identify DT VENDA from transaction types
        if any(tx_type in first_line for tx_type in valid_transaction_types) and current_vlr_comissao:
            date_matches = re.findall(r"\d{2}/\d{2}/\d{4}", first_line)
            dt_venda = date_matches[1] if len(date_matches) > 1 else None  # Get second date if available

            # Append structured data if all values are present
            if current_cota and current_nome_cliente and current_vlr_comissao and dt_venda:
                structured_data.append([current_cota, current_nome_cliente, current_vlr_comissao, dt_venda])

            # Reset values for the next entry
            current_cota = None
            current_nome_cliente = None
            current_vlr_comissao = None

    # Convert extracted data into a DataFrame
    df_final = pd.DataFrame(structured_data, columns=["COTA", "NOME DO CLIENTE", "VLR.COMISSAO", "DT VENDA"])

    # Ensure correct Brazilian formatting for VLR.COMISSAO
    df_final["VLR.COMISSAO"] = df_final["VLR.COMISSAO"].str.replace(",", "X", regex=False).str.replace(".", ",",
                                                                                                       regex=False).str.replace(
        "X", ".", regex=False)

    # Update the NOME DO CLIENTE column with the required pattern
    df_final["NOME DO CLIENTE"] = df_final.apply(
        lambda row: f'CLIENTE: {row["NOME DO CLIENTE"]} COTA: {row["COTA"][:4]} GRUPO: {row["COTA"][5:]}' if pd.notna(
            row["COTA"]) else row["NOME DO CLIENTE"],
        axis=1
    )

    # Generate dynamic output filename
    output_excel_path = f"{uploaded_filename}_final_{timestamp}.xlsx"

    df_final.to_excel(output_excel_path, index=False)

    return df_final, output_excel_path


# Gradio UI
with gr.Blocks() as app:
    gr.Markdown("## 📄 PDF to Excel Converter - Brazilian Format")
    gr.Markdown(
        "Upload a PDF file containing financial data, and receive a properly formatted Excel file for download.")

    with gr.Row():
        pdf_input = gr.File(label="Upload your PDF file")
        process_button = gr.Button("Process PDF")

    output_df = gr.Dataframe(label="Extracted Data Preview")
    output_file = gr.File(label="Download Excel File")

    process_button.click(fn=process_pdf, inputs=[pdf_input], outputs=[output_df, output_file])

# Launch the Gradio app
app.launch(share=True)