| import gradio as gr |
| import pdfplumber |
| import pandas as pd |
| import re |
| import datetime |
|
|
|
|
| def process_pdf(pdf_file): |
| |
| uploaded_filename = pdf_file.name.split("/")[-1].replace(".pdf", "") |
|
|
| |
| timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") |
|
|
| |
| expanded_sample_lines = [] |
| with pdfplumber.open(pdf_file.name) as pdf: |
| for page in pdf.pages: |
| text = page.extract_text() |
| if text: |
| expanded_sample_lines.extend(text.split("\n")) |
|
|
| |
| structured_data = [] |
| current_cota = None |
| current_nome_cliente = None |
| current_vlr_comissao = None |
|
|
| |
| valid_transaction_types = ["PAGAMENTO COMISSAO", "CANCELAMENTO DE PLANO", "INCLUSAO DE PLANO"] |
|
|
| for i in range(len(expanded_sample_lines) - 1): |
| first_line = expanded_sample_lines[i].strip() |
| second_line = expanded_sample_lines[i + 1].strip() |
|
|
| |
| cota_match = re.match(r"(\d{4}\.\d{4}\.\d)\s*(D|E)?", first_line) |
| if cota_match: |
| current_cota = cota_match.group(1) |
| parts = first_line.split(" E ", 1) |
| if len(parts) == 2: |
| current_nome_cliente = parts[1].split(" /")[0].strip() |
| continue |
|
|
| |
| if re.match(r"\d{5}[A-Z]\d{2}", first_line): |
| first_parts = first_line.split() |
|
|
| |
| possible_values = [val for val in first_parts if |
| re.match(r"^\d{1,3}(\.\d{3})*,\d{2}$", val) and val not in ["0,00", "50,00"]] |
|
|
| if possible_values: |
| current_vlr_comissao = possible_values[-1] |
| |
| current_vlr_comissao = current_vlr_comissao.replace(".", "X").replace(",", ".").replace("X", ",") |
|
|
| |
| if current_vlr_comissao.endswith("-"): |
| current_vlr_comissao = "-" + current_vlr_comissao[:-1] |
|
|
| |
| if any(tx_type in first_line for tx_type in valid_transaction_types) and current_vlr_comissao: |
| date_matches = re.findall(r"\d{2}/\d{2}/\d{4}", first_line) |
| dt_venda = date_matches[1] if len(date_matches) > 1 else None |
|
|
| |
| if current_cota and current_nome_cliente and current_vlr_comissao and dt_venda: |
| structured_data.append([current_cota, current_nome_cliente, current_vlr_comissao, dt_venda]) |
|
|
| |
| current_cota = None |
| current_nome_cliente = None |
| current_vlr_comissao = None |
|
|
| |
| df_final = pd.DataFrame(structured_data, columns=["COTA", "NOME DO CLIENTE", "VLR.COMISSAO", "DT VENDA"]) |
|
|
| |
| df_final["VLR.COMISSAO"] = df_final["VLR.COMISSAO"].str.replace(",", "X", regex=False).str.replace(".", ",", |
| regex=False).str.replace( |
| "X", ".", regex=False) |
|
|
| |
| df_final["NOME DO CLIENTE"] = df_final.apply( |
| lambda row: f'CLIENTE: {row["NOME DO CLIENTE"]} COTA: {row["COTA"][:4]} GRUPO: {row["COTA"][5:]}' if pd.notna( |
| row["COTA"]) else row["NOME DO CLIENTE"], |
| axis=1 |
| ) |
|
|
| |
| output_excel_path = f"{uploaded_filename}_final_{timestamp}.xlsx" |
|
|
| df_final.to_excel(output_excel_path, index=False) |
|
|
| return df_final, output_excel_path |
|
|
|
|
| |
| with gr.Blocks() as app: |
| gr.Markdown("## 📄 PDF to Excel Converter - Brazilian Format") |
| gr.Markdown( |
| "Upload a PDF file containing financial data, and receive a properly formatted Excel file for download.") |
|
|
| with gr.Row(): |
| pdf_input = gr.File(label="Upload your PDF file") |
| process_button = gr.Button("Process PDF") |
|
|
| output_df = gr.Dataframe(label="Extracted Data Preview") |
| output_file = gr.File(label="Download Excel File") |
|
|
| process_button.click(fn=process_pdf, inputs=[pdf_input], outputs=[output_df, output_file]) |
|
|
| |
| app.launch(share=True) |
|
|