import gradio as gr import pdfplumber import pandas as pd import re import datetime def process_pdf(pdf_file): # Extract the original PDF filename uploaded_filename = pdf_file.name.split("/")[-1].replace(".pdf", "") # Generate timestamp timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") # Open the uploaded PDF file expanded_sample_lines = [] with pdfplumber.open(pdf_file.name) as pdf: for page in pdf.pages: text = page.extract_text() if text: expanded_sample_lines.extend(text.split("\n")) # Process extracted text to extract relevant data structured_data = [] current_cota = None current_nome_cliente = None current_vlr_comissao = None # Patterns for identifying DT VENDA rows (including all transaction types) valid_transaction_types = ["PAGAMENTO COMISSAO", "CANCELAMENTO DE PLANO", "INCLUSAO DE PLANO"] for i in range(len(expanded_sample_lines) - 1): first_line = expanded_sample_lines[i].strip() second_line = expanded_sample_lines[i + 1].strip() # Identify COTA and NOME DO CLIENTE (first line) cota_match = re.match(r"(\d{4}\.\d{4}\.\d)\s*(D|E)?", first_line) if cota_match: current_cota = cota_match.group(1) # Extract COTA without 'D' or 'E' parts = first_line.split(" E ", 1) # Splitting at " E " to separate COTA and Name if len(parts) == 2: current_nome_cliente = parts[1].split(" /")[0].strip() # Extract name before slash "/" continue # Identify VLR.COMISSAO dynamically from rows starting with 8-digit values (e.g., 20433I08) if re.match(r"\d{5}[A-Z]\d{2}", first_line): # Matches patterns like 20433I08 first_parts = first_line.split() # Locate the VLR.COMISSAO dynamically by looking for monetary values that are NOT 0,00 or 50,00 possible_values = [val for val in first_parts if re.match(r"^\d{1,3}(\.\d{3})*,\d{2}$", val) and val not in ["0,00", "50,00"]] if possible_values: current_vlr_comissao = possible_values[-1] # Last monetary value that isn't 0,00 or 50,00 # Swap comma and dot while capturing the value current_vlr_comissao = current_vlr_comissao.replace(".", "X").replace(",", ".").replace("X", ",") # Handle negative commission values (e.g., "5.000,00-") if current_vlr_comissao.endswith("-"): current_vlr_comissao = "-" + current_vlr_comissao[:-1] # Identify DT VENDA from transaction types if any(tx_type in first_line for tx_type in valid_transaction_types) and current_vlr_comissao: date_matches = re.findall(r"\d{2}/\d{2}/\d{4}", first_line) dt_venda = date_matches[1] if len(date_matches) > 1 else None # Get second date if available # Append structured data if all values are present if current_cota and current_nome_cliente and current_vlr_comissao and dt_venda: structured_data.append([current_cota, current_nome_cliente, current_vlr_comissao, dt_venda]) # Reset values for the next entry current_cota = None current_nome_cliente = None current_vlr_comissao = None # Convert extracted data into a DataFrame df_final = pd.DataFrame(structured_data, columns=["COTA", "NOME DO CLIENTE", "VLR.COMISSAO", "DT VENDA"]) # Ensure correct Brazilian formatting for VLR.COMISSAO df_final["VLR.COMISSAO"] = df_final["VLR.COMISSAO"].str.replace(",", "X", regex=False).str.replace(".", ",", regex=False).str.replace( "X", ".", regex=False) # Update the NOME DO CLIENTE column with the required pattern df_final["NOME DO CLIENTE"] = df_final.apply( lambda row: f'CLIENTE: {row["NOME DO CLIENTE"]} COTA: {row["COTA"][:4]} GRUPO: {row["COTA"][5:]}' if pd.notna( row["COTA"]) else row["NOME DO CLIENTE"], axis=1 ) # Generate dynamic output filename output_excel_path = f"{uploaded_filename}_final_{timestamp}.xlsx" df_final.to_excel(output_excel_path, index=False) return df_final, output_excel_path # Gradio UI with gr.Blocks() as app: gr.Markdown("## 📄 PDF to Excel Converter - Brazilian Format") gr.Markdown( "Upload a PDF file containing financial data, and receive a properly formatted Excel file for download.") with gr.Row(): pdf_input = gr.File(label="Upload your PDF file") process_button = gr.Button("Process PDF") output_df = gr.Dataframe(label="Extracted Data Preview") output_file = gr.File(label="Download Excel File") process_button.click(fn=process_pdf, inputs=[pdf_input], outputs=[output_df, output_file]) # Launch the Gradio app app.launch(share=True)