Spaces:

llatos
/

unimoConvertPdfToExcel

Paused

App Files Files Community

unimoConvertPdfToExcel / main.py

llatos

Upload folder using huggingface_hub

dd217a7 verified about 1 year ago

raw

history blame contribute delete

5.01 kB

	import gradio as gr
	import pdfplumber
	import pandas as pd
	import re
	import datetime


	def process_pdf(pdf_file):
	# Extract the original PDF filename
	uploaded_filename = pdf_file.name.split("/")[-1].replace(".pdf", "")

	# Generate timestamp
	timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

	# Open the uploaded PDF file
	expanded_sample_lines = []
	with pdfplumber.open(pdf_file.name) as pdf:
	for page in pdf.pages:
	text = page.extract_text()
	if text:
	expanded_sample_lines.extend(text.split("\n"))

	# Process extracted text to extract relevant data
	structured_data = []
	current_cota = None
	current_nome_cliente = None
	current_vlr_comissao = None

	# Patterns for identifying DT VENDA rows (including all transaction types)
	valid_transaction_types = ["PAGAMENTO COMISSAO", "CANCELAMENTO DE PLANO", "INCLUSAO DE PLANO"]

	for i in range(len(expanded_sample_lines) - 1):
	first_line = expanded_sample_lines[i].strip()
	second_line = expanded_sample_lines[i + 1].strip()

	# Identify COTA and NOME DO CLIENTE (first line)
	cota_match = re.match(r"(\d{4}\.\d{4}\.\d)\s*(D\|E)?", first_line)
	if cota_match:
	current_cota = cota_match.group(1) # Extract COTA without 'D' or 'E'
	parts = first_line.split(" E ", 1) # Splitting at " E " to separate COTA and Name
	if len(parts) == 2:
	current_nome_cliente = parts[1].split(" /")[0].strip() # Extract name before slash "/"
	continue

	# Identify VLR.COMISSAO dynamically from rows starting with 8-digit values (e.g., 20433I08)
	if re.match(r"\d{5}[A-Z]\d{2}", first_line): # Matches patterns like 20433I08
	first_parts = first_line.split()

	# Locate the VLR.COMISSAO dynamically by looking for monetary values that are NOT 0,00 or 50,00
	possible_values = [val for val in first_parts if
	re.match(r"^\d{1,3}(\.\d{3})*,\d{2}$", val) and val not in ["0,00", "50,00"]]

	if possible_values:
	current_vlr_comissao = possible_values[-1] # Last monetary value that isn't 0,00 or 50,00
	# Swap comma and dot while capturing the value
	current_vlr_comissao = current_vlr_comissao.replace(".", "X").replace(",", ".").replace("X", ",")

	# Handle negative commission values (e.g., "5.000,00-")
	if current_vlr_comissao.endswith("-"):
	current_vlr_comissao = "-" + current_vlr_comissao[:-1]

	# Identify DT VENDA from transaction types
	if any(tx_type in first_line for tx_type in valid_transaction_types) and current_vlr_comissao:
	date_matches = re.findall(r"\d{2}/\d{2}/\d{4}", first_line)
	dt_venda = date_matches[1] if len(date_matches) > 1 else None # Get second date if available

	# Append structured data if all values are present
	if current_cota and current_nome_cliente and current_vlr_comissao and dt_venda:
	structured_data.append([current_cota, current_nome_cliente, current_vlr_comissao, dt_venda])

	# Reset values for the next entry
	current_cota = None
	current_nome_cliente = None
	current_vlr_comissao = None

	# Convert extracted data into a DataFrame
	df_final = pd.DataFrame(structured_data, columns=["COTA", "NOME DO CLIENTE", "VLR.COMISSAO", "DT VENDA"])

	# Ensure correct Brazilian formatting for VLR.COMISSAO
	df_final["VLR.COMISSAO"] = df_final["VLR.COMISSAO"].str.replace(",", "X", regex=False).str.replace(".", ",",
	regex=False).str.replace(
	"X", ".", regex=False)

	# Update the NOME DO CLIENTE column with the required pattern
	df_final["NOME DO CLIENTE"] = df_final.apply(
	lambda row: f'CLIENTE: {row["NOME DO CLIENTE"]} COTA: {row["COTA"][:4]} GRUPO: {row["COTA"][5:]}' if pd.notna(
	row["COTA"]) else row["NOME DO CLIENTE"],
	axis=1
	)

	# Generate dynamic output filename
	output_excel_path = f"{uploaded_filename}_final_{timestamp}.xlsx"

	df_final.to_excel(output_excel_path, index=False)

	return df_final, output_excel_path


	# Gradio UI
	with gr.Blocks() as app:
	gr.Markdown("## 📄 PDF to Excel Converter - Brazilian Format")
	gr.Markdown(
	"Upload a PDF file containing financial data, and receive a properly formatted Excel file for download.")

	with gr.Row():
	pdf_input = gr.File(label="Upload your PDF file")
	process_button = gr.Button("Process PDF")

	output_df = gr.Dataframe(label="Extracted Data Preview")
	output_file = gr.File(label="Download Excel File")

	process_button.click(fn=process_pdf, inputs=[pdf_input], outputs=[output_df, output_file])

	# Launch the Gradio app
	app.launch(share=True)