Upload folder using huggingface_hub
Browse files
- .idea/workspace.xml +1 -1
- Fechamento 0602 Preenchido_final_20250311_152105.xlsx +0 -0
- main.py +31 -12
.idea/workspace.xml
CHANGED
|
@@ -49,7 +49,7 @@
|
|
| 49 |
<option name="number" value="Default" />
|
| 50 |
<option name="presentableId" value="Default" />
|
| 51 |
<updated>1741724175531</updated>
|
| 52 |
-
<workItem from="1741724176678" duration="
|
| 53 |
</task>
|
| 54 |
<task id="LOCAL-00001" summary="Update project configuration and dependencies Updated the Python SDK configuration in the `.iml` file to use Python 3.12. Enabled app sharing in `main.py` by setting `share=True` in the Gradio app launch. Added a `requirements.txt` file to document and manage project dependencies.">
|
| 55 |
<option name="closed" value="true" />
|
|
|
|
| 49 |
<option name="number" value="Default" />
|
| 50 |
<option name="presentableId" value="Default" />
|
| 51 |
<updated>1741724175531</updated>
|
| 52 |
+
<workItem from="1741724176678" duration="3053000" />
|
| 53 |
</task>
|
| 54 |
<task id="LOCAL-00001" summary="Update project configuration and dependencies Updated the Python SDK configuration in the `.iml` file to use Python 3.12. Enabled app sharing in `main.py` by setting `share=True` in the Gradio app launch. Added a `requirements.txt` file to document and manage project dependencies.">
|
| 55 |
<option name="closed" value="true" />
|
Fechamento 0602 Preenchido_final_20250311_152105.xlsx
ADDED
|
Binary file (6.14 kB). View file
|
|
|
main.py
CHANGED
|
@@ -2,8 +2,16 @@ import gradio as gr
|
|
| 2 |
import pdfplumber
|
| 3 |
import pandas as pd
|
| 4 |
import re
|
|
|
|
|
|
|
| 5 |
|
| 6 |
def process_pdf(pdf_file):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
# Open the uploaded PDF file
|
| 8 |
expanded_sample_lines = []
|
| 9 |
with pdfplumber.open(pdf_file.name) as pdf:
|
|
@@ -26,20 +34,24 @@ def process_pdf(pdf_file):
|
|
| 26 |
second_line = expanded_sample_lines[i + 1].strip()
|
| 27 |
|
| 28 |
# Identify COTA and NOME DO CLIENTE (first line)
|
| 29 |
-
|
|
|
|
|
|
|
| 30 |
parts = first_line.split(" E ", 1) # Splitting at " E " to separate COTA and Name
|
| 31 |
if len(parts) == 2:
|
| 32 |
-
current_cota = parts[0].strip()
|
| 33 |
current_nome_cliente = parts[1].split(" /")[0].strip() # Extract name before slash "/"
|
| 34 |
continue
|
| 35 |
|
| 36 |
-
# Identify VLR.COMISSAO from rows starting with an 8-character code: five digits, one uppercase letter, two digits (e.g., 20433I08)
|
| 37 |
if re.match(r"\d{5}[A-Z]\d{2}", first_line): # Matches patterns like 20433I08
|
| 38 |
first_parts = first_line.split()
|
| 39 |
|
| 40 |
-
#
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
| 43 |
# Swap comma and dot while capturing the value
|
| 44 |
current_vlr_comissao = current_vlr_comissao.replace(".", "X").replace(",", ".").replace("X", ",")
|
| 45 |
|
|
@@ -64,24 +76,31 @@ def process_pdf(pdf_file):
|
|
| 64 |
# Convert extracted data into a DataFrame
|
| 65 |
df_final = pd.DataFrame(structured_data, columns=["COTA", "NOME DO CLIENTE", "VLR.COMISSAO", "DT VENDA"])
|
| 66 |
|
| 67 |
-
# Ensure correct Brazilian formatting
|
| 68 |
-
df_final["VLR.COMISSAO"] = df_final["VLR.COMISSAO"].str.replace(",", "X", regex=False).str.replace(".", ",",
|
|
|
|
|
|
|
| 69 |
|
| 70 |
# Update the NOME DO CLIENTE column with the required pattern
|
| 71 |
df_final["NOME DO CLIENTE"] = df_final.apply(
|
| 72 |
-
lambda row: f'CLIENTE: {row["NOME DO CLIENTE"]} COTA: {row["COTA"][:4]} GRUPO: {row["COTA"][5:]}'
|
|
|
|
|
|
|
| 73 |
)
|
| 74 |
|
| 75 |
-
#
|
| 76 |
-
output_excel_path = "
|
|
|
|
| 77 |
df_final.to_excel(output_excel_path, index=False)
|
| 78 |
|
| 79 |
return df_final, output_excel_path
|
| 80 |
|
|
|
|
| 81 |
# Gradio UI
|
| 82 |
with gr.Blocks() as app:
|
| 83 |
gr.Markdown("## 📄 PDF to Excel Converter - Brazilian Format")
|
| 84 |
-
gr.Markdown(
|
|
|
|
| 85 |
|
| 86 |
with gr.Row():
|
| 87 |
pdf_input = gr.File(label="Upload your PDF file")
|
|
|
|
| 2 |
import pdfplumber
|
| 3 |
import pandas as pd
|
| 4 |
import re
|
| 5 |
+
import datetime
|
| 6 |
+
|
| 7 |
|
| 8 |
def process_pdf(pdf_file):
|
| 9 |
+
# Extract the original PDF filename
|
| 10 |
+
uploaded_filename = pdf_file.name.split("/")[-1].replace(".pdf", "")
|
| 11 |
+
|
| 12 |
+
# Generate timestamp
|
| 13 |
+
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 14 |
+
|
| 15 |
# Open the uploaded PDF file
|
| 16 |
expanded_sample_lines = []
|
| 17 |
with pdfplumber.open(pdf_file.name) as pdf:
|
|
|
|
| 34 |
second_line = expanded_sample_lines[i + 1].strip()
|
| 35 |
|
| 36 |
# Identify COTA and NOME DO CLIENTE (first line)
|
| 37 |
+
cota_match = re.match(r"(\d{4}\.\d{4}\.\d)\s*(D|E)?", first_line)
|
| 38 |
+
if cota_match:
|
| 39 |
+
current_cota = cota_match.group(1) # Extract COTA without 'D' or 'E'
|
| 40 |
parts = first_line.split(" E ", 1) # Splitting at " E " to separate COTA and Name
|
| 41 |
if len(parts) == 2:
|
|
|
|
| 42 |
current_nome_cliente = parts[1].split(" /")[0].strip() # Extract name before slash "/"
|
| 43 |
continue
|
| 44 |
|
| 45 |
+
# Identify VLR.COMISSAO dynamically from rows starting with an 8-character code: five digits, one uppercase letter, two digits (e.g., 20433I08)
|
| 46 |
if re.match(r"\d{5}[A-Z]\d{2}", first_line): # Matches patterns like 20433I08
|
| 47 |
first_parts = first_line.split()
|
| 48 |
|
| 49 |
+
# Locate the VLR.COMISSAO dynamically by looking for monetary values that are NOT 0,00 or 50,00
|
| 50 |
+
possible_values = [val for val in first_parts if
|
| 51 |
+
re.match(r"^\d{1,3}(\.\d{3})*,\d{2}$", val) and val not in ["0,00", "50,00"]]
|
| 52 |
+
|
| 53 |
+
if possible_values:
|
| 54 |
+
current_vlr_comissao = possible_values[-1] # Last monetary value that isn't 0,00 or 50,00
|
| 55 |
# Swap comma and dot while capturing the value
|
| 56 |
current_vlr_comissao = current_vlr_comissao.replace(".", "X").replace(",", ".").replace("X", ",")
|
| 57 |
|
|
|
|
| 76 |
# Convert extracted data into a DataFrame
|
| 77 |
df_final = pd.DataFrame(structured_data, columns=["COTA", "NOME DO CLIENTE", "VLR.COMISSAO", "DT VENDA"])
|
| 78 |
|
| 79 |
+
# Ensure correct Brazilian formatting for VLR.COMISSAO
|
| 80 |
+
df_final["VLR.COMISSAO"] = df_final["VLR.COMISSAO"].str.replace(",", "X", regex=False).str.replace(".", ",",
|
| 81 |
+
regex=False).str.replace(
|
| 82 |
+
"X", ".", regex=False)
|
| 83 |
|
| 84 |
# Update the NOME DO CLIENTE column with the required pattern
|
| 85 |
df_final["NOME DO CLIENTE"] = df_final.apply(
|
| 86 |
+
lambda row: f'CLIENTE: {row["NOME DO CLIENTE"]} COTA: {row["COTA"][:4]} GRUPO: {row["COTA"][5:]}' if pd.notna(
|
| 87 |
+
row["COTA"]) else row["NOME DO CLIENTE"],
|
| 88 |
+
axis=1
|
| 89 |
)
|
| 90 |
|
| 91 |
+
# Generate dynamic output filename
|
| 92 |
+
output_excel_path = f"{uploaded_filename}_final_{timestamp}.xlsx"
|
| 93 |
+
|
| 94 |
df_final.to_excel(output_excel_path, index=False)
|
| 95 |
|
| 96 |
return df_final, output_excel_path
|
| 97 |
|
| 98 |
+
|
| 99 |
# Gradio UI
|
| 100 |
with gr.Blocks() as app:
|
| 101 |
gr.Markdown("## 📄 PDF to Excel Converter - Brazilian Format")
|
| 102 |
+
gr.Markdown(
|
| 103 |
+
"Upload a PDF file containing financial data, and receive a properly formatted Excel file for download.")
|
| 104 |
|
| 105 |
with gr.Row():
|
| 106 |
pdf_input = gr.File(label="Upload your PDF file")
|