Spaces:

llatos
/

unimoConvertPdfToExcel

Paused

App Files Community

llatos committed on Mar 11, 2025

Commit

dd217a7

verified ·

1 Parent(s): 7a34ad6

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

.idea/workspace.xml +1 -1
Fechamento 0602 Preenchido_final_20250311_152105.xlsx +0 -0
main.py +31 -12

.idea/workspace.xml CHANGED Viewed

@@ -49,7 +49,7 @@
       <option name="number" value="Default" />
       <option name="presentableId" value="Default" />
       <updated>1741724175531</updated>
-      <workItem from="1741724176678" duration="2449000" />
     </task>
     <task id="LOCAL-00001" summary="Update project configuration and dependencies&#10;&#10;Updated the Python SDK configuration in the `.iml` file to use Python 3.12. Enabled app sharing in `main.py` by setting `share=True` in the Gradio app launch. Added a `requirements.txt` file to document and manage project dependencies.">
       <option name="closed" value="true" />

       <option name="number" value="Default" />
       <option name="presentableId" value="Default" />
       <updated>1741724175531</updated>
+      <workItem from="1741724176678" duration="3053000" />
     </task>
     <task id="LOCAL-00001" summary="Update project configuration and dependencies&#10;&#10;Updated the Python SDK configuration in the `.iml` file to use Python 3.12. Enabled app sharing in `main.py` by setting `share=True` in the Gradio app launch. Added a `requirements.txt` file to document and manage project dependencies.">
       <option name="closed" value="true" />

Fechamento 0602 Preenchido_final_20250311_152105.xlsx ADDED Viewed

Binary file (6.14 kB). View file

main.py CHANGED Viewed

@@ -2,8 +2,16 @@ import gradio as gr
 import pdfplumber
 import pandas as pd
 import re
 def process_pdf(pdf_file):
     # Open the uploaded PDF file
     expanded_sample_lines = []
     with pdfplumber.open(pdf_file.name) as pdf:
@@ -26,20 +34,24 @@ def process_pdf(pdf_file):
         second_line = expanded_sample_lines[i + 1].strip()
         # Identify COTA and NOME DO CLIENTE (first line)
-        if re.match(r"\d{4}\.\d{4}\.\d", first_line):
             parts = first_line.split(" E ", 1)  # Splitting at " E " to separate COTA and Name
             if len(parts) == 2:
-                current_cota = parts[0].strip()
                 current_nome_cliente = parts[1].split(" /")[0].strip()  # Extract name before slash "/"
             continue
-        # Identify VLR.COMISSAO from rows starting with 8-digit values (e.g., 20433I08)
         if re.match(r"\d{5}[A-Z]\d{2}", first_line):  # Matches patterns like 20433I08
             first_parts = first_line.split()
-            # Ensure line has enough elements
-            if len(first_parts) >= 7:
-                current_vlr_comissao = first_parts[6]  # Extract commission value
                 # Swap comma and dot while capturing the value
                 current_vlr_comissao = current_vlr_comissao.replace(".", "X").replace(",", ".").replace("X", ",")
@@ -64,24 +76,31 @@ def process_pdf(pdf_file):
     # Convert extracted data into a DataFrame
     df_final = pd.DataFrame(structured_data, columns=["COTA", "NOME DO CLIENTE", "VLR.COMISSAO", "DT VENDA"])
-    # Ensure correct Brazilian formatting
-    df_final["VLR.COMISSAO"] = df_final["VLR.COMISSAO"].str.replace(",", "X", regex=False).str.replace(".", ",", regex=False).str.replace("X", ".", regex=False)
     # Update the NOME DO CLIENTE column with the required pattern
     df_final["NOME DO CLIENTE"] = df_final.apply(
-        lambda row: f'CLIENTE: {row["NOME DO CLIENTE"]} COTA: {row["COTA"][:4]} GRUPO: {row["COTA"][5:]}', axis=1
     )
-    # Save the correctly formatted file
-    output_excel_path = "final_filtered_output_brazilian.xlsx"
     df_final.to_excel(output_excel_path, index=False)
     return df_final, output_excel_path
 # Gradio UI
 with gr.Blocks() as app:
     gr.Markdown("## 📄 PDF to Excel Converter - Brazilian Format")
-    gr.Markdown("Upload a PDF file containing financial data, and receive a properly formatted Excel file for download.")
     with gr.Row():
         pdf_input = gr.File(label="Upload your PDF file")

 import pdfplumber
 import pandas as pd
 import re
+import datetime
 def process_pdf(pdf_file):
+    # Extract the original PDF filename
+    uploaded_filename = pdf_file.name.split("/")[-1].replace(".pdf", "")
+    # Generate timestamp
+    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
     # Open the uploaded PDF file
     expanded_sample_lines = []
     with pdfplumber.open(pdf_file.name) as pdf:
         second_line = expanded_sample_lines[i + 1].strip()
         # Identify COTA and NOME DO CLIENTE (first line)
+        cota_match = re.match(r"(\d{4}\.\d{4}\.\d)\s*(D|E)?", first_line)
+        if cota_match:
+            current_cota = cota_match.group(1)  # Extract COTA without 'D' or 'E'
             parts = first_line.split(" E ", 1)  # Splitting at " E " to separate COTA and Name
             if len(parts) == 2:
                 current_nome_cliente = parts[1].split(" /")[0].strip()  # Extract name before slash "/"
             continue
+        # Identify VLR.COMISSAO dynamically from rows starting with 8-character codes: 5 digits, a letter, 2 digits (e.g., 20433I08)
         if re.match(r"\d{5}[A-Z]\d{2}", first_line):  # Matches patterns like 20433I08
             first_parts = first_line.split()
+            # Locate the VLR.COMISSAO dynamically by looking for monetary values that are NOT 0,00 or 50,00
+            possible_values = [val for val in first_parts if
+                               re.match(r"^\d{1,3}(\.\d{3})*,\d{2}$", val) and val not in ["0,00", "50,00"]]
+            if possible_values:
+                current_vlr_comissao = possible_values[-1]  # Last monetary value that isn't 0,00 or 50,00
                 # Swap comma and dot while capturing the value
                 current_vlr_comissao = current_vlr_comissao.replace(".", "X").replace(",", ".").replace("X", ",")
     # Convert extracted data into a DataFrame
     df_final = pd.DataFrame(structured_data, columns=["COTA", "NOME DO CLIENTE", "VLR.COMISSAO", "DT VENDA"])
+    # Ensure correct Brazilian formatting for VLR.COMISSAO
+    df_final["VLR.COMISSAO"] = df_final["VLR.COMISSAO"].str.replace(",", "X", regex=False).str.replace(".", ",",
+                                                                                                       regex=False).str.replace(
+        "X", ".", regex=False)
     # Update the NOME DO CLIENTE column with the required pattern
     df_final["NOME DO CLIENTE"] = df_final.apply(
+        lambda row: f'CLIENTE: {row["NOME DO CLIENTE"]} COTA: {row["COTA"][:4]} GRUPO: {row["COTA"][5:]}' if pd.notna(
+            row["COTA"]) else row["NOME DO CLIENTE"],
+        axis=1
     )
+    # Generate dynamic output filename
+    output_excel_path = f"{uploaded_filename}_final_{timestamp}.xlsx"
     df_final.to_excel(output_excel_path, index=False)
     return df_final, output_excel_path
 # Gradio UI
 with gr.Blocks() as app:
     gr.Markdown("## 📄 PDF to Excel Converter - Brazilian Format")
+    gr.Markdown(
+        "Upload a PDF file containing financial data, and receive a properly formatted Excel file for download.")
     with gr.Row():
         pdf_input = gr.File(label="Upload your PDF file")