llatos committed on
Commit
dd217a7
·
verified ·
1 Parent(s): 7a34ad6

Upload folder using huggingface_hub

Browse files
.idea/workspace.xml CHANGED
@@ -49,7 +49,7 @@
49
  <option name="number" value="Default" />
50
  <option name="presentableId" value="Default" />
51
  <updated>1741724175531</updated>
52
- <workItem from="1741724176678" duration="2449000" />
53
  </task>
54
  <task id="LOCAL-00001" summary="Update project configuration and dependencies&#10;&#10;Updated the Python SDK configuration in the `.iml` file to use Python 3.12. Enabled app sharing in `main.py` by setting `share=True` in the Gradio app launch. Added a `requirements.txt` file to document and manage project dependencies.">
55
  <option name="closed" value="true" />
 
49
  <option name="number" value="Default" />
50
  <option name="presentableId" value="Default" />
51
  <updated>1741724175531</updated>
52
+ <workItem from="1741724176678" duration="3053000" />
53
  </task>
54
  <task id="LOCAL-00001" summary="Update project configuration and dependencies&#10;&#10;Updated the Python SDK configuration in the `.iml` file to use Python 3.12. Enabled app sharing in `main.py` by setting `share=True` in the Gradio app launch. Added a `requirements.txt` file to document and manage project dependencies.">
55
  <option name="closed" value="true" />
Fechamento 0602 Preenchido_final_20250311_152105.xlsx ADDED
Binary file (6.14 kB). View file
 
main.py CHANGED
@@ -2,8 +2,16 @@ import gradio as gr
2
  import pdfplumber
3
  import pandas as pd
4
  import re
 
 
5
 
6
  def process_pdf(pdf_file):
 
 
 
 
 
 
7
  # Open the uploaded PDF file
8
  expanded_sample_lines = []
9
  with pdfplumber.open(pdf_file.name) as pdf:
@@ -26,20 +34,24 @@ def process_pdf(pdf_file):
26
  second_line = expanded_sample_lines[i + 1].strip()
27
 
28
  # Identify COTA and NOME DO CLIENTE (first line)
29
- if re.match(r"\d{4}\.\d{4}\.\d", first_line):
 
 
30
  parts = first_line.split(" E ", 1) # Splitting at " E " to separate COTA and Name
31
  if len(parts) == 2:
32
- current_cota = parts[0].strip()
33
  current_nome_cliente = parts[1].split(" /")[0].strip() # Extract name before slash "/"
34
  continue
35
 
36
- # Identify VLR.COMISSAO from rows starting with 8-digit values (e.g., 20433I08)
37
  if re.match(r"\d{5}[A-Z]\d{2}", first_line): # Matches patterns like 20433I08
38
  first_parts = first_line.split()
39
 
40
- # Ensure line has enough elements
41
- if len(first_parts) >= 7:
42
- current_vlr_comissao = first_parts[6] # Extract commission value
 
 
 
43
  # Swap comma and dot while capturing the value
44
  current_vlr_comissao = current_vlr_comissao.replace(".", "X").replace(",", ".").replace("X", ",")
45
 
@@ -64,24 +76,31 @@ def process_pdf(pdf_file):
64
  # Convert extracted data into a DataFrame
65
  df_final = pd.DataFrame(structured_data, columns=["COTA", "NOME DO CLIENTE", "VLR.COMISSAO", "DT VENDA"])
66
 
67
- # Ensure correct Brazilian formatting
68
- df_final["VLR.COMISSAO"] = df_final["VLR.COMISSAO"].str.replace(",", "X", regex=False).str.replace(".", ",", regex=False).str.replace("X", ".", regex=False)
 
 
69
 
70
  # Update the NOME DO CLIENTE column with the required pattern
71
  df_final["NOME DO CLIENTE"] = df_final.apply(
72
- lambda row: f'CLIENTE: {row["NOME DO CLIENTE"]} COTA: {row["COTA"][:4]} GRUPO: {row["COTA"][5:]}', axis=1
 
 
73
  )
74
 
75
- # Save the correctly formatted file
76
- output_excel_path = "final_filtered_output_brazilian.xlsx"
 
77
  df_final.to_excel(output_excel_path, index=False)
78
 
79
  return df_final, output_excel_path
80
 
 
81
  # Gradio UI
82
  with gr.Blocks() as app:
83
  gr.Markdown("## 📄 PDF to Excel Converter - Brazilian Format")
84
- gr.Markdown("Upload a PDF file containing financial data, and receive a properly formatted Excel file for download.")
 
85
 
86
  with gr.Row():
87
  pdf_input = gr.File(label="Upload your PDF file")
 
2
  import pdfplumber
3
  import pandas as pd
4
  import re
5
+ import datetime
6
+
7
 
8
  def process_pdf(pdf_file):
9
+ # Extract the original PDF filename
10
+ uploaded_filename = pdf_file.name.split("/")[-1].replace(".pdf", "")
11
+
12
+ # Generate timestamp
13
+ timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
14
+
15
  # Open the uploaded PDF file
16
  expanded_sample_lines = []
17
  with pdfplumber.open(pdf_file.name) as pdf:
 
34
  second_line = expanded_sample_lines[i + 1].strip()
35
 
36
  # Identify COTA and NOME DO CLIENTE (first line)
37
+ cota_match = re.match(r"(\d{4}\.\d{4}\.\d)\s*(D|E)?", first_line)
38
+ if cota_match:
39
+ current_cota = cota_match.group(1) # Extract COTA without 'D' or 'E'
40
  parts = first_line.split(" E ", 1) # Splitting at " E " to separate COTA and Name
41
  if len(parts) == 2:
 
42
  current_nome_cliente = parts[1].split(" /")[0].strip() # Extract name before slash "/"
43
  continue
44
 
45
+ # Identify VLR.COMISSAO dynamically from rows starting with an 8-character code: 5 digits, a letter, 2 digits (e.g., 20433I08)
46
  if re.match(r"\d{5}[A-Z]\d{2}", first_line): # Matches patterns like 20433I08
47
  first_parts = first_line.split()
48
 
49
+ # Locate the VLR.COMISSAO dynamically by looking for monetary values that are NOT 0,00 or 50,00
50
+ possible_values = [val for val in first_parts if
51
+ re.match(r"^\d{1,3}(\.\d{3})*,\d{2}$", val) and val not in ["0,00", "50,00"]]
52
+
53
+ if possible_values:
54
+ current_vlr_comissao = possible_values[-1] # Last monetary value that isn't 0,00 or 50,00
55
  # Swap comma and dot while capturing the value
56
  current_vlr_comissao = current_vlr_comissao.replace(".", "X").replace(",", ".").replace("X", ",")
57
 
 
76
  # Convert extracted data into a DataFrame
77
  df_final = pd.DataFrame(structured_data, columns=["COTA", "NOME DO CLIENTE", "VLR.COMISSAO", "DT VENDA"])
78
 
79
+ # Ensure correct Brazilian formatting for VLR.COMISSAO
80
+ df_final["VLR.COMISSAO"] = df_final["VLR.COMISSAO"].str.replace(",", "X", regex=False).str.replace(".", ",",
81
+ regex=False).str.replace(
82
+ "X", ".", regex=False)
83
 
84
  # Update the NOME DO CLIENTE column with the required pattern
85
  df_final["NOME DO CLIENTE"] = df_final.apply(
86
+ lambda row: f'CLIENTE: {row["NOME DO CLIENTE"]} COTA: {row["COTA"][:4]} GRUPO: {row["COTA"][5:]}' if pd.notna(
87
+ row["COTA"]) else row["NOME DO CLIENTE"],
88
+ axis=1
89
  )
90
 
91
+ # Generate dynamic output filename
92
+ output_excel_path = f"{uploaded_filename}_final_{timestamp}.xlsx"
93
+
94
  df_final.to_excel(output_excel_path, index=False)
95
 
96
  return df_final, output_excel_path
97
 
98
+
99
  # Gradio UI
100
  with gr.Blocks() as app:
101
  gr.Markdown("## 📄 PDF to Excel Converter - Brazilian Format")
102
+ gr.Markdown(
103
+ "Upload a PDF file containing financial data, and receive a properly formatted Excel file for download.")
104
 
105
  with gr.Row():
106
  pdf_input = gr.File(label="Upload your PDF file")