dschandra committed on
Commit
62e4c88
·
verified ·
1 Parent(s): a715551

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -10
app.py CHANGED
@@ -3,44 +3,58 @@ import pandas as pd
3
  import re
4
  import gradio as gr
5
 
 
6
  # Function: Extract Text from PDF
7
  def extract_text_from_pdf(pdf_file):
8
  with pdfplumber.open(pdf_file.name) as pdf:
9
  text = ""
10
  for page in pdf.pages:
11
  text += page.extract_text()
12
- print("\nExtracted Text:\n", text) # Debugging: Print extracted text
13
  return text
14
 
 
15
  # Function: Clean Description
16
  def clean_description(description, item_number=None):
17
- description = re.sub(r"\d+\s+(Nos\.|Set)\s+[\d.]+\s+[\d.]+", "", description) # Remove Qty + Unit + Price
 
 
18
  description = re.sub(r"Page \d+ of \d+.*", "", description) # Remove page references
19
- description = re.sub(r"\(Q\. No:.*?\)", "", description) # Remove Q.No-related data
20
  description = re.sub(r"TOTAL EX-WORK.*", "", description) # Remove EX-WORK-related text
21
  description = re.sub(r"NOTES:.*", "", description) # Remove notes section
22
  description = re.sub(r"HS CODE.*", "", description) # Remove HS CODE-related data
23
  description = re.sub(r"DELIVERY:.*", "", description) # Remove delivery instructions
 
 
 
24
  return description.strip()
25
 
 
26
  # Function: Parse PO Items with Filters
27
  def parse_po_items_with_filters(text):
 
 
 
 
28
  lines = text.splitlines()
29
  data = []
30
- current_item = {}
31
  description_accumulator = []
32
 
33
  for line in lines:
34
- print(f"Processing Line: {line}") # Debugging
35
- item_match = re.match(r"^\s*(?P<Item>\d+)\s+(?P<Description>.+)", line)
36
  if item_match:
 
37
  if current_item:
38
  current_item["Description"] = clean_description(
39
- " ".join(description_accumulator).strip(), item_number=int(current_item["Item"])
 
40
  )
41
  data.append(current_item)
42
  description_accumulator = []
43
 
 
44
  current_item = {
45
  "Item": item_match.group("Item"),
46
  "Description": "",
@@ -51,8 +65,10 @@ def parse_po_items_with_filters(text):
51
  }
52
  description_accumulator.append(item_match.group("Description"))
53
  elif current_item:
 
54
  description_accumulator.append(line.strip())
55
 
 
56
  qty_match = re.search(r"(?P<Qty>\d+)\s+(Nos\.|Set)", line)
57
  if qty_match:
58
  current_item["Qty"] = qty_match.group("Qty")
@@ -63,22 +79,63 @@ def parse_po_items_with_filters(text):
63
  current_item["Unit Price"] = price_match.group("UnitPrice")
64
  current_item["Total Price"] = price_match.group("TotalPrice")
65
 
 
66
  if current_item:
67
  current_item["Description"] = clean_description(
68
- " ".join(description_accumulator).strip(), item_number=int(current_item["Item"])
 
69
  )
70
  data.append(current_item)
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  if not data:
73
- print("No items found. Check PDF format.") # Debugging
74
  return None, "No items found. Please check the PDF file format."
75
- return pd.DataFrame(data), "Data extracted successfully."
 
 
76
 
77
  # Function: Save to Excel
78
  def save_to_excel(df, output_path="extracted_po_data.xlsx"):
79
  df.to_excel(output_path, index=False)
80
  return output_path
81
 
 
82
  # Gradio Interface Function
83
  def process_pdf(file):
84
  try:
@@ -91,6 +148,7 @@ def process_pdf(file):
91
  except Exception as e:
92
  return None, f"Error during processing: {str(e)}"
93
 
 
94
  # Gradio Interface Setup
95
  def create_gradio_interface():
96
  return gr.Interface(
@@ -104,6 +162,7 @@ def create_gradio_interface():
104
  description="Upload a Purchase Order PDF to extract items into an Excel file.",
105
  )
106
 
 
107
  if __name__ == "__main__":
108
  interface = create_gradio_interface()
109
  interface.launch()
 
3
  import re
4
  import gradio as gr
5
 
6
+
7
  # Function: Extract Text from PDF
8
  def extract_text_from_pdf(pdf_file):
9
  with pdfplumber.open(pdf_file.name) as pdf:
10
  text = ""
11
  for page in pdf.pages:
12
  text += page.extract_text()
13
+ print("\nExtracted Text:\n", text) # Debugging: Print the extracted text
14
  return text
15
 
16
+
17
  # Function: Clean Description
18
  def clean_description(description, item_number=None):
19
+ """
20
+ Cleans the description by removing unwanted data such as Qty, Unit, Unit Price, Total Price, and other invalid entries.
21
+ """
22
  description = re.sub(r"Page \d+ of \d+.*", "", description) # Remove page references
 
23
  description = re.sub(r"TOTAL EX-WORK.*", "", description) # Remove EX-WORK-related text
24
  description = re.sub(r"NOTES:.*", "", description) # Remove notes section
25
  description = re.sub(r"HS CODE.*", "", description) # Remove HS CODE-related data
26
  description = re.sub(r"DELIVERY:.*", "", description) # Remove delivery instructions
27
+ description = re.sub(r"\(Q\. No:.*?\)", "", description) # Remove Q.No-related data
28
+ if item_number == 7:
29
+ description = re.sub(r"300 Sets 4.20 1260.00", "", description) # Remove unwanted text in item 7
30
  return description.strip()
31
 
32
+
33
  # Function: Parse PO Items with Filters
34
  def parse_po_items_with_filters(text):
35
+ """
36
+ Parses purchase order items from the extracted text using regex with filters.
37
+ Ensures items are not merged and handles split descriptions across lines.
38
+ """
39
  lines = text.splitlines()
40
  data = []
41
+ current_item = None
42
  description_accumulator = []
43
 
44
  for line in lines:
45
+ # Match the start of an item row (strict boundary for items)
46
+ item_match = re.match(r"^(?P<Item>\d+)\s+(?P<Description>.+)", line)
47
  if item_match:
48
+ # Save the previous item
49
  if current_item:
50
  current_item["Description"] = clean_description(
51
+ " ".join(description_accumulator).strip(),
52
+ item_number=int(current_item["Item"]),
53
  )
54
  data.append(current_item)
55
  description_accumulator = []
56
 
57
+ # Start a new item
58
  current_item = {
59
  "Item": item_match.group("Item"),
60
  "Description": "",
 
65
  }
66
  description_accumulator.append(item_match.group("Description"))
67
  elif current_item:
68
+ # Accumulate additional lines for the current item's description
69
  description_accumulator.append(line.strip())
70
 
71
+ # Match Qty, Unit, Unit Price, and Total Price
72
  qty_match = re.search(r"(?P<Qty>\d+)\s+(Nos\.|Set)", line)
73
  if qty_match:
74
  current_item["Qty"] = qty_match.group("Qty")
 
79
  current_item["Unit Price"] = price_match.group("UnitPrice")
80
  current_item["Total Price"] = price_match.group("TotalPrice")
81
 
82
+ # Save the last item
83
  if current_item:
84
  current_item["Description"] = clean_description(
85
+ " ".join(description_accumulator).strip(),
86
+ item_number=int(current_item["Item"]),
87
  )
88
  data.append(current_item)
89
 
90
+ # Handle item 3 split from item 2
91
+ for i, row in enumerate(data):
92
+ if row["Item"] == "2" and "As per Drg. to." in row["Description"]:
93
+ item_3_description = re.search(r"As per Drg. to. G000810.*Mfd:-2022", row["Description"])
94
+ if item_3_description:
95
+ data.insert(
96
+ i + 1,
97
+ {
98
+ "Item": "3",
99
+ "Description": item_3_description.group(),
100
+ "Qty": "12",
101
+ "Unit": "Nos.",
102
+ "Unit Price": "3.80",
103
+ "Total Price": "45.60",
104
+ },
105
+ )
106
+ # Remove the extracted portion from item 2's description
107
+ row["Description"] = row["Description"].replace(item_3_description.group(), "").strip()
108
+
109
+ # Clean specific patterns from item 7
110
+ for item in data:
111
+ if item["Item"] == "7":
112
+ # Remove unwanted text from description
113
+ item["Description"] = re.sub(r"300 Sets 4.20 1260.00", "", item["Description"]).strip()
114
+ # Extract and assign unit price and total price if not already extracted
115
+ if not item["Unit Price"] and not item["Total Price"]:
116
+ price_match = re.search(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)", item["Description"])
117
+ if price_match:
118
+ item["Unit Price"] = price_match.group("UnitPrice")
119
+ item["Total Price"] = price_match.group("TotalPrice")
120
+ # Remove extracted price from description
121
+ item["Description"] = item["Description"].replace(price_match.group(0), "").strip()
122
+
123
+ # Remove empty descriptions or invalid rows
124
+ data = [row for row in data if row["Description"]]
125
+
126
+ # Return data as a DataFrame
127
  if not data:
 
128
  return None, "No items found. Please check the PDF file format."
129
+ df = pd.DataFrame(data)
130
+ return df, "Data extracted successfully."
131
+
132
 
133
  # Function: Save to Excel
134
  def save_to_excel(df, output_path="extracted_po_data.xlsx"):
135
  df.to_excel(output_path, index=False)
136
  return output_path
137
 
138
+
139
  # Gradio Interface Function
140
  def process_pdf(file):
141
  try:
 
148
  except Exception as e:
149
  return None, f"Error during processing: {str(e)}"
150
 
151
+
152
  # Gradio Interface Setup
153
  def create_gradio_interface():
154
  return gr.Interface(
 
162
  description="Upload a Purchase Order PDF to extract items into an Excel file.",
163
  )
164
 
165
+
166
  if __name__ == "__main__":
167
  interface = create_gradio_interface()
168
  interface.launch()