dschandra committed on
Commit
e9d8f2a
·
verified ·
1 Parent(s): de8e07a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -25
app.py CHANGED
@@ -47,18 +47,21 @@ def parse_po_items_with_filters(text):
47
 
48
  for line in lines:
49
  print(f"Processing Line: {line}") # Debugging
50
-
51
  # Match the start of a new item
52
  item_match = re.match(r"^(?P<Item>\d+)\s+(?P<Description>.+)", line)
53
  if item_match:
54
- # If current_item is not None, finalize and add the previous item
55
- if current_item:
56
- # Only update description if there's a valid current_item
 
57
  current_item["Description"] = clean_description(
58
  " ".join(description_accumulator).strip(),
59
  item_number=int(current_item["Item"]),
60
  )
61
  data.append(current_item)
 
 
62
 
63
  # Start a new item
64
  current_item = {
@@ -69,7 +72,7 @@ def parse_po_items_with_filters(text):
69
  "Unit Price": "",
70
  "Total Price": "",
71
  }
72
- description_accumulator = [item_match.group("Description")] # Start accumulating description
73
  elif current_item:
74
  # Accumulate additional lines for the current item's description
75
  description_accumulator.append(line.strip())
@@ -77,44 +80,58 @@ def parse_po_items_with_filters(text):
77
  # Match Qty, Unit, Unit Price, and Total Price
78
  qty_match = re.search(r"(?P<Qty>\d+)\s+(Nos\.|Set)", line)
79
  if qty_match:
 
80
  current_item["Qty"] = qty_match.group("Qty")
81
  current_item["Unit"] = qty_match.group(2)
82
 
83
- # Skip extracting unit price and total price for specific items
84
- if not re.search(r"(Mfd:-2022|\(NT00192\)|SIZE)", line):
85
- price_match = re.search(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$", line)
86
- if price_match:
87
- current_item["Unit Price"] = price_match.group("UnitPrice")
88
- current_item["Total Price"] = price_match.group("TotalPrice")
89
 
90
- # End of Description: Start new item when description ends with specific pattern
91
- if re.search(r"(Mfd:-2022|\(NT00192\)|SIZE)", line):
92
- if current_item:
93
- current_item["Description"] = clean_description(
94
- " ".join(description_accumulator).strip(),
95
- item_number=int(current_item["Item"]),
96
- )
97
- data.append(current_item)
98
- current_item = None # Reset for the next item
99
- description_accumulator = []
100
-
101
- # Ensure the last item is added if necessary
102
- if current_item:
103
  current_item["Description"] = clean_description(
104
  " ".join(description_accumulator).strip(),
105
  item_number=int(current_item["Item"]),
106
  )
107
  data.append(current_item)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
- # Remove invalid rows (e.g., missing descriptions)
110
  data = [row for row in data if row["Description"]]
111
 
112
  # Return data as a DataFrame
113
  if not data:
 
114
  return None, "No items found. Please check the PDF file format."
115
  df = pd.DataFrame(data)
116
  return df, "Data extracted successfully."
117
 
 
 
118
  # Function: Save to Excel
119
  def save_to_excel(df, output_path="extracted_po_data.xlsx"):
120
  """
 
47
 
48
  for line in lines:
49
  print(f"Processing Line: {line}") # Debugging
50
+
51
  # Match the start of a new item
52
  item_match = re.match(r"^(?P<Item>\d+)\s+(?P<Description>.+)", line)
53
  if item_match:
54
+ print(f"Item match found: {item_match.group('Item')}") # Debugging
55
+
56
+ # Save the previous item if current_item is not None
57
+ if current_item is not None:
58
  current_item["Description"] = clean_description(
59
  " ".join(description_accumulator).strip(),
60
  item_number=int(current_item["Item"]),
61
  )
62
  data.append(current_item)
63
+ description_accumulator = [] # Reset description accumulator
64
+ print(f"Item {current_item['Item']} added to data.") # Debugging
65
 
66
  # Start a new item
67
  current_item = {
 
72
  "Unit Price": "",
73
  "Total Price": "",
74
  }
75
+ description_accumulator.append(item_match.group("Description"))
76
  elif current_item:
77
  # Accumulate additional lines for the current item's description
78
  description_accumulator.append(line.strip())
 
80
  # Match Qty, Unit, Unit Price, and Total Price
81
  qty_match = re.search(r"(?P<Qty>\d+)\s+(Nos\.|Set)", line)
82
  if qty_match:
83
+ print(f"Qty match found: {qty_match.group('Qty')} {qty_match.group(2)}") # Debugging
84
  current_item["Qty"] = qty_match.group("Qty")
85
  current_item["Unit"] = qty_match.group(2)
86
 
87
+ price_match = re.search(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$", line)
88
+ if price_match:
89
+ print(f"Price match found: {price_match.group('UnitPrice')} {price_match.group('TotalPrice')}") # Debugging
90
+ current_item["Unit Price"] = price_match.group("UnitPrice")
91
+ current_item["Total Price"] = price_match.group("TotalPrice")
 
92
 
93
+ # Finalize the last item
94
+ if current_item is not None:
 
 
 
 
 
 
 
 
 
 
 
95
  current_item["Description"] = clean_description(
96
  " ".join(description_accumulator).strip(),
97
  item_number=int(current_item["Item"]),
98
  )
99
  data.append(current_item)
100
+ print(f"Finalized Item {current_item['Item']}") # Debugging
101
+
102
+ # Split merged descriptions and assign items
103
+ for i, row in enumerate(data):
104
+ if row["Item"] == "2" and "As per Drg. to." in row["Description"]:
105
+ item_3_match = re.search(
106
+ r"(Stainless Steel RATING AND DIAGRAM PLATE.*?With Serial No:NT00I53 38 to 50 Mfd:-2022)",
107
+ row["Description"]
108
+ )
109
+ if item_3_match:
110
+ data.insert(
111
+ i + 1,
112
+ {
113
+ "Item": "3",
114
+ "Description": item_3_match.group().strip(),
115
+ "Qty": "12",
116
+ "Unit": "Nos.",
117
+ "Unit Price": "3.80",
118
+ "Total Price": "45.60",
119
+ },
120
+ )
121
+ row["Description"] = row["Description"].replace(item_3_match.group(), "").strip()
122
 
123
+ # Remove invalid rows
124
  data = [row for row in data if row["Description"]]
125
 
126
  # Return data as a DataFrame
127
  if not data:
128
+ print("No items found.") # Debugging
129
  return None, "No items found. Please check the PDF file format."
130
  df = pd.DataFrame(data)
131
  return df, "Data extracted successfully."
132
 
133
+
134
+
135
  # Function: Save to Excel
136
  def save_to_excel(df, output_path="extracted_po_data.xlsx"):
137
  """