dschandra committed on
Commit
a537fa5
·
verified ·
1 Parent(s): 4ad8626

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -16
app.py CHANGED
@@ -20,29 +20,29 @@ def extract_text_from_pdf(pdf_file):
20
  return text
21
 
22
 
23
def reconstruct_rows(lines):
    """
    Merge wrapped PDF text lines so that each item occupies one row.

    A line that begins with digits followed by whitespace opens a new row;
    every other line is treated as a continuation of the row in progress.
    Lines seen before the first item line are collected into a row of
    their own.

    Args:
        lines (list): List of text lines from the PDF.
    Returns:
        list: One space-joined string per reconstructed row.
    """
    item_start = re.compile(r"^\d+\s")
    groups = []  # each entry holds the trimmed fragments of one row

    for raw in lines:
        fragment = raw.strip()
        # Open a new group on an item line, or when nothing has been
        # collected yet (leading non-item text still forms a row).
        if item_start.match(raw) or not groups:
            groups.append([fragment])
        else:
            groups[-1].append(fragment)

    return [" ".join(fragments) for fragments in groups]
46
 
47
 
48
  def parse_po_items(rows):
@@ -51,10 +51,9 @@ def parse_po_items(rows):
51
  Args:
52
  rows (list): List of reconstructed rows.
53
  Returns:
54
- DataFrame: Extracted purchase order data.
55
  """
56
  data = []
57
-
58
  for row in rows:
59
  try:
60
  # Match ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE
@@ -74,7 +73,7 @@ def parse_po_items(rows):
74
  }
75
  )
76
  else:
77
- print(f"Skipped row: {row}") # Debugging: Log skipped rows
78
  except Exception as e:
79
  print(f"Error parsing row: {row}, Error: {e}")
80
 
@@ -109,8 +108,8 @@ def process_pdf(file):
109
  text = extract_text_from_pdf(file)
110
  # Split text into lines
111
  lines = text.splitlines()
112
- # Reconstruct rows
113
- rows = reconstruct_rows(lines)
114
  # Parse reconstructed rows
115
  df, status = parse_po_items(rows)
116
  if df is not None:
 
20
  return text
21
 
22
 
23
def preprocess_lines(lines):
    """
    Fold multi-line table rows from PDF text into single-row strings.

    A line that begins with digits followed by whitespace opens a new row;
    every other line is trimmed and appended, space-separated, to the row
    in progress. Lines seen before the first item line are gathered into a
    row of their own. Each finished row is stripped of surrounding
    whitespace.

    Args:
        lines (list): List of text lines from the PDF.
    Returns:
        list: Preprocessed list of single-row strings.
    """
    starts_item = re.compile(r"^\d+\s").match
    buckets = []  # each entry holds the fragments that make up one row

    for text in lines:
        if starts_item(text):
            # The item line itself is kept untrimmed; only the finished
            # row is stripped.
            buckets.append([text])
        elif buckets:
            buckets[-1].append(text.strip())
        else:
            # Leading non-item text: the empty first fragment reproduces
            # the separator that precedes the first continuation.
            buckets.append(["", text.strip()])

    return [" ".join(fragments).strip() for fragments in buckets]
46
 
47
 
48
  def parse_po_items(rows):
 
51
  Args:
52
  rows (list): List of reconstructed rows.
53
  Returns:
54
+ tuple: DataFrame with extracted data and a status message.
55
  """
56
  data = []
 
57
  for row in rows:
58
  try:
59
  # Match ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE
 
73
  }
74
  )
75
  else:
76
+ print(f"Skipped row: {row}") # Log skipped rows
77
  except Exception as e:
78
  print(f"Error parsing row: {row}, Error: {e}")
79
 
 
108
  text = extract_text_from_pdf(file)
109
  # Split text into lines
110
  lines = text.splitlines()
111
+ # Preprocess lines to reconstruct rows
112
+ rows = preprocess_lines(lines)
113
  # Parse reconstructed rows
114
  df, status = parse_po_items(rows)
115
  if df is not None: