dschandra committed on
Commit
37c3cef
·
verified ·
1 Parent(s): 10eea43

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -1
app.py CHANGED
@@ -19,6 +19,31 @@ def extract_text_from_pdf(pdf_file):
19
  return text
20
 
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  def extract_po_data(text):
23
  """
24
  Extracts purchase order data from the text into structured rows with ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE.
@@ -27,7 +52,7 @@ def extract_po_data(text):
27
  Returns:
28
  tuple: A DataFrame containing structured data and a status message.
29
  """
30
- lines = text.splitlines()
31
  data = []
32
 
33
  for line in lines:
 
19
  return text
20
 
21
 
22
def preprocess_lines(lines):
    """
    Combines multi-line rows into single rows for better parsing.

    A line that starts with an item number (one or more digits followed by
    whitespace) opens a new row. Every other non-blank line is treated as a
    continuation and is joined to the current row with a single space.
    Lines that appear before the first item line are merged into one
    leading row.

    Args:
        lines (list): List of text lines from the PDF.
    Returns:
        list: Preprocessed list of single-row strings. Each row is stripped
            of surrounding whitespace; blank input lines contribute nothing
            and no empty rows are emitted.
    """
    combined_lines = []
    row_parts = []  # stripped fragments of the row currently being assembled

    for line in lines:
        fragment = line.strip()
        if not fragment:
            # Blank lines carry no data; skipping them avoids stray double
            # spaces inside a row and an empty row for all-blank input.
            continue

        if re.match(r"^\d+\s+", line):  # Starts with an item number
            if row_parts:
                combined_lines.append(" ".join(row_parts))
            row_parts = [fragment]  # Start a new row
        else:
            row_parts.append(fragment)  # Continuation of the current row

    # Flush the final row, which has no following item line to trigger it.
    if row_parts:
        combined_lines.append(" ".join(row_parts))

    return combined_lines
45
+
46
+
47
  def extract_po_data(text):
48
  """
49
  Extracts purchase order data from the text into structured rows with ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE.
 
52
  Returns:
53
  tuple: A DataFrame containing structured data and a status message.
54
  """
55
+ lines = preprocess_lines(text.splitlines())
56
  data = []
57
 
58
  for line in lines: