dschandra committed on
Commit
4ad8626
·
verified ·
1 Parent(s): 223273b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -26
app.py CHANGED
@@ -20,47 +20,47 @@ def extract_text_from_pdf(pdf_file):
20
  return text
21
 
22
 
23
def preprocess_lines(lines):
    """
    Combines multi-line rows into single rows for better parsing.

    A line that begins with digits followed by whitespace starts a new row;
    every other non-blank line is folded into the row currently being built.
    Lines that appear before the first such line are grouped into a row of
    their own. Blank (empty or whitespace-only) lines are ignored so they
    neither create empty rows nor leave doubled spaces inside a merged row.

    Args:
        lines (list): List of text lines from the PDF.
    Returns:
        list: Preprocessed list of single-row strings.
    """
    combined_lines = []
    parts = []  # stripped fragments of the row currently being assembled

    for line in lines:
        fragment = line.strip()
        if not fragment:
            # Blank lines carry no data; folding them in would only add
            # stray whitespace (or an empty leading row).
            continue

        if re.match(r"^\d+\s", line):  # Detects a line starting with an ITEM number
            if parts:
                combined_lines.append(" ".join(parts))
            parts = [fragment]
        else:
            parts.append(fragment)

    if parts:
        combined_lines.append(" ".join(parts))

    return combined_lines
46
 
47
 
48
- def parse_po_items(lines):
49
  """
50
- Parses purchase order items from the text.
51
  Args:
52
- lines (list): List of combined single-row strings.
53
  Returns:
54
  DataFrame: Extracted purchase order data.
55
  """
56
  data = []
57
 
58
- for line in lines:
59
  try:
60
- # Extract ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE
61
  match = re.match(
62
- r"^(?P<Item>\d+)\s+(?P<Description>.+?)\s+(?P<Qty>\d+)\s+(?P<Unit>\S+)\s+(?P<UnitPrice>\d+\.\d+)\s+(?P<TotalPrice>\d+\.\d+)$",
63
- line,
64
  )
65
  if match:
66
  data.append(
@@ -74,9 +74,9 @@ def parse_po_items(lines):
74
  }
75
  )
76
  else:
77
- print(f"Skipped line: {line}") # Log skipped lines
78
  except Exception as e:
79
- print(f"Error parsing line: {line}, Error: {e}")
80
 
81
  if not data:
82
  return None, "No valid data found in the provided text."
@@ -107,10 +107,12 @@ def process_pdf(file):
107
  try:
108
  # Extract text from the uploaded PDF
109
  text = extract_text_from_pdf(file)
110
- # Preprocess the lines
111
- lines = preprocess_lines(text.splitlines())
112
- # Parse purchase order items
113
- df, status = parse_po_items(lines)
 
 
114
  if df is not None:
115
  output_path = save_to_excel(df)
116
  return output_path, status
 
20
  return text
21
 
22
 
23
def reconstruct_rows(lines):
    """
    Reconstructs rows by grouping multi-line descriptions into single rows.

    A line that begins with digits followed by whitespace opens a new row;
    any other non-blank line is treated as a continuation of the current row.
    Lines that precede the first such line are collected into a row of their
    own. Blank (empty or whitespace-only) lines are dropped: keeping them
    would leave a trailing space on the joined row, which breaks end-anchored
    patterns applied to the row later, and could emit an empty leading row.

    Args:
        lines (list): List of text lines from the PDF.
    Returns:
        list: List of reconstructed rows.
    """
    rows = []
    current_row = []

    def flush():
        # Emit the row being built, if any, as one space-joined string.
        if current_row:
            rows.append(" ".join(current_row))

    for line in lines:
        text = line.strip()
        if not text:
            continue  # Ignore blank lines entirely

        if re.match(r"^\d+\s", line):  # If the line starts with an item number
            flush()
            current_row = [text]  # Start a new row
        else:
            current_row.append(text)  # Append to the current row

    flush()  # Add the last row
    return rows
46
 
47
 
48
+ def parse_po_items(rows):
49
  """
50
+ Parses purchase order items from reconstructed rows.
51
  Args:
52
+ rows (list): List of reconstructed rows.
53
  Returns:
54
  DataFrame: Extracted purchase order data.
55
  """
56
  data = []
57
 
58
+ for row in rows:
59
  try:
60
+ # Match ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE
61
  match = re.match(
62
+ r"^(?P<Item>\d+)\s+(?P<Description>.+?)\s+(?P<Qty>\d+)\s+(?P<Unit>\S+)\s+(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$",
63
+ row,
64
  )
65
  if match:
66
  data.append(
 
74
  }
75
  )
76
  else:
77
+ print(f"Skipped row: {row}") # Debugging: Log skipped rows
78
  except Exception as e:
79
+ print(f"Error parsing row: {row}, Error: {e}")
80
 
81
  if not data:
82
  return None, "No valid data found in the provided text."
 
107
  try:
108
  # Extract text from the uploaded PDF
109
  text = extract_text_from_pdf(file)
110
+ # Split text into lines
111
+ lines = text.splitlines()
112
+ # Reconstruct rows
113
+ rows = reconstruct_rows(lines)
114
+ # Parse reconstructed rows
115
+ df, status = parse_po_items(rows)
116
  if df is not None:
117
  output_path = save_to_excel(df)
118
  return output_path, status