Update app.py
Browse files
app.py
CHANGED
|
@@ -20,29 +20,29 @@ def extract_text_from_pdf(pdf_file):
|
|
| 20 |
return text
|
| 21 |
|
| 22 |
|
| 23 |
-
def
|
| 24 |
"""
|
| 25 |
-
|
| 26 |
Args:
|
| 27 |
lines (list): List of text lines from the PDF.
|
| 28 |
Returns:
|
| 29 |
-
list:
|
| 30 |
"""
|
| 31 |
-
|
| 32 |
-
current_row =
|
| 33 |
|
| 34 |
for line in lines:
|
| 35 |
-
if re.match(r"^\d+\s", line): # If
|
| 36 |
if current_row:
|
| 37 |
-
|
| 38 |
-
current_row =
|
| 39 |
else:
|
| 40 |
-
current_row
|
| 41 |
|
| 42 |
if current_row:
|
| 43 |
-
|
| 44 |
|
| 45 |
-
return
|
| 46 |
|
| 47 |
|
| 48 |
def parse_po_items(rows):
|
|
@@ -51,10 +51,9 @@ def parse_po_items(rows):
|
|
| 51 |
Args:
|
| 52 |
rows (list): List of reconstructed rows.
|
| 53 |
Returns:
|
| 54 |
-
|
| 55 |
"""
|
| 56 |
data = []
|
| 57 |
-
|
| 58 |
for row in rows:
|
| 59 |
try:
|
| 60 |
# Match ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE
|
|
@@ -74,7 +73,7 @@ def parse_po_items(rows):
|
|
| 74 |
}
|
| 75 |
)
|
| 76 |
else:
|
| 77 |
-
print(f"Skipped row: {row}") #
|
| 78 |
except Exception as e:
|
| 79 |
print(f"Error parsing row: {row}, Error: {e}")
|
| 80 |
|
|
@@ -109,8 +108,8 @@ def process_pdf(file):
|
|
| 109 |
text = extract_text_from_pdf(file)
|
| 110 |
# Split text into lines
|
| 111 |
lines = text.splitlines()
|
| 112 |
-
#
|
| 113 |
-
rows =
|
| 114 |
# Parse reconstructed rows
|
| 115 |
df, status = parse_po_items(rows)
|
| 116 |
if df is not None:
|
|
|
|
| 20 |
return text
|
| 21 |
|
| 22 |
|
| 23 |
+
def preprocess_lines(lines):
    """
    Combines multi-line rows into single rows for better parsing.

    A line that starts with an item number (one or more digits followed by
    whitespace) begins a new row; every other non-blank line is treated as a
    continuation of the row currently being built. Non-blank text that
    appears before the first item line is emitted as its own row.

    Args:
        lines (list): List of text lines from the PDF.
    Returns:
        list: Preprocessed list of single-row strings. Fragments are stripped
            and joined with a single space; blank lines never produce an
            empty row or extra interior whitespace.
    """
    combined_rows = []
    current_parts = []  # Stripped fragments of the row being assembled

    for line in lines:
        # Test the raw line so the item-number rule is unchanged, then
        # normalise surrounding whitespace before storing the fragment.
        starts_item = re.match(r"^\d+\s", line) is not None
        fragment = line.strip()

        if starts_item:
            if current_parts:
                combined_rows.append(" ".join(current_parts))
            current_parts = [fragment]
        elif fragment:
            # Blank/whitespace-only lines are dropped: they would otherwise
            # create empty rows or double spaces inside a row.
            current_parts.append(fragment)

    if current_parts:
        combined_rows.append(" ".join(current_parts))

    return combined_rows
|
| 46 |
|
| 47 |
|
| 48 |
def parse_po_items(rows):
|
|
|
|
| 51 |
Args:
|
| 52 |
rows (list): List of reconstructed rows.
|
| 53 |
Returns:
|
| 54 |
+
tuple: DataFrame with extracted data and a status message.
|
| 55 |
"""
|
| 56 |
data = []
|
|
|
|
| 57 |
for row in rows:
|
| 58 |
try:
|
| 59 |
# Match ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE
|
|
|
|
| 73 |
}
|
| 74 |
)
|
| 75 |
else:
|
| 76 |
+
print(f"Skipped row: {row}") # Log skipped rows
|
| 77 |
except Exception as e:
|
| 78 |
print(f"Error parsing row: {row}, Error: {e}")
|
| 79 |
|
|
|
|
| 108 |
text = extract_text_from_pdf(file)
|
| 109 |
# Split text into lines
|
| 110 |
lines = text.splitlines()
|
| 111 |
+
# Preprocess lines to reconstruct rows
|
| 112 |
+
rows = preprocess_lines(lines)
|
| 113 |
# Parse reconstructed rows
|
| 114 |
df, status = parse_po_items(rows)
|
| 115 |
if df is not None:
|