Spaces:

dschandra
/

ALNISFPO

Sleeping

dschandra committed on Dec 3, 2024

Commit

37c3cef

verified ·

1 Parent(s): 10eea43

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -19,6 +19,31 @@ def extract_text_from_pdf(pdf_file):
     return text
 def extract_po_data(text):
     """
     Extracts purchase order data from the text into structured rows with ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE.
@@ -27,7 +52,7 @@ def extract_po_data(text):
     Returns:
         tuple: A DataFrame containing structured data and a status message.
     """
-    lines = text.splitlines()
     data = []
     for line in lines:

     return text
+def preprocess_lines(lines):
+    """
+    Merges continuation lines into the preceding numbered line, giving one string per row.
+    Args:
+        lines (list): Text lines; a line beginning with digits followed by whitespace starts a new row.
+    Returns:
+        list: Stripped row strings; any lines before the first numbered line are merged into one leading row.
+    """
+    combined_lines = []  # Completed rows, in input order
+    temp_line = ""  # Row currently being assembled
+    for line in lines:
+        if re.match(r"^\d+\s+", line):  # Digits then whitespace at line start: a new row begins
+            if temp_line:  # Flush the row assembled so far before starting the next one
+                combined_lines.append(temp_line.strip())
+            temp_line = line  # Start a new row (stripped only when it is flushed)
+        else:
+            temp_line += " " + line.strip()  # Continuation: join to the current row with one space
+    if temp_line:  # Flush the final row, which no later numbered line would trigger
+        combined_lines.append(temp_line.strip())
+    return combined_lines
 def extract_po_data(text):
     """
     Extracts purchase order data from the text into structured rows with ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE.
     Returns:
         tuple: A DataFrame containing structured data and a status message.
     """
+    lines = preprocess_lines(text.splitlines())
     data = []
     for line in lines: