Spaces:

dschandra
/

ALNISFPO

Sleeping

App Files Files Community

dschandra committed on Dec 3, 2024

Commit

a537fa5

verified ·

1 Parent(s): 4ad8626

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -16

app.py CHANGED Viewed

@@ -20,29 +20,29 @@ def extract_text_from_pdf(pdf_file):
     return text
-def reconstruct_rows(lines):
     """
-    Reconstructs rows by grouping multi-line descriptions into single rows.
     Args:
         lines (list): List of text lines from the PDF.
     Returns:
-        list: List of reconstructed rows.
     """
-    rows = []
-    current_row = []
     for line in lines:
-        if re.match(r"^\d+\s", line):  # If the line starts with an item number
             if current_row:
-                rows.append(" ".join(current_row))  # Add the current row
-            current_row = [line.strip()]  # Start a new row
         else:
-            current_row.append(line.strip())  # Append to the current row
     if current_row:
-        rows.append(" ".join(current_row))  # Add the last row
-    return rows
 def parse_po_items(rows):
@@ -51,10 +51,9 @@ def parse_po_items(rows):
     Args:
         rows (list): List of reconstructed rows.
     Returns:
-        DataFrame: Extracted purchase order data.
     """
     data = []
     for row in rows:
         try:
             # Match ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE
@@ -74,7 +73,7 @@ def parse_po_items(rows):
                     }
                 )
             else:
-                print(f"Skipped row: {row}")  # Debugging: Log skipped rows
         except Exception as e:
             print(f"Error parsing row: {row}, Error: {e}")
@@ -109,8 +108,8 @@ def process_pdf(file):
         text = extract_text_from_pdf(file)
         # Split text into lines
         lines = text.splitlines()
-        # Reconstruct rows
-        rows = reconstruct_rows(lines)
         # Parse reconstructed rows
         df, status = parse_po_items(rows)
         if df is not None:

     return text
+def preprocess_lines(lines):
     """
+    Combines multi-line rows into single rows for better parsing.
     Args:
         lines (list): List of text lines from the PDF.
     Returns:
+        list: Preprocessed list of single-row strings.
     """
+    combined_rows = []
+    current_row = ""
     for line in lines:
+        if re.match(r"^\d+\s", line):  # If line starts with an item number
             if current_row:
+                combined_rows.append(current_row.strip())
+            current_row = line
         else:
+            current_row += " " + line.strip()
     if current_row:
+        combined_rows.append(current_row.strip())
+    return combined_rows
 def parse_po_items(rows):
     Args:
         rows (list): List of reconstructed rows.
     Returns:
+        tuple: DataFrame with extracted data and a status message.
     """
     data = []
     for row in rows:
         try:
             # Match ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE
                     }
                 )
             else:
+                print(f"Skipped row: {row}")  # Log skipped rows
         except Exception as e:
             print(f"Error parsing row: {row}, Error: {e}")
         text = extract_text_from_pdf(file)
         # Split text into lines
         lines = text.splitlines()
+        # Preprocess lines to reconstruct rows
+        rows = preprocess_lines(lines)
         # Parse reconstructed rows
         df, status = parse_po_items(rows)
         if df is not None: