Spaces:

dschandra
/

ALNISFPO

Sleeping

App Files Files Community

dschandra committed on Dec 3, 2024

Commit

4ad8626

verified ·

1 Parent(s): 223273b

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -26

app.py CHANGED Viewed

@@ -20,47 +20,47 @@ def extract_text_from_pdf(pdf_file):
     return text
def preprocess_lines(lines):
    """
    Combines multi-line rows into single rows for better parsing.

    A line whose first non-blank token is an item number (digits followed by
    whitespace) starts a new row. Every other non-blank line is treated as a
    continuation of the row currently being built. Lines that appear before
    the first item number are grouped into a row of their own.

    Args:
        lines (list): List of text lines from the PDF.

    Returns:
        list: Preprocessed list of single-row strings, each fragment joined
            with a single space.
    """
    combined_lines = []
    fragments = []
    for line in lines:
        text = line.strip()
        if not text:
            # Blank lines carry no data; keeping them would inject doubled
            # spaces into a row and turn whitespace-only input into a
            # spurious empty row.
            continue
        # Allow leading indentation before the item number: extracted PDF
        # text is often indented, and testing the raw line would misread
        # such an item as a continuation of the previous row.
        if re.match(r"^\s*\d+\s", line):
            if fragments:
                combined_lines.append(" ".join(fragments))
            fragments = [text]
        else:
            fragments.append(text)
    if fragments:
        combined_lines.append(" ".join(fragments))
    return combined_lines
-def parse_po_items(lines):
     """
-    Parses purchase order items from the text.
     Args:
-        lines (list): List of combined single-row strings.
     Returns:
         DataFrame: Extracted purchase order data.
     """
     data = []
-    for line in lines:
         try:
-            # Extract ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE
             match = re.match(
-                r"^(?P<Item>\d+)\s+(?P<Description>.+?)\s+(?P<Qty>\d+)\s+(?P<Unit>\S+)\s+(?P<UnitPrice>\d+\.\d+)\s+(?P<TotalPrice>\d+\.\d+)$",
-                line,
             )
             if match:
                 data.append(
@@ -74,9 +74,9 @@ def parse_po_items(lines):
                     }
                 )
             else:
-                print(f"Skipped line: {line}")  # Log skipped lines
         except Exception as e:
-            print(f"Error parsing line: {line}, Error: {e}")
     if not data:
         return None, "No valid data found in the provided text."
@@ -107,10 +107,12 @@ def process_pdf(file):
     try:
         # Extract text from the uploaded PDF
         text = extract_text_from_pdf(file)
-        # Preprocess the lines
-        lines = preprocess_lines(text.splitlines())
-        # Parse purchase order items
-        df, status = parse_po_items(lines)
         if df is not None:
             output_path = save_to_excel(df)
             return output_path, status

     return text
def reconstruct_rows(lines):
    """
    Reconstructs rows by grouping multi-line descriptions into single rows.

    A line whose first non-blank token is an item number (digits followed by
    whitespace) starts a new row. Every other non-blank line is treated as a
    continuation of the row currently being built. Lines that appear before
    the first item number are grouped into a row of their own.

    Args:
        lines (list): List of text lines from the PDF.

    Returns:
        list: List of reconstructed rows, each a single space-joined string.
    """
    rows = []
    current_row = []
    for line in lines:
        text = line.strip()
        if not text:
            # Blank lines carry no data; keeping them would inject empty
            # fragments (doubled spaces) into a row and turn whitespace-only
            # input into a spurious empty row.
            continue
        # Allow leading indentation before the item number: extracted PDF
        # text is often indented, and testing the raw line would misread
        # such an item as a continuation of the previous row.
        if re.match(r"^\s*\d+\s", line):
            if current_row:
                rows.append(" ".join(current_row))  # Flush the finished row
            current_row = [text]  # Start a new row
        else:
            current_row.append(text)  # Continuation of the current row
    if current_row:
        rows.append(" ".join(current_row))  # Flush the last row
    return rows
+def parse_po_items(rows):
     """
+    Parses purchase order items from reconstructed rows.
     Args:
+        rows (list): List of reconstructed rows.
     Returns:
         DataFrame: Extracted purchase order data.
     """
     data = []
+    for row in rows:
         try:
+            # Match ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE
             match = re.match(
+                r"^(?P<Item>\d+)\s+(?P<Description>.+?)\s+(?P<Qty>\d+)\s+(?P<Unit>\S+)\s+(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$",
+                row,
             )
             if match:
                 data.append(
                     }
                 )
             else:
+                print(f"Skipped row: {row}")  # Debugging: Log skipped rows
         except Exception as e:
+            print(f"Error parsing row: {row}, Error: {e}")
     if not data:
         return None, "No valid data found in the provided text."
     try:
         # Extract text from the uploaded PDF
         text = extract_text_from_pdf(file)
+        # Split text into lines
+        lines = text.splitlines()
+        # Reconstruct rows
+        rows = reconstruct_rows(lines)
+        # Parse reconstructed rows
+        df, status = parse_po_items(rows)
         if df is not None:
             output_path = save_to_excel(df)
             return output_path, status