Spaces:

SathvikGanta
/

PO_Details_Extraction

Sleeping

App Files Files Community

SathvikGanta committed on Nov 13, 2024

Commit

9d67179

verified ·

1 Parent(s): 22ebb3e

Update parse_bhel_po.py

Browse files

Files changed (1) hide show

parse_bhel_po.py +30 -27

parse_bhel_po.py CHANGED Viewed

@@ -3,66 +3,70 @@ import pandas as pd
 import re
 def extract_po_details(file_obj):
-    data = []
     with pdfplumber.open(file_obj) as pdf:
         for page_number, page in enumerate(pdf.pages, start=1):
             text = page.extract_text()
             if not text:
                 print(f"[DEBUG] No text found on page {page_number}. Skipping.")
                 continue
-            print(f"[DEBUG] Processing text from page {page_number}")
-            # Split text by lines to process each line individually
-            lines = text.splitlines()
-            current_record = {}
             for line_number, line in enumerate(lines, start=1):
                 print(f"[DEBUG] Processing line {line_number} on page {page_number}: {line}")
-                # Identify 'Sl No'
                 sl_no_match = re.match(r'(\d+)\s+', line)
-                if sl_no_match and not current_record.get("Sl No"):
                     current_record["Sl No"] = sl_no_match.group(1)
-                # Match Material Description if not already matched
-                if 'Material Description' not in current_record:
-                    material_desc_match = re.search(r'BPS\s+\d+\s+Material Number:\s*\d+\s*HSN Code:\s*\d+\s*IGST\s*:\s*\d+%', line)
-                    if material_desc_match:
                         current_record["Material Description"] = material_desc_match.group(0)
                 # Match Unit
-                unit_match = re.search(r'\b(No|Kg|Pack)\b', line)
-                if unit_match and not current_record.get("Unit"):
-                    current_record["Unit"] = unit_match.group(0)
                 # Match Quantity
-                quantity_match = re.search(r'\bQuantity\s+(\d+)\b', line)
-                if quantity_match and not current_record.get("Quantity"):
                     current_record["Quantity"] = quantity_match.group(1)
                 # Match Delivery Quantity
-                dely_qty_match = re.search(r'\bDely Qty\s+(\d+)\b', line)
-                if dely_qty_match and not current_record.get("Dely Qty"):
                     current_record["Dely Qty"] = dely_qty_match.group(1)
                 # Match Delivery Date
-                dely_date_match = re.search(r'\d{2}\.\d{2}\.\d{4}', line)
-                if dely_date_match and not current_record.get("Dely Date"):
                     current_record["Dely Date"] = dely_date_match.group(0)
                 # Match Unit Rate
                 unit_rate_match = re.search(r'Unit Rate\s+([\d.]+)', line)
-                if unit_rate_match and not current_record.get("Unit Rate"):
                     current_record["Unit Rate"] = unit_rate_match.group(1)
                 # Match Value
                 value_match = re.search(r'Value\s+([\d.]+)', line)
-                if value_match and not current_record.get("Value"):
                     current_record["Value"] = value_match.group(1)
                 # Check if we have a complete record
                 if len(current_record) == 8:
                     data.append([
                         current_record["Sl No"],
                         current_record["Material Description"],
@@ -73,8 +77,7 @@ def extract_po_details(file_obj):
                         current_record["Unit Rate"],
                         current_record["Value"]
                     ])
-                    print(f"[DEBUG] Extracted record: {current_record}")
-                    current_record = {}  # Reset for the next record
     # Create DataFrame if data was extracted
     if data:

 import re
 def extract_po_details(file_obj):
+    data = []  # Store completed rows here
+    current_record = {}  # Temporary storage for each record until complete
     with pdfplumber.open(file_obj) as pdf:
         for page_number, page in enumerate(pdf.pages, start=1):
             text = page.extract_text()
             if not text:
                 print(f"[DEBUG] No text found on page {page_number}. Skipping.")
                 continue
+            print(f"[DEBUG] Processing text from page {page_number}")
+            lines = text.splitlines()  # Process line by line for better control
             for line_number, line in enumerate(lines, start=1):
                 print(f"[DEBUG] Processing line {line_number} on page {page_number}: {line}")
+                # Match Sl No (if it’s the start of a new item entry)
                 sl_no_match = re.match(r'(\d+)\s+', line)
+                if sl_no_match and "Sl No" not in current_record:
                     current_record["Sl No"] = sl_no_match.group(1)
+                # Material Description (Multi-line support)
+                material_desc_match = re.search(r'(BPS\s+\d+.*?HSN Code:\d+\s+IGST\s*:\s*\d+%)', line)
+                if material_desc_match:
+                    if "Material Description" not in current_record:
                         current_record["Material Description"] = material_desc_match.group(0)
+                    else:
+                        # Append if description spans multiple lines
+                        current_record["Material Description"] += " " + material_desc_match.group(0)
                 # Match Unit
+                if "Unit" not in current_record:
+                    unit_match = re.search(r'\b(No|Kg|Pack)\b', line)
+                    if unit_match:
+                        current_record["Unit"] = unit_match.group(0)
                 # Match Quantity
+                quantity_match = re.search(r'(\d+)\s+Quantity', line)
+                if quantity_match and "Quantity" not in current_record:
                     current_record["Quantity"] = quantity_match.group(1)
                 # Match Delivery Quantity
+                dely_qty_match = re.search(r'(\d+)\s+Dely Qty', line)
+                if dely_qty_match and "Dely Qty" not in current_record:
                     current_record["Dely Qty"] = dely_qty_match.group(1)
                 # Match Delivery Date
+                dely_date_match = re.search(r'(\d{2}\.\d{2}\.\d{4})', line)
+                if dely_date_match and "Dely Date" not in current_record:
                     current_record["Dely Date"] = dely_date_match.group(0)
                 # Match Unit Rate
                 unit_rate_match = re.search(r'Unit Rate\s+([\d.]+)', line)
+                if unit_rate_match and "Unit Rate" not in current_record:
                     current_record["Unit Rate"] = unit_rate_match.group(1)
                 # Match Value
                 value_match = re.search(r'Value\s+([\d.]+)', line)
+                if value_match and "Value" not in current_record:
                     current_record["Value"] = value_match.group(1)
                 # Check if we have a complete record
                 if len(current_record) == 8:
+                    print(f"[DEBUG] Complete record found: {current_record}")
                     data.append([
                         current_record["Sl No"],
                         current_record["Material Description"],
                         current_record["Unit Rate"],
                         current_record["Value"]
                     ])
+                    current_record = {}  # Reset for next record
     # Create DataFrame if data was extracted
     if data: