# Source: Hugging Face Space "SathvikGanta/PO_Details_Extraction" (Space status: Sleeping).
# File size: 4,753 bytes.
# The web blame view attributed the lines below to commits 6803196, b6fb9fd,
# 9d67179, c109f3a, 5bfecd6 and 4dfd00c; the per-line hash column was page
# residue, not code, and has been condensed into this header.

import pdfplumber
import pandas as pd
import re

# Column order of the returned DataFrame. A record is considered complete
# once it holds one value for each of these fields.
_PO_COLUMNS = [
    "Sl No",
    "Material Description",
    "Unit",
    "Quantity",
    "Dely Qty",
    "Dely Date",
    "Unit Rate",
    "Value",
]

# Columns converted with pd.to_numeric before the DataFrame is returned.
_PO_NUMERIC_COLUMNS = ("Quantity", "Dely Qty", "Unit Rate", "Value")

# Patterns are compiled once here instead of on every line of every page.
_SL_NO_RE = re.compile(r'(\d+)\s+')
_MATERIAL_DESC_RE = re.compile(r'(BPS\s+\d+.*?HSN Code:\d+\s+IGST\s*:\s*\d+%)')
_UNIT_RE = re.compile(r'\b(No|Kg|Pack)\b')

# Fields taken verbatim from capture group 1 of the first line that matches.
_FIRST_MATCH_FIELDS = (
    ("Quantity", re.compile(r'(\d+)\s+Quantity')),
    ("Dely Qty", re.compile(r'(\d+)\s+Dely Qty')),
    ("Dely Date", re.compile(r'(\d{2}\.\d{2}\.\d{4})')),
)

# Amount fields. The capture also accepts ',' after the first character so a
# grouped figure such as "1,250.00" is taken whole rather than cut off at the
# separator; the commas are removed before the value is stored.
_AMOUNT_FIELDS = (
    ("Unit Rate", re.compile(r'Unit Rate\s+([\d.][\d.,]*)')),
    ("Value", re.compile(r'Value\s+([\d.][\d.,]*)')),
)


def _update_record_from_line(record, line):
    """Store in *record* every still-missing field that *line* provides."""
    # Sl No: leading digits followed by whitespace at the very start of the line.
    sl_no_match = _SL_NO_RE.match(line)
    if sl_no_match and "Sl No" not in record:
        record["Sl No"] = sl_no_match.group(1)

    # Material Description: the pattern has to match within a single line;
    # a further match on a later line is appended to the text already held.
    desc_match = _MATERIAL_DESC_RE.search(line)
    if desc_match:
        if "Material Description" in record:
            record["Material Description"] += " " + desc_match.group(0)
        else:
            record["Material Description"] = desc_match.group(0)

    # NOTE(review): "No" is matched as a whole word anywhere in the line, so a
    # header such as "Sl No" would also satisfy this -- confirm against real PDFs.
    if "Unit" not in record:
        unit_match = _UNIT_RE.search(line)
        if unit_match:
            record["Unit"] = unit_match.group(0)

    for field, pattern in _FIRST_MATCH_FIELDS:
        if field in record:
            continue
        found = pattern.search(line)
        if found:
            record[field] = found.group(1)

    for field, pattern in _AMOUNT_FIELDS:
        if field in record:
            continue
        found = pattern.search(line)
        if found:
            record[field] = found.group(1).replace(",", "")


def extract_po_details(file_obj):
    """Extract purchase-order line items from a PDF into a DataFrame.

    The text of every page is scanned line by line. Each of the eight fields
    is taken from the first line on which its pattern matches (only the
    material description can be extended by later matches). As soon as all
    eight fields are present the record is emitted as a row and collection
    starts over. A record in progress is carried across page boundaries.

    Args:
        file_obj: Passed unchanged to ``pdfplumber.open``.

    Returns:
        A ``pandas.DataFrame`` with the columns "Sl No", "Material
        Description", "Unit", "Quantity", "Dely Qty", "Dely Date",
        "Unit Rate" and "Value". "Quantity", "Dely Qty", "Unit Rate" and
        "Value" are converted with ``pd.to_numeric(errors='coerce')``, so
        unparseable text becomes NaN. When no complete record is found, an
        empty DataFrame with the same columns is returned.
    """
    rows = []    # Completed records, each a list ordered like _PO_COLUMNS
    record = {}  # Fields gathered so far for the record in progress

    with pdfplumber.open(file_obj) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                print(f"[DEBUG] No text found on page {page_number}. Skipping.")
                continue

            print(f"[DEBUG] Processing text from page {page_number}")

            for line_number, line in enumerate(text.splitlines(), start=1):
                print(f"[DEBUG] Processing line {line_number} on page {page_number}: {line}")

                _update_record_from_line(record, line)

                # Only the keys in _PO_COLUMNS are ever stored, so an equal
                # length means every field has been seen.
                if len(record) == len(_PO_COLUMNS):
                    print(f"[DEBUG] Complete record found: {record}")
                    rows.append([record[column] for column in _PO_COLUMNS])
                    record = {}  # Reset for next record

    # Report a leftover partial record instead of dropping it without a trace.
    if record:
        print(f"[DEBUG] Discarding incomplete trailing record: {record}")

    if not rows:
        print("[DEBUG] No data extracted. Check if the PDF structure or regex is correct.")
        return pd.DataFrame(columns=list(_PO_COLUMNS))

    df = pd.DataFrame(rows, columns=list(_PO_COLUMNS))

    # Convert numeric columns to appropriate types
    for column in _PO_NUMERIC_COLUMNS:
        df[column] = pd.to_numeric(df[column], errors='coerce')

    print(f"[DEBUG] Data extracted successfully:\n{df}")
    return df