PO_Details_Extraction / parse_bhel_po.py
SathvikGanta's picture
Update parse_bhel_po.py
9d67179 verified
import logging
import re

import pandas as pd
import pdfplumber
logger = logging.getLogger(__name__)

# Output columns, in order. A record is complete once it holds every one.
_PO_COLUMNS = (
    "Sl No",
    "Material Description",
    "Unit",
    "Quantity",
    "Dely Qty",
    "Dely Date",
    "Unit Rate",
    "Value",
)

# Columns converted to numbers in the returned DataFrame.
_NUMERIC_COLUMNS = ("Quantity", "Dely Qty", "Unit Rate", "Value")

# Amount fields whose captured text may contain thousands separators.
_AMOUNT_FIELDS = frozenset({"Unit Rate", "Value"})

# Patterns are compiled once here instead of on every line of every page.
_SL_NO_RE = re.compile(r'(\d+)\s+')
_MATERIAL_DESC_RE = re.compile(
    r'(BPS\s+\d+.*?HSN Code:\d+\s+IGST\s*:\s*\d+%)'
)

# (field, pattern) pairs where the first match wins; group 1 is the value.
# The amount patterns accept commas so "1,234.50" is not truncated to "1".
_FIRST_MATCH_PATTERNS = (
    ("Unit", re.compile(r'\b(No|Kg|Pack)\b')),
    ("Quantity", re.compile(r'(\d+)\s+Quantity')),
    ("Dely Qty", re.compile(r'(\d+)\s+Dely Qty')),
    ("Dely Date", re.compile(r'(\d{2}\.\d{2}\.\d{4})')),
    ("Unit Rate", re.compile(r'Unit Rate\s+([\d.][\d,.]*)')),
    ("Value", re.compile(r'Value\s+([\d.][\d,.]*)')),
)


def _update_record(record, line):
    """Store in *record* every not-yet-seen field that *line* provides.

    The record is modified in place. Each field keeps its first match,
    except "Material Description", where later matches are appended
    (space-separated) to the text already collected.
    """
    # The serial number is only recognised at the very start of a line.
    sl_no = _SL_NO_RE.match(line)
    if sl_no and "Sl No" not in record:
        record["Sl No"] = sl_no.group(1)

    description = _MATERIAL_DESC_RE.search(line)
    if description:
        if "Material Description" in record:
            record["Material Description"] += " " + description.group(0)
        else:
            record["Material Description"] = description.group(0)

    for field, pattern in _FIRST_MATCH_PATTERNS:
        if field in record:
            continue
        found = pattern.search(line)
        if not found:
            continue
        value = found.group(1)
        if field in _AMOUNT_FIELDS:
            # Drop thousands separators so pd.to_numeric can parse the amount.
            value = value.replace(",", "")
        record[field] = value


def _rows_to_frame(rows):
    """Build the result DataFrame from completed rows and coerce numerics."""
    df = pd.DataFrame(rows, columns=list(_PO_COLUMNS))
    for column in _NUMERIC_COLUMNS:
        # Unparseable text becomes NaN rather than raising.
        df[column] = pd.to_numeric(df[column], errors='coerce')
    return df


def extract_po_details(file_obj):
    """Extract purchase-order line items from a PDF into a DataFrame.

    The text of every page is scanned line by line. Fields are gathered
    into a pending record, and a row is emitted as soon as all eight
    fields have been seen. A partially filled record carries over to the
    following lines and pages; one that is still incomplete at the end
    of the document is discarded.

    Args:
        file_obj: The PDF to read; passed unchanged to ``pdfplumber.open``.

    Returns:
        A ``pandas.DataFrame`` with the columns "Sl No",
        "Material Description", "Unit", "Quantity", "Dely Qty",
        "Dely Date", "Unit Rate" and "Value". The four quantity/amount
        columns are numeric, with NaN where the text could not be parsed.
        If no complete record is found, an empty DataFrame with the same
        columns is returned.
    """
    data = []  # Completed rows, each ordered like _PO_COLUMNS.
    current_record = {}  # Fields gathered so far for the pending row.

    with pdfplumber.open(file_obj) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                logger.warning(
                    "No text found on page %d. Skipping.", page_number
                )
                continue

            logger.debug("Processing text from page %d", page_number)
            for line_number, line in enumerate(text.splitlines(), start=1):
                logger.debug(
                    "Processing line %d on page %d: %s",
                    line_number, page_number, line,
                )
                _update_record(current_record, line)

                # The record only ever holds keys from _PO_COLUMNS, so a
                # matching length means every field is present.
                if len(current_record) == len(_PO_COLUMNS):
                    logger.debug(
                        "Complete record found: %s", current_record
                    )
                    data.append(
                        [current_record[column] for column in _PO_COLUMNS]
                    )
                    current_record = {}  # Start collecting the next row.

    if not data:
        logger.warning(
            "No data extracted. "
            "Check if the PDF structure or regex is correct."
        )
        return pd.DataFrame(columns=list(_PO_COLUMNS))

    df = _rows_to_frame(data)
    logger.debug("Data extracted successfully:\n%s", df)
    return df