Spaces:

SathvikGanta
/

PO_Details_Extraction

Sleeping

App Files Files Community

PO_Details_Extraction / parse_bhel_po.py

SathvikGanta

Update parse_bhel_po.py

9d67179 verified about 1 year ago

raw

history blame contribute delete

4.75 kB

	import pdfplumber
	import pandas as pd
	import re

	# Column order of the returned DataFrame. A record counts as complete once
	# it holds a value for every one of these keys.
	_PO_COLUMNS = (
	    "Sl No",
	    "Material Description",
	    "Unit",
	    "Quantity",
	    "Dely Qty",
	    "Dely Date",
	    "Unit Rate",
	    "Value",
	)

	# Columns converted with pd.to_numeric before the frame is returned.
	_PO_NUMERIC_COLUMNS = ("Quantity", "Dely Qty", "Unit Rate", "Value")

	# Serial number: digits followed by whitespace at the very start of a line.
	_SL_NO_RE = re.compile(r'(\d+)\s+')

	# Material description: from the "BPS <digits>" item code through the IGST
	# percentage.  `.*?` lets free text sit between the item code and
	# "HSN Code:", and `\s*:` tolerates "IGST:" as well as "IGST :".  The earlier
	# pattern (`.?` / `\s:`) allowed at most one character there.
	# NOTE(review): confirm the widened pattern against a real PO layout.
	_MATERIAL_DESC_RE = re.compile(r'(BPS\s+\d+.*?HSN Code:\d+\s+IGST\s*:\s*\d+%)')

	# Fields that keep the first match seen for the current record, listed in
	# the order they are tried on each line: (record key, pattern, group).
	# The unit alternation uses real `|` operators; with escaped pipes it could
	# only match the literal text "No|Kg|Pack", so no record ever completed.
	_FIRST_MATCH_FIELDS = (
	    ("Unit", re.compile(r'\b(No|Kg|Pack)\b'), 0),
	    ("Quantity", re.compile(r'(\d+)\s+Quantity'), 1),
	    ("Dely Qty", re.compile(r'(\d+)\s+Dely Qty'), 1),
	    ("Dely Date", re.compile(r'(\d{2}\.\d{2}\.\d{4})'), 0),
	    ("Unit Rate", re.compile(r'Unit Rate\s+([\d.]+)'), 1),
	    ("Value", re.compile(r'Value\s+([\d.]+)'), 1),
	)


	def _update_record(record, line):
	    """Store in *record* (in place) every PO field found on *line*."""
	    if "Sl No" not in record:
	        sl_no = _SL_NO_RE.match(line)
	        if sl_no:
	            record["Sl No"] = sl_no.group(1)

	    # Unlike the other fields, the description accumulates: each further
	    # match within the same record is appended, separated by a space.
	    description = _MATERIAL_DESC_RE.search(line)
	    if description:
	        if "Material Description" in record:
	            record["Material Description"] += " " + description.group(0)
	        else:
	            record["Material Description"] = description.group(0)

	    for key, pattern, group in _FIRST_MATCH_FIELDS:
	        if key in record:
	            continue
	        found = pattern.search(line)
	        if found:
	            record[key] = found.group(group)


	def _extract_rows(pdf):
	    """Return one list of raw string values per complete record in *pdf*."""
	    rows = []
	    # Deliberately not reset per page: a record may begin on one page and
	    # be completed by lines on the next.
	    record = {}

	    for page_number, page in enumerate(pdf.pages, start=1):
	        text = page.extract_text()
	        if not text:
	            print(f"[DEBUG] No text found on page {page_number}. Skipping.")
	            continue

	        print(f"[DEBUG] Processing text from page {page_number}")
	        for line_number, line in enumerate(text.splitlines(), start=1):
	            print(f"[DEBUG] Processing line {line_number} on page {page_number}: {line}")
	            _update_record(record, line)

	            if all(column in record for column in _PO_COLUMNS):
	                print(f"[DEBUG] Complete record found: {record}")
	                rows.append([record[column] for column in _PO_COLUMNS])
	                record = {}  # Start collecting the next record

	    return rows


	def extract_po_details(file_obj):
	    """Extract purchase-order line items from a PDF into a DataFrame.

	    The text of every page is scanned line by line.  Fields are collected
	    into a running record; as soon as all eight fields are present the
	    record is emitted as a row and a new record is started.  Fields left
	    over in an incomplete record at the end of the document are discarded.

	    Args:
	        file_obj: Whatever ``pdfplumber.open`` accepts, or an already-open
	            PDF object (anything exposing a ``pages`` attribute).  An
	            already-open object is used as is and is not closed here.

	    Returns:
	        A ``pandas.DataFrame`` with the columns "Sl No",
	        "Material Description", "Unit", "Quantity", "Dely Qty",
	        "Dely Date", "Unit Rate" and "Value".  "Quantity", "Dely Qty",
	        "Unit Rate" and "Value" are converted with ``pd.to_numeric``
	        (unparseable values become NaN); the other columns stay strings.
	        When no complete record is found the frame has these columns and
	        no rows.
	    """
	    if hasattr(file_obj, "pages"):
	        rows = _extract_rows(file_obj)
	    else:
	        with pdfplumber.open(file_obj) as pdf:
	            rows = _extract_rows(pdf)

	    columns = list(_PO_COLUMNS)
	    if not rows:
	        print("[DEBUG] No data extracted. Check if the PDF structure or regex is correct.")
	        return pd.DataFrame(columns=columns)

	    df = pd.DataFrame(rows, columns=columns)
	    for column in _PO_NUMERIC_COLUMNS:
	        df[column] = pd.to_numeric(df[column], errors='coerce')

	    print(f"[DEBUG] Data extracted successfully:\n{df}")
	    return df