Spaces:

jithenderchoudary
/

Bhel_Po

Sleeping

App Files Files Community

Bhel_Po / app.py

jithenderchoudary

Update app.py

64a1f48 verified about 1 year ago

raw

history blame contribute delete

3.72 kB

	import pdfplumber
	import re
	import pandas as pd
	import gradio as gr

	def extract_data(pdf_file):
	"""
	Extract data from the uploaded PDF and return it as an Excel file.
	"""
	data = []
	columns = ["SI No", "Material Description", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"]

	start_si, end_si = 10, 1150 # Define SI No range for processing

	with pdfplumber.open(pdf_file) as pdf:
	for page in pdf.pages:
	full_text = page.extract_text() # Get the text content for the page
	lines = full_text.splitlines() if full_text else []

	for line in lines:
	try:
	# Parse the first column for SI No
	si_no_match = re.match(r"^\s*(\d+)\s", line)
	if not si_no_match:
	continue

	si_no = int(si_no_match.group(1))
	if not (start_si <= si_no <= end_si):
	continue # Skip rows outside the range

	# Extract Material Description and details dynamically
	material_desc = extract_material_description(full_text, si_no)

	# Extract remaining fields
	parts = line.split()
	unit = parts[3]
	quantity = int(parts[4])
	dely_qty = int(parts[5])
	dely_date = parts[6]
	unit_rate = float(parts[7])
	value = float(parts[8])

	# Append row data
	data.append([si_no, material_desc, unit, quantity, dely_qty, dely_date, unit_rate, value])

	except (ValueError, IndexError):
	# Skip invalid rows or rows with missing data
	continue

	# Convert data to DataFrame and save as Excel
	df = pd.DataFrame(data, columns=columns)
	excel_path = "/tmp/Extracted_PO_Data_Accurate.xlsx"
	df.to_excel(excel_path, index=False)
	return excel_path

	def extract_material_description(full_text, si_no):
	"""
	Extract Material Description, including Material Number, HSN Code, and IGST, using unique patterns.
	"""
	material_desc = ""

	# Match the specific SI No row to extract details
	si_no_pattern = rf"{si_no}\s+(BPS\s+\d+).*?Material\s+Number:\s+(\d+)"
	match = re.search(si_no_pattern, full_text, re.DOTALL)
	if match:
	bps_code = match.group(1)
	material_number = match.group(2)
	material_desc += f"{bps_code}\nMaterial Number: {material_number}\n"

	# Extract HSN Code
	hsn_code_match = re.search(r"HSN\s+Code:\s*(\d+)", full_text)
	if hsn_code_match:
	hsn_code = hsn_code_match.group(1)
	material_desc += f"HSN Code: {hsn_code}\n"
	else:
	material_desc += "HSN Code: Not Found\n"

	# Extract IGST
	igst_match = re.search(r"IGST\s:\s(\d+)\s*%", full_text)
	if igst_match:
	igst = igst_match.group(1)
	material_desc += f"IGST: {igst} %"
	else:
	material_desc += "IGST: Not Found"

	return material_desc.strip()

	# Gradio Interface function
	def gradio_interface(pdf_file):
	"""
	Interface function for Gradio to process the PDF and return the Excel file.
	"""
	return process_po(pdf_file.name)

	# Define Gradio interface
	interface = gr.Interface(
	fn=gradio_interface,
	inputs=gr.File(label="Upload PDF"),
	outputs=gr.File(label="Download Accurate Excel"),
	title="BHEL PO Data Extractor",
	description="Upload a BHEL Purchase Order (PO) PDF to extract material numbers, descriptions, and IGST information into an Excel file."
	)

	if __name__ == "__main__":
	interface.launch()