Spaces:

jithenderchoudary
/

poext

Sleeping

App Files Files Community

poext / app.py

jithenderchoudary

Update app.py

d88c0af verified over 1 year ago

raw

history blame contribute delete

2.32 kB

	import fitz # PyMuPDF
	import pandas as pd
	import gradio as gr
	import tempfile
	import re

	def _resolve_pdf_path(pdf_file):
	    """Return the filesystem path of an upload given as a path or a file-like object."""
	    import os  # local import: only this helper needs it

	    if pdf_file is None:
	        raise ValueError("No PDF file was provided.")
	    # A plain path (str / PathLike) is used as-is; checking this first also
	    # avoids reading `.name` on a pathlib.Path, which is only the basename.
	    if isinstance(pdf_file, (str, os.PathLike)):
	        return os.fspath(pdf_file)
	    # File-like uploads (e.g. tempfile wrappers) expose their path as `.name`.
	    path = getattr(pdf_file, "name", None)
	    if not path:
	        raise ValueError("Could not determine the path of the uploaded PDF.")
	    return path


	def extract_po_to_excel(pdf_file):
	    """Extract purchase-order line items from a PDF into a temporary Excel file.

	    Every page whose text contains the table header
	    ("Pos. Item Code Unit Delivery Date Quantity Basic Price Discount Cur. Amount")
	    is scanned for rows of nine whitespace-separated fields; pages without
	    the header are skipped. Captured values are written as the strings found
	    in the PDF text (no numeric or date conversion is performed).

	    Args:
	        pdf_file: Path to the PDF (``str`` or path-like), or an object whose
	            ``name`` attribute holds that path.

	    Returns:
	        Path of the generated ``.xlsx`` file. The file is created with
	        ``delete=False``, so it is not removed automatically.

	    Raises:
	        ValueError: If ``pdf_file`` is ``None`` or carries no usable path.
	    """
	    pdf_path = _resolve_pdf_path(pdf_file)

	    # Regular expressions to match key fields
	    item_pattern = re.compile(r'Pos\.\sItem Code\sUnit\sDelivery Date\sQuantity\sBasic Price\sDiscount\sCur\.\sAmount', re.IGNORECASE)
	    data_pattern = re.compile(r'(\d+)\s+(\d+)\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+(\w+)\s+([\d.]+)')

	    # Column names, in the same order as the capture groups of data_pattern.
	    columns = [
	        "Position",
	        "Item Code",
	        "Unit",
	        "Delivery Date",
	        "Quantity",
	        "Basic Price",
	        "Discount",
	        "Currency",
	        "Amount",
	    ]

	    extracted_data = []
	    with fitz.open(pdf_path) as pdf:
	        for page in pdf:
	            text = page.get_text("text")
	            # Only pages that contain the table header are parsed.
	            if not item_pattern.search(text):
	                continue
	            extracted_data.extend(
	                dict(zip(columns, match)) for match in data_pattern.findall(text)
	            )

	    # Passing the columns explicitly keeps the header row even when no
	    # line item matched.
	    df = pd.DataFrame(extracted_data, columns=columns)

	    # Reserve a temp path and close the handle BEFORE writing: on Windows a
	    # file that is still open cannot be reopened by name.
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
	        output_path = temp_file.name
	    df.to_excel(output_path, index=False)

	    return output_path

	def main(pdf_file):
	    """Gradio callback: convert the uploaded PO PDF and return the Excel file path."""
	    return extract_po_to_excel(pdf_file)

	# Gradio UI: a single file upload wired to `main`, which hands back the
	# generated Excel file for download.
	interface = gr.Interface(
	    title="PO PDF to Excel Converter",
	    description="Upload a PO PDF file to extract and download it as an Excel sheet.",
	    fn=main,
	    inputs=gr.File(label="Upload PO PDF"),
	    outputs=gr.File(label="Download Excel File"),
	)

	# Start the web app only when this file is executed as a script.
	if __name__ == "__main__":
	    interface.launch()