Spaces:

jithenderchoudary
/

Langc1

Sleeping

App Files Files Community

Langc1 / app.py

jithenderchoudary

Update app.py

cbb8f1a verified over 1 year ago

raw

history blame contribute delete

2.46 kB

	import os
	import re
	import tempfile

	import gradio as gr
	import pandas as pd
	from langchain_community.document_loaders import PyMuPDFLoader

	# Pattern for one purchase-order line item. The seven capture groups are read
	# positionally by extract_data_from_pdf, in this order: Sl No, Material
	# Description, Material Number, Quantity, Dely Qty, Unit Rate, Value.
	# The pattern is NOT compiled in verbose mode, so the literal spaces inside
	# the pieces below are significant. Adjacent raw strings are concatenated by
	# the parser into a single pattern.
	item_regex = re.compile(
	    r"(\d+)\s+"                        # group 1: serial number
	    r"([A-Z\s]+)\s+"                   # group 2: description (uppercase letters/whitespace only)
	    r"Material Number:\s+(\d+)\s+"     # group 3: material number
	    r"HSN Code:\s\d+\s"                # HSN code is matched but not captured
	    r"IGST :\s\d+\s%\s"                # IGST percentage is matched but not captured
	    r"NO\s"                            # literal "NO" token (presumably a unit column -- confirm)
	    r"(\d+)\s+"                        # group 4: quantity
	    r"(\d+)\s+"                        # group 5: delivery quantity
	    r"([\d.]+)\s+"                     # group 6: unit rate
	    r"([\d.]+)"                        # group 7: value
	)

	def _resolve_pdf_path(pdf_file):
	    """Return the filesystem path behind an uploaded-file value.

	    Accepts a path string / path-like object, or any object exposing the
	    path as ``.name`` (what the original code required). Which of these the
	    UI passes depends on the Gradio version and File component settings.

	    Raises:
	        ValueError: if *pdf_file* is ``None`` (nothing was uploaded).
	    """
	    if pdf_file is None:
	        raise ValueError("No PDF file was provided.")
	    if isinstance(pdf_file, (str, os.PathLike)):
	        return os.fspath(pdf_file)
	    return pdf_file.name


	def _parse_items(page_text):
	    """Return one row dict per line item that ``item_regex`` finds in *page_text*."""
	    return [
	        {
	            "Sl No": sl_no,
	            "Material Description": description.strip(),
	            "Material Number": material_number,
	            "Quantity": quantity,
	            "Dely Qty": dely_qty,
	            "Unit Rate": unit_rate,
	            "Value": value,
	        }
	        for (
	            sl_no,
	            description,
	            material_number,
	            quantity,
	            dely_qty,
	            unit_rate,
	            value,
	        ) in item_regex.findall(page_text)
	    ]


	def _save_csv(frame, filename):
	    """Write *frame* as *filename* inside a fresh temp directory; return the path.

	    A new directory per call keeps concurrent requests from overwriting each
	    other's output while preserving the original file name.
	    """
	    out_dir = tempfile.mkdtemp(prefix="po_extract_")
	    csv_path = os.path.join(out_dir, filename)
	    frame.to_csv(csv_path, index=False)
	    return csv_path


	def extract_data_from_pdf(pdf_file):
	    """Extract purchase-order line items from a PDF and save them as CSV.

	    Each page's text is scanned with ``item_regex``; every match becomes one
	    CSV row. If nothing matches, the CSV holds a single ``Message`` row
	    instead. Any exception raised while processing is caught and reported
	    as a CSV with a single ``Error`` row, so the caller always receives a
	    file path.

	    Args:
	        pdf_file: The uploaded PDF, given as a path string/path-like object
	            or as an object whose ``.name`` attribute is the path.

	    Returns:
	        Path to the generated CSV: ``extracted_po_data.csv`` on success or
	        ``error_message.csv`` on failure, each in its own temp directory.
	    """
	    try:
	        pdf_path = _resolve_pdf_path(pdf_file)

	        # Load and parse the PDF document (one document per page).
	        documents = PyMuPDFLoader(pdf_path).load()

	        # Dump the extracted text (debugging aid) and collect matches in a
	        # single pass over the pages.
	        print("Extracted Document Content for Debugging:")
	        data = []
	        for doc in documents:
	            print(doc.page_content)
	            data.extend(_parse_items(doc.page_content))

	        if data:
	            df = pd.DataFrame(data)
	        else:
	            # Still hand back a file so the UI has something to download.
	            df = pd.DataFrame([{"Message": "No matching data found in the PDF."}])

	        return _save_csv(df, "extracted_po_data.csv")

	    except Exception as e:
	        # UI boundary: report the failure as a downloadable CSV rather than
	        # letting the exception propagate to the caller.
	        print(f"Error processing PDF: {e}")
	        return _save_csv(pd.DataFrame([{"Error": str(e)}]), "error_message.csv")

	# Web UI definition: a single file upload goes in, extract_data_from_pdf runs
	# on it, and the CSV path it returns is offered back as a file download.
	interface = gr.Interface(
	    title="PO PDF to CSV Converter",
	    description="Upload a Purchase Order PDF to extract fields into a CSV file.",
	    fn=extract_data_from_pdf,
	    inputs="file",
	    outputs="file",
	)

	if __name__ == "__main__":
	    # Turn on request queueing (at most 10 queued requests), then start the
	    # server with a shareable link requested.
	    queued_interface = interface.queue(max_size=10)
	    queued_interface.launch(share=True)