Spaces:

vaibhavbalar
/

masho

Sleeping

App Files Files Community

masho / app.py

vaibhavbalar

Update app.py

60cb00c verified about 1 year ago

raw

history blame contribute delete

3.82 kB

	import gradio as gr
	import pandas as pd
	import PyPDF2
	import re
	import io

	def extract_records_from_pdf(pdf_path):
	with open(pdf_path, "rb") as f:
	reader = PyPDF2.PdfReader(f)
	all_text = "\n".join(page.extract_text() for page in reader.pages if page.extract_text())
	raw_records = re.split(r"Customer Address", all_text)
	raw_records = [r.strip() for r in raw_records if r.strip()]
	records = []
	for rec in raw_records:
	lines = [l.strip() for l in rec.split('\n') if l.strip()]
	if not lines:
	continue
	customer_name = lines[0]
	address_lines = []
	for line in lines[1:]:
	if re.match(r"^(If undelivered\|Product Details\|COD:\|TAX INVOICE\|AWB\|Return Code\|Valmo\|Ecom Express\|Pickup)", line):
	break
	address_lines.append(line)
	customer_address = ", ".join(address_lines)
	awb_code = ""
	found_return_code = False
	for i, line in enumerate(lines):
	if line.startswith("Return Code"):
	found_return_code = True
	if i + 2 < len(lines):
	awb_code = lines[i + 2].strip()
	break
	prod_line = ""
	prod_index = -1
	for i, line in enumerate(lines):
	if "Product Details" in line:
	prod_index = i
	j = i + 1
	while j < len(lines):
	if re.match(r"^SKU\b.Order[\s_]No\.?", lines[j], re.IGNORECASE):
	j += 1
	continue
	if lines[j].strip():
	prod_line = lines[j].strip()
	break
	j += 1
	break
	if not found_return_code and prod_line and prod_index > 0:
	candidate = lines[prod_index - 1].strip()
	if candidate:
	awb_code = candidate
	sku_code = ""
	order_no = ""
	if prod_line:
	parts = prod_line.split()
	if len(parts) > 1:
	order_no = parts[-1]
	if len(parts) > 4:
	sku_code = " ".join(parts[:-3])
	else:
	sku_code = " ".join(parts[:-1])

	records.append({
	"Customer Name": customer_name,
	"Customer Address": customer_address,
	"SKU code": sku_code,
	"Order No.": order_no,
	"AWB/Return Code": awb_code,
	})
	return pd.DataFrame(records)

	def process_pdf_and_export(pdf_file):
	    """Parse an uploaded PDF and prepare its records as an in-memory CSV.

	    Args:
	        pdf_file: Object whose ``name`` attribute is the path of the PDF.

	    Returns:
	        A ``(dataframe, payload)`` pair. On success ``payload`` is
	        ``("result.csv", buffer)`` where ``buffer`` is a ``BytesIO`` holding
	        the CSV, rewound to the start. If ``pdf_file`` has no ``name``
	        attribute, the dataframe holds a single "Error" row and ``payload``
	        is ``None``.
	    """
	    # Guard clause: without a path there is nothing to parse.
	    if not hasattr(pdf_file, "name"):
	        error_table = pd.DataFrame([{"Error": "File object has no .name attribute"}])
	        return error_table, None

	    table = extract_records_from_pdf(pdf_file.name)
	    csv_buffer = io.BytesIO()
	    table.to_csv(csv_buffer, index=False)
	    csv_buffer.seek(0)  # rewind so the consumer reads from the beginning
	    return table, ("result.csv", csv_buffer)

	# UI: one PDF upload, a table preview of the parsed rows, and a CSV download.
	with gr.Blocks() as demo:
	    gr.Markdown("## Meesho PDF Extractor - Table View & CSV Download (Spaces Ready)")
	    file_upload = gr.File(label="Upload your PDF")
	    df_output = gr.Dataframe()
	    csv_download = gr.File(label="Download CSV")

	    def update_output(pdf_file):
	        """Parse the uploaded PDF and return ``(dataframe, csv_path_or_None)``.

	        The CSV is written into a fresh temporary directory on every call.
	        A fixed path in the working directory would be shared by all
	        sessions, so two concurrent uploads could overwrite each other's
	        file and one user could download another user's data.
	        """
	        # Local imports keep this block self-contained.
	        import os
	        import tempfile

	        df, file_tuple = process_pdf_and_export(pdf_file)
	        if file_tuple is None:
	            return df, None

	        fname, buf = file_tuple
	        # A unique directory keeps the download's basename ("result.csv")
	        # while isolating each request's output. The directory is not
	        # removed here; it must outlive this call so the file can be served.
	        out_path = os.path.join(tempfile.mkdtemp(prefix="masho_"), fname)
	        with open(out_path, "wb") as f:
	            f.write(buf.read())
	        return df, out_path

	    # Re-run the extraction whenever the uploaded file changes.
	    file_upload.change(
	        update_output,
	        inputs=file_upload,
	        outputs=[df_output, csv_download],
	    )

	demo.launch()