masho / app.py
vaibhavbalar's picture
Update app.py
60cb00c verified
import io
import re
import tempfile
from pathlib import Path

import gradio as gr
import pandas as pd
import PyPDF2
# Column order of the DataFrame returned by extract_records_from_pdf.
_RECORD_COLUMNS = [
    "Customer Name",
    "Customer Address",
    "SKU code",
    "Order No.",
    "AWB/Return Code",
]

# A line starting with any of these markers ends the customer address.
_ADDRESS_END_RE = re.compile(
    r"^(If undelivered|Product Details|COD:|TAX INVOICE|AWB|Return Code|Valmo|Ecom Express|Pickup)"
)

# Column-header row that may follow the "Product Details" line.
_PRODUCT_HEADER_RE = re.compile(r"^SKU\b.*Order[\s_]*No\.?", re.IGNORECASE)


def _collect_address(lines):
    """Join the lines after the customer name up to the first end marker."""
    address_lines = []
    for line in lines[1:]:
        if _ADDRESS_END_RE.match(line):
            break
        address_lines.append(line)
    return ", ".join(address_lines)


def _find_return_code_awb(lines):
    """Return ``(found, awb)`` for the first line starting with "Return Code".

    ``awb`` is the line two positions below the marker, or "" when the
    record ends before that line.
    """
    for i, line in enumerate(lines):
        if line.startswith("Return Code"):
            awb = lines[i + 2] if i + 2 < len(lines) else ""
            return True, awb
    return False, ""


def _find_product_line(lines):
    """Return ``(index, product_line)`` for the first "Product Details" line.

    ``product_line`` is the first following line that is not the SKU/Order
    header row ("" if there is none). ``index`` is -1 when the record has no
    "Product Details" line at all.
    """
    for i, line in enumerate(lines):
        if "Product Details" in line:
            for candidate in lines[i + 1:]:
                if not _PRODUCT_HEADER_RE.match(candidate):
                    return i, candidate
            return i, ""
    return -1, ""


def _split_product_line(prod_line):
    """Return ``(sku_code, order_no)`` parsed from a product row.

    The last whitespace-separated token is the order number. When the row
    has more than four tokens, the two tokens before the order number are
    dropped from the SKU as well (presumably other columns of the product
    table -- confirm against a sample label).
    """
    parts = prod_line.split()
    if len(parts) <= 1:
        return "", ""
    sku_tokens = parts[:-3] if len(parts) > 4 else parts[:-1]
    return " ".join(sku_tokens), parts[-1]


def _parse_record(rec):
    """Parse one "Customer Address"-delimited text chunk into a record dict.

    Returns ``None`` when the chunk contains no non-blank lines.
    """
    lines = [s for s in (raw.strip() for raw in rec.split("\n")) if s]
    if not lines:
        return None

    found_return_code, awb_code = _find_return_code_awb(lines)
    prod_index, prod_line = _find_product_line(lines)

    # Without a "Return Code" marker, fall back to the line directly above
    # "Product Details" (only when a product row was actually found).
    if not found_return_code and prod_line and prod_index > 0:
        awb_code = lines[prod_index - 1]

    sku_code, order_no = _split_product_line(prod_line)
    return {
        "Customer Name": lines[0],
        "Customer Address": _collect_address(lines),
        "SKU code": sku_code,
        "Order No.": order_no,
        "AWB/Return Code": awb_code,
    }


def extract_records_from_pdf(pdf_path):
    """Extract one row per shipping label from the PDF at ``pdf_path``.

    The text of all pages is concatenated and split on the literal
    "Customer Address"; every non-empty chunk becomes one record.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A ``pandas.DataFrame`` with the columns "Customer Name",
        "Customer Address", "SKU code", "Order No." and "AWB/Return Code".
        The columns are present even when no record was found.
    """
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        # Extract each page once; text extraction is the expensive step.
        page_texts = [page.extract_text() for page in reader.pages]
    all_text = "\n".join(text for text in page_texts if text)

    records = []
    for chunk in all_text.split("Customer Address"):
        record = _parse_record(chunk)
        if record is not None:
            records.append(record)
    return pd.DataFrame(records, columns=_RECORD_COLUMNS)
def process_pdf_and_export(pdf_file):
    """Parse an uploaded PDF and prepare its records as an in-memory CSV.

    Args:
        pdf_file: Either a non-empty path string or an object whose ``.name``
            attribute holds the path of the PDF on disk.

    Returns:
        ``(df, ("result.csv", buf))`` where ``df`` is the extracted
        DataFrame and ``buf`` is a ``BytesIO`` positioned at the start of
        the CSV data. For any other input (including ``None``) returns
        ``(error_df, None)`` where ``error_df`` has a single "Error" column.
    """
    if isinstance(pdf_file, str) and pdf_file:
        pdf_path = pdf_file
    elif hasattr(pdf_file, "name"):
        pdf_path = pdf_file.name
    else:
        return pd.DataFrame([{"Error": "File object has no .name attribute"}]), None

    df = extract_records_from_pdf(pdf_path)
    buf = io.BytesIO()
    df.to_csv(buf, index=False)
    buf.seek(0)  # rewind so the caller can read the CSV from the start
    return df, ("result.csv", buf)
# Gradio UI: upload a PDF, show the extracted table, offer the CSV download.
with gr.Blocks() as demo:
    gr.Markdown("## Meesho PDF Extractor - Table View & CSV Download (Spaces Ready)")
    file_upload = gr.File(label="Upload your PDF")
    df_output = gr.Dataframe()
    csv_download = gr.File(label="Download CSV")

    def update_output(pdf_file):
        """Return the extracted table and the path of a CSV file to download.

        The second element is ``None`` when ``process_pdf_and_export``
        reports no CSV payload.
        """
        df, file_tuple = process_pdf_and_export(pdf_file)
        if file_tuple is None:
            return df, None

        fname, buf = file_tuple
        # Write into a fresh temp directory per request: a fixed path in the
        # working directory would be overwritten by concurrent sessions.
        # The directory is not removed afterwards.
        out_path = Path(tempfile.mkdtemp(prefix="meesho_csv_")) / fname
        out_path.write_bytes(buf.read())
        return df, str(out_path)

    file_upload.change(
        update_output,
        inputs=file_upload,
        outputs=[df_output, csv_download],
    )

demo.launch()