Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| import PyPDF2 | |
| import re | |
| import io | |
# Column order of the DataFrame returned by extract_records_from_pdf.
_RECORD_COLUMNS = [
    "Customer Name",
    "Customer Address",
    "SKU code",
    "Order No.",
    "AWB/Return Code",
]

# A line starting with any of these markers terminates the address block.
_ADDRESS_END_RE = re.compile(
    r"^(If undelivered|Product Details|COD:|TAX INVOICE|AWB|Return Code|Valmo|Ecom Express|Pickup)"
)

# Header row that may follow "Product Details" (starts with "SKU" and
# mentions "Order No"); it is skipped when looking for the product row.
_PRODUCT_HEADER_RE = re.compile(r"^SKU\b.*Order[\s_]*No\.?", re.IGNORECASE)


def _parse_record(chunk):
    """Parse one text chunk (the text following a "Customer Address" marker).

    Returns a dict keyed by the names in ``_RECORD_COLUMNS``, or ``None`` when
    the chunk contains no non-blank lines.
    """
    lines = [ln.strip() for ln in chunk.split("\n") if ln.strip()]
    if not lines:
        return None

    # The first non-blank line is taken as the customer name; the following
    # lines, up to the first known section marker, form the address.
    customer_name = lines[0]
    address_lines = []
    for line in lines[1:]:
        if _ADDRESS_END_RE.match(line):
            break
        address_lines.append(line)
    customer_address = ", ".join(address_lines)

    # Primary AWB source: the line two positions after "Return Code".
    awb_code = ""
    return_idx = next(
        (i for i, line in enumerate(lines) if line.startswith("Return Code")),
        None,
    )
    found_return_code = return_idx is not None
    if found_return_code and return_idx + 2 < len(lines):
        awb_code = lines[return_idx + 2]

    # Product row: first line after "Product Details" that is not the
    # column-header row.
    prod_index = next(
        (i for i, line in enumerate(lines) if "Product Details" in line),
        -1,
    )
    prod_line = ""
    if prod_index >= 0:
        for candidate in lines[prod_index + 1:]:
            if _PRODUCT_HEADER_RE.match(candidate):
                continue
            prod_line = candidate
            break

    # Fallback AWB source when no "Return Code" line exists: the line
    # immediately before "Product Details".
    if not found_return_code and prod_line and prod_index > 0:
        awb_code = lines[prod_index - 1]

    # The last whitespace-separated token of the product row is the order
    # number. For rows with more than four tokens the trailing three tokens
    # are dropped from the SKU; otherwise only the last token is dropped.
    sku_code = ""
    order_no = ""
    parts = prod_line.split()
    if len(parts) > 1:
        order_no = parts[-1]
        sku_parts = parts[:-3] if len(parts) > 4 else parts[:-1]
        sku_code = " ".join(sku_parts)

    return {
        "Customer Name": customer_name,
        "Customer Address": customer_address,
        "SKU code": sku_code,
        "Order No.": order_no,
        "AWB/Return Code": awb_code,
    }


def extract_records_from_pdf(pdf_path):
    """Extract customer/order records from the PDF at *pdf_path*.

    The text of all pages is concatenated and split on the literal
    "Customer Address"; every non-empty chunk is parsed into one record.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A pandas DataFrame with the columns "Customer Name",
        "Customer Address", "SKU code", "Order No." and "AWB/Return Code".
        The columns are present even when no record is found.

    NOTE(review): text that precedes the first "Customer Address" marker is
    also parsed as a record (unchanged behaviour) - confirm whether such a
    preamble can occur and should be skipped.
    """
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        # Extract each page exactly once; pages yielding no text are dropped.
        page_texts = [page.extract_text() for page in reader.pages]
    all_text = "\n".join(text for text in page_texts if text)

    records = []
    for chunk in re.split(r"Customer Address", all_text):
        chunk = chunk.strip()
        if not chunk:
            continue
        record = _parse_record(chunk)
        if record is not None:
            records.append(record)

    # Explicit columns keep the table/CSV header stable for empty results.
    return pd.DataFrame(records, columns=_RECORD_COLUMNS)
def process_pdf_and_export(pdf_file):
    """Extract records from an uploaded PDF and serialise them as CSV.

    Args:
        pdf_file: Either a filesystem path given as ``str`` or an object
            exposing the path through a ``.name`` attribute (such as a
            temporary-file wrapper).

    Returns:
        A ``(df, payload)`` tuple. On success ``df`` is the DataFrame of
        extracted records and ``payload`` is ``("result.csv", buffer)`` where
        ``buffer`` is an ``io.BytesIO`` positioned at the start of the CSV
        data. When *pdf_file* is neither a ``str`` nor has ``.name``, ``df``
        is a one-row DataFrame with an "Error" column and ``payload`` is
        ``None``.
    """
    if isinstance(pdf_file, str):
        # Upload widgets may hand over the path directly instead of a
        # file-like wrapper.
        pdf_path = pdf_file
    elif hasattr(pdf_file, "name"):
        pdf_path = pdf_file.name
    else:
        return pd.DataFrame([{"Error": "File object has no .name attribute"}]), None

    df = extract_records_from_pdf(pdf_path)
    buf = io.BytesIO()
    df.to_csv(buf, index=False)
    buf.seek(0)  # rewind so callers can read the CSV from the beginning
    return df, ("result.csv", buf)
# Gradio UI: upload a PDF, show the extracted table, offer the CSV download.
with gr.Blocks() as demo:
    gr.Markdown("## Meesho PDF Extractor - Table View & CSV Download (Spaces Ready)")
    file_upload = gr.File(label="Upload your PDF")
    df_output = gr.Dataframe()
    csv_download = gr.File(label="Download CSV")

    def update_output(pdf_file):
        """Handle a change of the upload widget.

        Returns a ``(dataframe, csv_path)`` pair for the table and download
        components; ``csv_path`` is ``None`` when no CSV was produced.
        """
        # Function-scope import keeps this block self-contained.
        import tempfile

        df, file_tuple = process_pdf_and_export(pdf_file)
        if file_tuple is None:
            return df, None

        _, buf = file_tuple
        # Write to a unique temporary file instead of a fixed "result.csv" in
        # the working directory, so concurrent sessions cannot overwrite (or
        # download) each other's results. delete=False because the file must
        # still exist after this function returns.
        with tempfile.NamedTemporaryFile(
            mode="wb", prefix="result_", suffix=".csv", delete=False
        ) as out:
            out.write(buf.read())
        return df, out.name

    file_upload.change(
        update_output,
        inputs=file_upload,
        outputs=[df_output, csv_download],
    )

demo.launch()