Spaces:
Sleeping
Sleeping
File size: 2,315 Bytes
02c0991 50a0564 b1003d3 50a0564 d88c0af 50a0564 d88c0af 47d2da2 d88c0af eb705ff 50a0564 d88c0af 50a0564 b1003d3 50a0564 05a3ebf 50a0564 b1003d3 05a3ebf 47d2da2 8ff450e d88c0af | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 | import fitz # PyMuPDF
import pandas as pd
import gradio as gr
import tempfile
import re
# Header row that marks the start of the line-item table on a page.
_PO_HEADER_RE = re.compile(
    r'Pos\.\s*Item Code\s*Unit\s*Delivery Date\s*Quantity\s*Basic Price\s*Discount\s*Cur\.\s*Amount',
    re.IGNORECASE,
)
# One line item: position, item code, unit, ISO date, quantity, basic price,
# discount, currency, amount -- separated by whitespace.
_PO_ROW_RE = re.compile(
    r'(\d+)\s+(\d+)\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+(\w+)\s+([\d.]+)'
)
# Output column names, in the same order as the capture groups of _PO_ROW_RE.
_PO_COLUMNS = (
    "Position",
    "Item Code",
    "Unit",
    "Delivery Date",
    "Quantity",
    "Basic Price",
    "Discount",
    "Currency",
    "Amount",
)


def _resolve_pdf_path(pdf_file):
    """Return the filesystem path of the uploaded PDF as given by the caller."""
    import os  # local import so the block does not depend on file-level imports

    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    # NOTE(review): depending on the Gradio version/configuration the File
    # component may pass a path string instead of a file wrapper -- confirm
    # against the installed version. Both shapes are accepted here.
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)
    # File-like wrapper: the path is exposed through ``.name``.
    return pdf_file.name


def extract_po_to_excel(pdf_file):
    """Extract purchase-order line items from a PDF into an Excel workbook.

    Every page whose text contains the line-item table header is scanned for
    rows matching the line-item pattern; pages without the header are skipped.
    All captured values are kept as strings.

    Args:
        pdf_file: Path to the PDF (``str`` or ``os.PathLike``) or an object
            exposing the path through a ``name`` attribute.

    Returns:
        Path of a newly created temporary ``.xlsx`` file. The file is not
        deleted automatically; the caller owns it.

    Raises:
        ValueError: If ``pdf_file`` is ``None``.
    """
    pdf_path = _resolve_pdf_path(pdf_file)

    rows = []
    with fitz.open(pdf_path) as pdf:
        for page in pdf:
            text = page.get_text("text")
            # Only pages that show the table header contribute rows.
            if _PO_HEADER_RE.search(text):
                rows.extend(_PO_ROW_RE.findall(text))

    # Explicit columns keep the header row even when nothing matched.
    df = pd.DataFrame(rows, columns=list(_PO_COLUMNS))

    # Reserve a unique path, then close the handle *before* pandas writes to
    # it: an open NamedTemporaryFile cannot be reopened by name on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        excel_path = temp_file.name
    df.to_excel(excel_path, index=False)
    return excel_path
def main(pdf_file):
    """Convert the given PO PDF and return the path of the resulting Excel file."""
    return extract_po_to_excel(pdf_file)
# Gradio UI: one PDF upload in, one downloadable Excel file out.
interface = gr.Interface(
    title="PO PDF to Excel Converter",
    description="Upload a PO PDF file to extract and download it as an Excel sheet.",
    fn=main,
    inputs=gr.File(label="Upload PO PDF"),
    outputs=gr.File(label="Download Excel File"),
)

if __name__ == "__main__":
    # Start the app only when executed as a script, not when imported.
    interface.launch()
|