"""Gradio app that extracts purchase-order line items from a PDF into Excel."""

import os
import re
import tempfile

import fitz  # PyMuPDF
import gradio as gr
import pandas as pd

# Header row that marks a page as containing the line-item table.
ITEM_HEADER_PATTERN = re.compile(
    r'Pos\.\s*Item Code\s*Unit\s*Delivery Date\s*Quantity\s*Basic Price\s*Discount\s*Cur\.\s*Amount',
    re.IGNORECASE,
)

# One line item: nine whitespace-separated fields. ``\s+`` also matches
# newlines, so fields may be spread over several lines of extracted text.
ITEM_ROW_PATTERN = re.compile(
    r'(\d+)\s+(\d+)\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+(\w+)\s+([\d.]+)'
)

# Output column names, in the same order as the capture groups above.
COLUMNS = (
    "Position",
    "Item Code",
    "Unit",
    "Delivery Date",
    "Quantity",
    "Basic Price",
    "Discount",
    "Currency",
    "Amount",
)


def _resolve_pdf_path(pdf_file):
    """Return the filesystem path for an uploaded file object or a path."""
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    # Plain paths are used as-is. This is checked before ``.name`` because
    # ``pathlib.Path.name`` is only the basename, not the full path.
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)
    # Otherwise assume a file-like wrapper exposing its path as ``.name``
    # (what the original code relied on).
    return pdf_file.name


def _parse_rows(text):
    """Return one dict per line item found in *text*, or [] if no table header."""
    if not ITEM_HEADER_PATTERN.search(text):
        return []
    return [dict(zip(COLUMNS, fields)) for fields in ITEM_ROW_PATTERN.findall(text)]


def extract_po_to_excel(pdf_file):
    """Extract purchase-order line items from a PDF and save them as .xlsx.

    Each page is scanned for the line-item table header; on pages that have
    it, every text run matching the nine-field row pattern becomes one row.
    All values are kept as the strings found in the PDF.

    Args:
        pdf_file: Uploaded file object exposing its path as ``.name``, or a
            path given as ``str`` / ``os.PathLike``.

    Returns:
        Path of a temporary ``.xlsx`` file holding the extracted rows. The
        file is not deleted automatically. If nothing matched, the workbook
        contains only the header row.

    Raises:
        ValueError: If ``pdf_file`` is ``None``.
    """
    pdf_path = _resolve_pdf_path(pdf_file)

    extracted_data = []
    with fitz.open(pdf_path) as pdf:
        for page in pdf:
            extracted_data.extend(_parse_rows(page.get_text("text")))

    # Explicit columns keep the header row even when no rows were extracted.
    df = pd.DataFrame(extracted_data, columns=list(COLUMNS))

    # Reserve a unique path, then close the handle before pandas writes to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        excel_path = temp_file.name
    df.to_excel(excel_path, index=False)
    return excel_path


def main(pdf_file):
    """Gradio callback: convert the uploaded PDF and return the Excel path."""
    return extract_po_to_excel(pdf_file)


# Gradio interface
interface = gr.Interface(
    fn=main,
    inputs=gr.File(label="Upload PO PDF"),
    outputs=gr.File(label="Download Excel File"),
    title="PO PDF to Excel Converter",
    description="Upload a PO PDF file to extract and download it as an Excel sheet.",
)

if __name__ == "__main__":
    interface.launch()