Spaces:
Sleeping
Sleeping
| import fitz # PyMuPDF | |
| import pandas as pd | |
| import gradio as gr | |
| import tempfile | |
| import re | |
# Header row that marks a page as containing the PO item table.
_ITEM_HEADER_RE = re.compile(r'Pos\.\s*Item Code\s*Unit\s*Delivery Date\s*Quantity\s*Basic Price\s*Discount\s*Cur\.\s*Amount', re.IGNORECASE)
# One item row: pos, item code, unit, ISO date, quantity, price, discount, currency, amount.
_ITEM_ROW_RE = re.compile(r'(\d+)\s+(\d+)\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+(\w+)\s+([\d.]+)')
# Output column names, in the same order as the capture groups of _ITEM_ROW_RE.
_PO_COLUMNS = (
    "Position",
    "Item Code",
    "Unit",
    "Delivery Date",
    "Quantity",
    "Basic Price",
    "Discount",
    "Currency",
    "Amount",
)


def _parse_po_text(text):
    """Return the item rows found in one page's text as a list of dicts."""
    # Pages without the table header are skipped entirely, so stray
    # number sequences elsewhere in the document are not picked up.
    if not _ITEM_HEADER_RE.search(text):
        return []
    return [dict(zip(_PO_COLUMNS, fields)) for fields in _ITEM_ROW_RE.findall(text)]


def extract_po_to_excel(pdf_file):
    """Extract purchase-order item rows from a PDF into an Excel file.

    Args:
        pdf_file: Either a filesystem path (``str`` / path-like) or an
            object exposing the path as ``.name`` (e.g. a temp-file
            wrapper). Which of the two an upload widget passes depends
            on the Gradio version/configuration, so both are accepted.

    Returns:
        Path of a newly created temporary ``.xlsx`` file. The file is not
        deleted automatically. All extracted values are written as the
        strings captured from the PDF text.

    Raises:
        ValueError: If ``pdf_file`` is ``None`` (nothing was uploaded).
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    # Path-like input is used directly; anything else is expected to
    # carry its path in ``.name``, as the original implementation assumed.
    if isinstance(pdf_file, str) or hasattr(pdf_file, "__fspath__"):
        pdf_path = str(pdf_file)
    else:
        pdf_path = pdf_file.name

    extracted_data = []
    with fitz.open(pdf_path) as pdf:
        for page in pdf:
            extracted_data.extend(_parse_po_text(page.get_text("text")))

    # Fixing the columns keeps the header row in the workbook even when
    # no item rows were found.
    df = pd.DataFrame(extracted_data, columns=list(_PO_COLUMNS))

    # Reserve a unique path, then close the handle *before* pandas writes
    # to it: reopening a still-open NamedTemporaryFile by name is not
    # portable (it fails on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        excel_path = temp_file.name
    df.to_excel(excel_path, index=False)
    return excel_path
def main(pdf_file):
    """Gradio callback: convert the uploaded PO PDF and return the Excel path."""
    return extract_po_to_excel(pdf_file)
# Gradio UI: a single file upload in, a single downloadable file out.
_pdf_upload = gr.File(label="Upload PO PDF")
_excel_download = gr.File(label="Download Excel File")

interface = gr.Interface(
    fn=main,
    inputs=_pdf_upload,
    outputs=_excel_download,
    title="PO PDF to Excel Converter",
    description="Upload a PO PDF file to extract and download it as an Excel sheet.",
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    interface.launch()