File size: 2,315 Bytes
02c0991
50a0564
b1003d3
50a0564
 
 
 
d88c0af
 
 
 
 
50a0564
 
 
 
 
 
 
d88c0af
 
 
 
 
47d2da2
d88c0af
 
 
 
 
 
eb705ff
50a0564
 
 
d88c0af
 
 
 
50a0564
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b1003d3
50a0564
05a3ebf
50a0564
 
 
b1003d3
 
 
 
05a3ebf
47d2da2
8ff450e
d88c0af
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import fitz  # PyMuPDF
import pandas as pd
import gradio as gr
import tempfile
import re

# Column names of the exported sheet, in the same order as the capture
# groups of _PO_ROW_RE.
_PO_COLUMNS = (
    "Position",
    "Item Code",
    "Unit",
    "Delivery Date",
    "Quantity",
    "Basic Price",
    "Discount",
    "Currency",
    "Amount",
)

# Header row of the line-item table; only pages containing it are parsed.
_PO_HEADER_RE = re.compile(r'Pos\.\s*Item Code\s*Unit\s*Delivery Date\s*Quantity\s*Basic Price\s*Discount\s*Cur\.\s*Amount', re.IGNORECASE)

# One line item: position, item code, unit, ISO date, quantity, basic price,
# discount, currency and amount, separated by whitespace.
_PO_ROW_RE = re.compile(r'(\d+)\s+(\d+)\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+(\w+)\s+([\d.]+)')


def _parse_po_rows(text):
    """Return the line items found in one page's text as a list of dicts."""
    if not _PO_HEADER_RE.search(text):
        return []
    return [dict(zip(_PO_COLUMNS, fields)) for fields in _PO_ROW_RE.findall(text)]


def extract_po_to_excel(pdf_file):
    """Extract purchase-order line items from a PDF into an Excel file.

    Every page whose text contains the line-item table header is scanned
    for rows matching the line-item pattern. All values are written as the
    strings captured from the PDF text.

    Args:
        pdf_file: Either a filesystem path (``str``) or an object exposing
            the path through a ``name`` attribute (such as an uploaded-file
            wrapper).

    Returns:
        Path of a newly created temporary ``.xlsx`` file. The file is not
        deleted automatically.

    Raises:
        ValueError: If ``pdf_file`` is ``None`` (no file was provided).
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    # Accept plain path strings as well as objects carrying a `.name` path.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    rows = []
    with fitz.open(pdf_path) as pdf:
        for page in pdf:
            rows.extend(_parse_po_rows(page.get_text("text")))

    # Passing the columns explicitly keeps the header row in the sheet even
    # when no line item was found.
    df = pd.DataFrame(rows, columns=list(_PO_COLUMNS))

    # Reserve a unique path and close the handle before pandas writes to it;
    # delete=False keeps the file on disk for the caller.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        excel_path = temp_file.name
    df.to_excel(excel_path, index=False)

    return excel_path

def main(pdf_file):
    """Convert the given PO PDF and return the path of the resulting Excel file."""
    return extract_po_to_excel(pdf_file)

# Web UI: a single file upload wired to `main`, returning a downloadable file.
interface = gr.Interface(
    main,
    gr.File(label="Upload PO PDF"),
    gr.File(label="Download Excel File"),
    description="Upload a PO PDF file to extract and download it as an Excel sheet.",
    title="PO PDF to Excel Converter",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()