File size: 2,478 Bytes
0e57005
 
c979dff
0e57005
dd6d2da
0e57005
 
 
 
 
 
 
770419b
 
 
 
 
e3ee82d
 
 
 
 
770419b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd6d2da
 
 
 
 
770419b
 
 
 
 
 
 
dd6d2da
770419b
dd6d2da
 
 
 
0e57005
 
 
 
 
 
 
 
 
 
 
c979dff
db1150d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
import re
import tempfile

import gradio as gr
import pandas as pd
from langchain_community.document_loaders import PyMuPDFLoader  # Updated import

# Pattern for one purchase-order line item. The seven capture groups are
# consumed positionally by extract_data_from_pdf, which labels them
# "Sl No", "Material Description", "Material Number", "Quantity",
# "Dely Qty", "Unit Rate" and "Value", in that order.
# The pattern is written as adjacent raw-string pieces; Python joins them
# into a single pattern string at compile time.
item_regex = re.compile(
    r"(\d+)\s+"                      # group 1: run of digits opening the item
    r"([A-Z\s]+)\s+"                 # group 2: upper-case words / whitespace
    r"Material Number:\s+(\d+)\s+"   # group 3: digits after "Material Number:"
    r"HSN Code:\d+\s+"               # HSN code must be present, not captured
    r"IGST :\s+\d+ %\s+"             # IGST percentage must be present, not captured
    r"NO\s+"                         # literal "NO" token preceding the numbers
    r"(\d+)\s+(\d+)\s+"              # groups 4 and 5: two integer fields
    r"([\d.]+)\s+([\d.]+)"           # groups 6 and 7: two digit/dot fields
)

# Column labels for the seven capture groups of ``item_regex``, in group order.
_ITEM_COLUMNS = (
    "Sl No",
    "Material Description",
    "Material Number",
    "Quantity",
    "Dely Qty",
    "Unit Rate",
    "Value",
)


def _resolve_pdf_path(pdf_file):
    """Return the on-disk path of an upload given as a path or a file-like object."""
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)
    # File-like upload wrappers expose their location through ``name``.
    return pdf_file.name


def _row_from_match(match):
    """Turn one ``item_regex`` match tuple into a labelled row dict."""
    row = dict(zip(_ITEM_COLUMNS, match))
    # The description group also matches whitespace, so it can carry line
    # breaks or trailing blanks from the PDF layout; collapse them.
    row["Material Description"] = " ".join(row["Material Description"].split())
    return row


def _save_to_temp_excel(df, prefix):
    """Write *df* to a uniquely named temporary ``.xlsx`` file and return its path."""
    fd, path = tempfile.mkstemp(prefix=prefix, suffix=".xlsx")
    # Only the unique name is needed; pandas opens the path itself.
    os.close(fd)
    df.to_excel(path, index=False)
    return path


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order line items from a PDF into an Excel workbook.

    Args:
        pdf_file: The uploaded PDF, either as a filesystem path (``str`` or
            ``os.PathLike``) or as an object exposing that path through a
            ``name`` attribute. Which form arrives presumably depends on the
            installed Gradio version; both are accepted.

    Returns:
        Path to a newly created ``.xlsx`` file containing one of:
        one row per matched line item; a single ``Message`` row when no
        item matched; or a single ``Error`` row when processing failed.

    Note:
        Every call writes to its own temporary file so that concurrent
        requests cannot overwrite each other's result. The files are not
        removed here because the caller still has to read them after this
        function returns.
    """
    try:
        # Load and parse the PDF document
        loader = PyMuPDFLoader(_resolve_pdf_path(pdf_file))
        documents = loader.load()

        # Print document content for debugging
        print("Extracted Document Content for Debugging:")
        for doc in documents:
            print(doc.page_content)

        # Collect every line item found on every page
        data = []
        for doc in documents:
            data.extend(
                _row_from_match(match)
                for match in item_regex.findall(doc.page_content)
            )

        if data:
            df = pd.DataFrame(data)
        else:
            # Still hand back a workbook so the user sees why it is empty
            df = pd.DataFrame([{"Message": "No matching data found in the PDF."}])

        return _save_to_temp_excel(df, "extracted_po_data_")

    except Exception as e:
        # UI boundary: log the failure and return a workbook describing it
        # instead of letting the exception reach the web interface.
        print(f"Error processing PDF: {e}")
        error_df = pd.DataFrame([{"Error": str(e)}])
        return _save_to_temp_excel(error_df, "error_message_")

# Web UI: a single file upload (the PO PDF) mapped through
# extract_data_from_pdf to a single downloadable file (the Excel workbook).
interface = gr.Interface(
    title="PO PDF to Excel Converter",
    description="Upload a Purchase Order PDF to extract fields into an Excel file.",
    fn=extract_data_from_pdf,
    inputs="file",
    outputs="file",
)

if __name__ == "__main__":
    # Turn on request queueing (at most 10 waiting requests), then start the
    # server with a shareable link.
    queued_interface = interface.queue(max_size=10)
    queued_interface.launch(share=True)