File size: 2,463 Bytes
25ff5ac
 
 
 
 
 
cbb8f1a
25ff5ac
cbb8f1a
25ff5ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cbb8f1a
25ff5ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
import re
import tempfile

import gradio as gr
import pandas as pd
from langchain_community.document_loaders import PyMuPDFLoader

# Pattern for a single purchase-order line item in the text extracted from a
# PDF page. It yields seven capture groups, in the order they appear in the
# row; the HSN code and IGST percentage are matched but not captured.
# The pieces below are adjacent raw-string literals, so they are joined into
# one pattern at compile time.
item_regex = re.compile(
    r"(\d+)\s+"                         # 1: serial number
    r"([A-Z\s]+)\s+"                    # 2: description (capitals and whitespace only)
    r"Material Number:\s+(\d+)\s+"      # 3: material number
    r"HSN Code:\s*\d+\s*"               # HSN code digits, skipped
    r"IGST :\s*\d+\s*%\s*"              # IGST percentage, skipped
    r"NO\s*"                            # literal "NO" -- presumably the unit column; TODO confirm
    r"(\d+)\s+"                         # 4: quantity
    r"(\d+)\s+"                         # 5: delivery quantity
    r"([\d.]+)\s+"                      # 6: unit rate
    r"([\d.]+)"                         # 7: value
)

def _write_csv_in_fresh_dir(frame, filename):
    """Write *frame* as CSV (no index column) into a new temp directory.

    A new directory is created for every call so that simultaneous requests
    never overwrite each other's output, while the file keeps a stable,
    readable name. The directory is not removed here; the caller still needs
    the file after this function returns.

    Args:
        frame: The pandas DataFrame to serialise.
        filename: Base name of the CSV file to create.

    Returns:
        The full path of the CSV file that was written.
    """
    out_dir = tempfile.mkdtemp(prefix="po_extract_")
    csv_path = os.path.join(out_dir, filename)
    frame.to_csv(csv_path, index=False)
    return csv_path


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order line items from a PDF into a CSV file.

    The text of every page is scanned with the module-level ``item_regex``;
    each match becomes one row with the columns "Sl No",
    "Material Description", "Material Number", "Quantity", "Dely Qty",
    "Unit Rate" and "Value". When nothing matches, the CSV contains a single
    "Message" row saying so.

    Args:
        pdf_file: The uploaded PDF, given either as a filesystem path
            (``str`` or ``os.PathLike``) or as an object that exposes the
            path through a ``name`` attribute. Which of the two a Gradio
            ``"file"`` input delivers depends on the Gradio version.

    Returns:
        Path to the CSV that was written. On success it is named
        ``extracted_po_data.csv``; if anything fails, the error is printed
        and the path of an ``error_message.csv`` holding the error text in
        an "Error" column is returned instead. This function does not raise
        for processing errors.
    """
    try:
        if pdf_file is None:
            raise ValueError("No PDF file was provided.")

        # Accept both a bare path and a file-like wrapper carrying `.name`.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_file)
        else:
            pdf_path = pdf_file.name

        # Load and parse the PDF document
        loader = PyMuPDFLoader(pdf_path)
        documents = loader.load()

        # Print document content for debugging
        print("Extracted Document Content for Debugging:")
        for doc in documents:
            print(doc.page_content)

        # Collect one row per regex match, page by page
        data = []
        for doc in documents:
            for sl_no, description, material_no, qty, dely_qty, rate, value in (
                item_regex.findall(doc.page_content)
            ):
                data.append({
                    "Sl No": sl_no,
                    # The description group may capture surrounding
                    # whitespace or line breaks; trim it.
                    "Material Description": description.strip(),
                    "Material Number": material_no,
                    "Quantity": qty,
                    "Dely Qty": dely_qty,
                    "Unit Rate": rate,
                    "Value": value,
                })

        if data:
            df = pd.DataFrame(data)
        else:
            # Still return a valid CSV so the user gets an explanation
            # rather than an empty download.
            df = pd.DataFrame([{"Message": "No matching data found in the PDF."}])

        return _write_csv_in_fresh_dir(df, "extracted_po_data.csv")

    except Exception as e:
        # UI boundary: report the failure as a downloadable CSV instead of
        # letting the exception propagate to the caller.
        print(f"Error processing PDF: {e}")
        error_df = pd.DataFrame([{"Error": str(e)}])
        return _write_csv_in_fresh_dir(error_df, "error_message.csv")

# Web UI: a single file input wired to extract_data_from_pdf, with the
# resulting CSV offered back as a file output.
interface = gr.Interface(
    title="PO PDF to CSV Converter",
    description="Upload a Purchase Order PDF to extract fields into a CSV file.",
    fn=extract_data_from_pdf,
    inputs="file",
    outputs="file",
)

if __name__ == "__main__":
    # Turn on request queueing (max_size=10), then launch with share=True
    # to request a shareable link.
    queued_interface = interface.queue(max_size=10)
    queued_interface.launch(share=True)