Spaces:

jithenderchoudary
/

Langc1

Sleeping

File size: 2,463 Bytes

25ff5ac
 
 
 
 
 
cbb8f1a
25ff5ac
cbb8f1a
25ff5ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cbb8f1a
25ff5ac

import re
import pandas as pd
from langchain_community.document_loaders import PyMuPDFLoader
import gradio as gr
import os

# Pattern for one purchase-order line item. It is assembled from adjacent raw
# string pieces so each capture group can be annotated; the pieces concatenate
# into a single pattern. The seven capture groups are consumed positionally by
# extract_data_from_pdf (match[0] .. match[6]).
item_regex = re.compile(
    r"(\d+)\s+"                        # group 1: serial number ("Sl No")
    r"([A-Z\s]+)\s+"                   # group 2: upper-case material description
    r"Material Number:\s+(\d+)\s+"     # group 3: material number
    r"HSN Code:\s*\d+\s*"              # HSN code is matched but not captured
    r"IGST :\s*\d+\s*%\s*"             # IGST percentage is matched but not captured
    r"NO\s*(\d+)\s+"                   # literal "NO", then group 4: quantity
    r"(\d+)\s+"                        # group 5: delivered quantity ("Dely Qty")
    r"([\d.]+)\s+"                     # group 6: unit rate
    r"([\d.]+)"                        # group 7: line value
)

def _resolve_pdf_path(pdf_file):
    """Return the filesystem path of the uploaded PDF.

    Accepts a path (``str`` or ``os.PathLike``) or an upload object that
    exposes its path as ``.name``. Raises ``ValueError`` when nothing was
    uploaded so the caller can report a readable message.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    # Check for path-like values first: pathlib.Path also has a ``.name``
    # attribute, but it holds only the final path component.
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)
    return pdf_file.name


def _fresh_output_path(filename):
    """Return ``filename`` located inside a newly created temp directory.

    A private directory per call keeps the download name stable while making
    sure two requests never write to the same file.
    NOTE: the directory is not removed here; cleanup is left to the OS/host.
    """
    import tempfile  # local import: not part of the module's import header

    return os.path.join(tempfile.mkdtemp(prefix="po_extract_"), filename)


def _rows_from_text(text):
    """Return one record dict per line item that ``item_regex`` finds in ``text``."""
    return [
        {
            "Sl No": match[0],
            "Material Description": match[1].strip(),
            "Material Number": match[2],
            "Quantity": match[3],
            "Dely Qty": match[4],
            "Unit Rate": match[5],
            "Value": match[6],
        }
        for match in item_regex.findall(text)
    ]


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order line items from a PDF and write them to a CSV.

    Args:
        pdf_file: The uploaded PDF, either as a filesystem path or as an
            object whose ``.name`` attribute is the path.

    Returns:
        Path of a CSV file. On success it holds one row per matched line
        item, or a single ``Message`` row when nothing matched. If anything
        fails, it is instead an ``error_message.csv`` with a single ``Error``
        row containing the exception text. Every call writes to its own
        fresh temporary directory.
    """
    try:
        # Resolve the path on its own line so a missing upload is reported
        # with a clear message before the loader is touched.
        source_path = _resolve_pdf_path(pdf_file)

        # Load and parse the PDF document
        loader = PyMuPDFLoader(source_path)
        documents = loader.load()

        # Print document content for debugging
        print("Extracted Document Content for Debugging:")
        for doc in documents:
            print(doc.page_content)

        # Collect line items from every loaded document
        data = []
        for doc in documents:
            data.extend(_rows_from_text(doc.page_content))

        if data:
            df = pd.DataFrame(data)
        else:
            # Still hand back a CSV so the UI has something to download
            df = pd.DataFrame([{"Message": "No matching data found in the PDF."}])

        csv_path = _fresh_output_path("extracted_po_data.csv")
        df.to_csv(csv_path, index=False)
        return csv_path

    except Exception as e:
        # Top-level boundary for the UI callback: log the failure and return
        # a CSV carrying the error text instead of raising into the UI.
        print(f"Error processing PDF: {e}")
        error_path = _fresh_output_path("error_message.csv")
        error_df = pd.DataFrame([{"Error": str(e)}])
        error_df.to_csv(error_path, index=False)
        return error_path

# Web UI definition: a single file-upload input is passed to
# extract_data_from_pdf, and the CSV path it returns is offered as a
# downloadable file.
interface = gr.Interface(
    title="PO PDF to CSV Converter",
    description="Upload a Purchase Order PDF to extract fields into a CSV file.",
    fn=extract_data_from_pdf,
    inputs="file",
    outputs="file",
)

if __name__ == "__main__":
    # Turn on request queueing (max_size=10), then start the app and ask
    # Gradio for a public share link.
    queued_app = interface.queue(max_size=10)
    queued_app.launch(share=True)