Spaces:
Sleeping
Sleeping
File size: 2,463 Bytes
25ff5ac cbb8f1a 25ff5ac cbb8f1a 25ff5ac cbb8f1a 25ff5ac | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 | import re
import pandas as pd
from langchain_community.document_loaders import PyMuPDFLoader
import gradio as gr
import os
# Pattern for a single purchase-order line item. It is written as adjacent
# raw-string pieces (joined by Python at compile time into one pattern) so
# that each field can be annotated. The seven capture groups are consumed
# positionally by extract_data_from_pdf.
item_regex = re.compile(
    r"(\d+)\s+"                      # group 1: "Sl No" (digits)
    r"([A-Z\s]+)\s+"                 # group 2: "Material Description" (uppercase letters/whitespace only)
    r"Material Number:\s+(\d+)\s+"   # group 3: "Material Number"
    r"HSN Code:\s*\d+\s*"            # HSN code is matched but not captured
    r"IGST :\s*\d+\s*%\s*"           # IGST percentage is matched but not captured
    r"NO\s*(\d+)\s+"                 # literal "NO" (presumably a unit token - confirm), then group 4: "Quantity"
    r"(\d+)\s+"                      # group 5: "Dely Qty"
    r"([\d.]+)\s+"                   # group 6: "Unit Rate" (digits and dots)
    r"([\d.]+)"                      # group 7: "Value" (digits and dots)
)
def extract_data_from_pdf(pdf_file):
try:
# Load and parse the PDF document
loader = PyMuPDFLoader(pdf_file.name)
documents = loader.load()
# Print document content for debugging
print("Extracted Document Content for Debugging:")
for doc in documents:
print(doc.page_content)
# Initialize list to store extracted data
data = []
# Iterate over each document page and search for items using regex
for doc in documents:
matches = item_regex.findall(doc.page_content)
for match in matches:
data.append({
"Sl No": match[0],
"Material Description": match[1].strip(),
"Material Number": match[2],
"Quantity": match[3],
"Dely Qty": match[4],
"Unit Rate": match[5],
"Value": match[6]
})
# Check if any data was extracted
if not data:
# Create a DataFrame with an error message if no data is found
df = pd.DataFrame([{"Message": "No matching data found in the PDF."}])
else:
# Create DataFrame with extracted data
df = pd.DataFrame(data)
# Save to CSV
csv_path = "/tmp/extracted_po_data.csv"
df.to_csv(csv_path, index=False)
return csv_path
except Exception as e:
# Log error and return an empty file
print(f"Error processing PDF: {e}")
error_path = "/tmp/error_message.csv"
error_df = pd.DataFrame([{"Error": str(e)}])
error_df.to_csv(error_path, index=False)
return error_path
# Web UI wiring: one uploaded file goes into extract_data_from_pdf and the
# CSV path it returns is offered back as a downloadable file.
interface = gr.Interface(
    title="PO PDF to CSV Converter",
    description="Upload a Purchase Order PDF to extract fields into a CSV file.",
    fn=extract_data_from_pdf,
    inputs="file",
    outputs="file",
)
if __name__ == "__main__":
    # Turn on request queueing (max_size=10), then start the app and
    # request a shareable link.
    queued_app = interface.queue(max_size=10)
    queued_app.launch(share=True)
|