Langc1 / app.py
jithenderchoudary's picture
Update app.py
cbb8f1a verified
import os
import re
import tempfile

import gradio as gr
import pandas as pd
from langchain_community.document_loaders import PyMuPDFLoader
# Pattern for a single purchase-order line item.
# The adjacent raw-string pieces below are joined by the parser into one
# pattern string; each piece covers one field of the line. The seven capture
# groups are consumed positionally by extract_data_from_pdf, which labels them
# "Sl No", "Material Description", "Material Number", "Quantity", "Dely Qty",
# "Unit Rate" and "Value" in that order.
item_regex = re.compile(
    r"(\d+)\s+"                        # group 1: leading integer
    r"([A-Z\s]+)\s+"                   # group 2: upper-case words / whitespace
    r"Material Number:\s+(\d+)\s+"     # group 3: digits after the label
    r"HSN Code:\s*\d+\s*"              # HSN code is matched but not captured
    r"IGST :\s*\d+\s*%\s*"             # IGST percentage is matched but not captured
    r"NO\s*(\d+)\s+"                   # group 4: integer following the "NO" token
    r"(\d+)\s+"                        # group 5: integer
    r"([\d.]+)\s+"                     # group 6: digits and dots
    r"([\d.]+)"                        # group 7: digits and dots
)
def _resolve_pdf_path(pdf_file):
    """Return the filesystem path of an uploaded PDF.

    Accepts a path (``str`` or ``os.PathLike``) or any object that exposes
    its path through a ``name`` attribute, such as a temporary-file wrapper.

    Raises:
        ValueError: If *pdf_file* is ``None`` (nothing was uploaded).
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    # Check for real paths first: pathlib objects also have ``.name``, but
    # there it is only the base name, not the full path.
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)
    return pdf_file.name


def _write_csv(frame, prefix):
    """Write *frame* to a uniquely named CSV in the temp directory.

    A fresh file is created on every call so that overlapping requests never
    overwrite each other's output. Returns the path of the written file.
    """
    fd, csv_path = tempfile.mkstemp(prefix=prefix, suffix=".csv")
    os.close(fd)  # pandas reopens the file by path
    frame.to_csv(csv_path, index=False)
    return csv_path


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order line items from a PDF into a CSV file.

    Every page loaded by ``PyMuPDFLoader`` is scanned with ``item_regex``;
    each match becomes one CSV row with the columns "Sl No",
    "Material Description", "Material Number", "Quantity", "Dely Qty",
    "Unit Rate" and "Value". The raw page text is also printed to stdout
    for debugging.

    Args:
        pdf_file: The uploaded PDF, either as a path or as an object whose
            ``name`` attribute holds the path.

    Returns:
        Path of a CSV file. It contains the extracted rows, a single
        "Message" row when nothing matched, or a single "Error" row when
        processing failed (including when no file was supplied).
    """
    try:
        pdf_path = _resolve_pdf_path(pdf_file)
        documents = PyMuPDFLoader(pdf_path).load()

        # Dump the raw text so pattern mismatches can be diagnosed from logs.
        print("Extracted Document Content for Debugging:")
        for doc in documents:
            print(doc.page_content)

        data = []
        for doc in documents:
            for match in item_regex.findall(doc.page_content):
                data.append({
                    "Sl No": match[0],
                    "Material Description": match[1].strip(),
                    "Material Number": match[2],
                    "Quantity": match[3],
                    "Dely Qty": match[4],
                    "Unit Rate": match[5],
                    "Value": match[6],
                })

        if data:
            df = pd.DataFrame(data)
        else:
            # Still return a CSV so the user gets an explanation to download.
            df = pd.DataFrame([{"Message": "No matching data found in the PDF."}])
        return _write_csv(df, "extracted_po_data_")
    except Exception as e:
        # UI boundary: log the failure and hand back a CSV describing it
        # instead of letting the request crash.
        print(f"Error processing PDF: {e}")
        error_df = pd.DataFrame([{"Error": str(e)}])
        return _write_csv(error_df, "error_message_")
# Web UI: a single file upload (the PDF) goes in, and the file path returned
# by extract_data_from_pdf (the CSV) comes back as a download.
interface = gr.Interface(
    # Presentation
    title="PO PDF to CSV Converter",
    description="Upload a Purchase Order PDF to extract fields into a CSV file.",
    # Wiring
    fn=extract_data_from_pdf,
    inputs="file",
    outputs="file",
)
if __name__ == "__main__":
    # Turn on request queueing, capped at 10 pending requests ...
    queued_interface = interface.queue(max_size=10)
    # ... then start the app and ask Gradio for a shareable link.
    queued_interface.launch(share=True)