Langch / app.py
jithenderchoudary's picture
Update app.py
db1150d verified
import os
import re
import tempfile

import gradio as gr
import pandas as pd
from langchain_community.document_loaders import PyMuPDFLoader # Updated import
# Pattern matching one purchase-order line item in the extracted PDF text.
# It is written as adjacent raw-string fragments, which Python joins into a
# single pattern at compile time; each fragment is annotated with the piece
# of the line it consumes. The seven capture groups are, in order:
# Sl No, Material Description, Material Number, Quantity, Dely Qty,
# Unit Rate, Value.
item_regex = re.compile(
    r"(\d+)\s+"                     # group 1: serial number
    r"([A-Z\s]+)\s+"                # group 2: upper-case description (may span whitespace)
    r"Material Number:\s+(\d+)\s+"  # group 3: material number
    r"HSN Code:\d+\s+"              # HSN code is matched but not captured
    r"IGST :\s+\d+ %\s+"            # IGST percentage is matched but not captured
    r"NO\s+"                        # literal "NO" token preceding the quantities
    r"(\d+)\s+"                     # group 4: quantity
    r"(\d+)\s+"                     # group 5: delivery quantity
    r"([\d.]+)\s+"                  # group 6: unit rate
    r"([\d.]+)"                     # group 7: value
)
def _resolve_upload_path(pdf_file):
    """Return the filesystem path of an uploaded file.

    The upload may be handed over either as a plain path (``str`` or
    ``os.PathLike``) or as an object exposing the path via a ``name``
    attribute; both forms are accepted.

    Raises:
        ValueError: If no file was supplied (``pdf_file`` is ``None``).
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)
    return pdf_file.name


def _write_excel(df, prefix):
    """Write *df* to a uniquely named temporary .xlsx file and return its path."""
    # Reserve a unique name, then close the handle before pandas writes to it:
    # a fixed shared path would let concurrent requests overwrite each other.
    with tempfile.NamedTemporaryFile(
        prefix=prefix, suffix=".xlsx", delete=False
    ) as handle:
        excel_path = handle.name
    df.to_excel(excel_path, index=False)
    return excel_path


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order line items from a PDF into an Excel file.

    Every page of the PDF is loaded with ``PyMuPDFLoader`` and scanned with
    ``item_regex``; each match becomes one row with the columns
    "Sl No", "Material Description", "Material Number", "Quantity",
    "Dely Qty", "Unit Rate" and "Value".

    Args:
        pdf_file: The uploaded PDF, either as a path or as an object whose
            ``name`` attribute holds the path.

    Returns:
        Path to an .xlsx file. It contains the extracted rows, a single
        "Message" row when nothing matched, or a single "Error" row when
        processing failed. Each call writes to its own temporary file.
    """
    try:
        # Load and parse the PDF document
        loader = PyMuPDFLoader(_resolve_upload_path(pdf_file))
        documents = loader.load()

        # Print document content for debugging
        print("Extracted Document Content for Debugging:")
        for doc in documents:
            print(doc.page_content)

        # Collect one row per regex match across all pages
        data = []
        for doc in documents:
            for match in item_regex.findall(doc.page_content):
                sl_no, description, material_number = match[0], match[1], match[2]
                quantity, dely_qty, unit_rate, value = match[3:7]
                data.append({
                    "Sl No": sl_no,
                    # The description group also matches whitespace, so it can
                    # carry line breaks / trailing blanks from the PDF text;
                    # collapse them into single spaces.
                    "Material Description": " ".join(description.split()),
                    "Material Number": material_number,
                    "Quantity": quantity,
                    "Dely Qty": dely_qty,
                    "Unit Rate": unit_rate,
                    "Value": value,
                })

        if data:
            df = pd.DataFrame(data)
        else:
            # Still hand back a file so the user sees why it is empty
            df = pd.DataFrame([{"Message": "No matching data found in the PDF."}])

        return _write_excel(df, "extracted_po_data_")
    except Exception as e:
        # Top-level boundary for the UI callback: log the failure and return
        # a file carrying the error message instead of raising.
        print(f"Error processing PDF: {e}")
        error_df = pd.DataFrame([{"Error": str(e)}])
        return _write_excel(error_df, "error_message_")
# Web UI: a single file upload (the PDF) wired to extract_data_from_pdf,
# with the resulting Excel file offered back as a download.
interface = gr.Interface(
    extract_data_from_pdf,  # fn
    "file",                 # inputs
    "file",                 # outputs
    description="Upload a Purchase Order PDF to extract fields into an Excel file.",
    title="PO PDF to Excel Converter",
)
if __name__ == "__main__":
    # Turn on request queueing (at most 10 waiting requests), then start the
    # app with a shareable link.
    queued_interface = interface.queue(max_size=10)
    queued_interface.launch(share=True)