"""Gradio app that extracts purchase-order line items from a PDF into Excel."""

import logging
import os
import re
import tempfile

import gradio as gr
import pandas as pd
from langchain_community.document_loaders import PyMuPDFLoader  # Updated import

logger = logging.getLogger(__name__)

# Regex for a single line item. The seven capture groups are, in order, the
# values stored under the keys listed in ``_ITEM_COLUMNS`` below.
item_regex = re.compile(
    r"(\d+)\s+([A-Z\s]+)\s+Material Number:\s+(\d+)\s+HSN Code:\d+\s+IGST :\s+\d+ %\s+NO\s+(\d+)\s+(\d+)\s+([\d.]+)\s+([\d.]+)"
)

# Output column names, positionally aligned with the groups of ``item_regex``.
_ITEM_COLUMNS = (
    "Sl No",
    "Material Description",
    "Material Number",
    "Quantity",
    "Dely Qty",
    "Unit Rate",
    "Value",
)


def _resolve_pdf_path(pdf_file):
    """Return the filesystem path for an uploaded file value.

    Accepts either a path (``str`` / ``os.PathLike``) or an object exposing the
    path through a ``.name`` attribute (e.g. a tempfile wrapper).

    Raises:
        ValueError: If ``pdf_file`` is ``None`` (nothing was uploaded).
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)
    return pdf_file.name


def _parse_items(text):
    """Return one dict per ``item_regex`` match found in ``text``."""
    rows = []
    for match in item_regex.findall(text):
        row = dict(zip(_ITEM_COLUMNS, match))
        # The description group matches ``[A-Z\s]+`` and therefore picks up
        # line breaks and padding from the PDF text; collapse them to single
        # spaces so the spreadsheet cell is clean.
        row["Material Description"] = " ".join(row["Material Description"].split())
        rows.append(row)
    return rows


def _write_excel(df, prefix):
    """Write ``df`` to a uniquely named temporary ``.xlsx`` file; return its path.

    A unique file per call prevents concurrent requests from overwriting each
    other's output, which a fixed path would allow.
    """
    fd, path = tempfile.mkstemp(prefix=prefix, suffix=".xlsx")
    # Only the reserved name is needed; pandas reopens the file by path.
    os.close(fd)
    df.to_excel(path, index=False)
    return path


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order line items from a PDF into an Excel workbook.

    Args:
        pdf_file: The uploaded file, given as a path or as an object whose
            ``.name`` attribute holds the path.

    Returns:
        Path to an ``.xlsx`` file. It contains one row per matched line item,
        or a single ``Message`` row when nothing matched, or a single
        ``Error`` row when processing failed.
    """
    try:
        # Load and parse the PDF document.
        loader = PyMuPDFLoader(_resolve_pdf_path(pdf_file))
        documents = loader.load()

        data = []
        for doc in documents:
            # Page text is logged at DEBUG only, so document contents do not
            # end up on stdout unless debugging is explicitly enabled.
            logger.debug("Extracted page content: %s", doc.page_content)
            data.extend(_parse_items(doc.page_content))

        if not data:
            # Still hand back a workbook so the user gets an explanation.
            df = pd.DataFrame([{"Message": "No matching data found in the PDF."}])
        else:
            df = pd.DataFrame(data)

        return _write_excel(df, "extracted_po_data_")

    except Exception as e:
        # Top-level boundary for the UI callback: record the traceback and
        # return a workbook describing the failure instead of crashing.
        logger.exception("Error processing PDF")
        error_df = pd.DataFrame([{"Error": str(e)}])
        return _write_excel(error_df, "error_message_")


# Gradio interface for uploading a PDF and downloading the resulting Excel file.
interface = gr.Interface(
    fn=extract_data_from_pdf,
    inputs="file",
    outputs="file",
    title="PO PDF to Excel Converter",
    description="Upload a Purchase Order PDF to extract fields into an Excel file.",
)

if __name__ == "__main__":
    # Enable queueing and set a shareable link.
    interface.queue(max_size=10).launch(share=True)