"""Gradio app that extracts purchase-order line items from a PDF into a CSV."""

import logging
import os
import re
import tempfile

import gradio as gr
import pandas as pd
from langchain_community.document_loaders import PyMuPDFLoader

logger = logging.getLogger(__name__)

# Updated regex pattern to capture varied data formats.
# Written as adjacent raw strings; the concatenated pattern is unchanged.
# Capture groups, in order, correspond to _ITEM_COLUMNS below.
item_regex = re.compile(
    r"(\d+)\s+([A-Z\s]+)\s+"          # Sl No, Material Description
    r"Material Number:\s+(\d+)\s+"    # Material Number
    r"HSN Code:\s*\d+\s*"             # HSN code (matched, not captured)
    r"IGST :\s*\d+\s*%\s*"            # IGST percentage (matched, not captured)
    r"NO\s*(\d+)\s+(\d+)\s+"          # Quantity, Dely Qty
    r"([\d.]+)\s+([\d.]+)"            # Unit Rate, Value
)

# CSV column names, one per capture group of ``item_regex``.
_ITEM_COLUMNS = (
    "Sl No",
    "Material Description",
    "Material Number",
    "Quantity",
    "Dely Qty",
    "Unit Rate",
    "Value",
)


def _resolve_pdf_path(pdf_file):
    """Return the filesystem path for an uploaded file.

    Accepts either a path (``str`` / ``os.PathLike``) or an object exposing
    the path as ``.name`` (e.g. a temporary-file wrapper), so the function
    works whichever form the UI layer hands over.

    Raises:
        ValueError: if no file was supplied (``pdf_file`` is ``None``).
    """
    if pdf_file is None:
        raise ValueError("No PDF file was uploaded.")
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)
    return pdf_file.name


def _extract_rows(page_texts):
    """Return one dict per ``item_regex`` match found in *page_texts*.

    Each dict maps the names in ``_ITEM_COLUMNS`` to the captured strings;
    the material description is stripped of surrounding whitespace because
    its character class (``[A-Z\\s]+``) can swallow trailing spaces/newlines.
    """
    rows = []
    for text in page_texts:
        for match in item_regex.findall(text):
            row = dict(zip(_ITEM_COLUMNS, match))
            row["Material Description"] = row["Material Description"].strip()
            rows.append(row)
    return rows


def _write_csv(df, filename):
    """Write *df* to *filename* inside a fresh temporary directory.

    A new directory per call keeps the original file name while ensuring
    concurrent requests never overwrite each other's output. The directory
    is not removed here; the caller needs the file to outlive this function.

    Returns:
        The full path of the written CSV file.
    """
    out_dir = tempfile.mkdtemp(prefix="po_pdf_to_csv_")
    csv_path = os.path.join(out_dir, filename)
    df.to_csv(csv_path, index=False)
    return csv_path


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order line items from a PDF and save them as CSV.

    Args:
        pdf_file: The uploaded PDF, either as a path or as an object whose
            ``.name`` attribute is the path.

    Returns:
        Path to a CSV file. On success it holds one row per matched item
        (columns as in ``_ITEM_COLUMNS``), or a single ``Message`` row when
        nothing matched. If processing fails, the path points to a CSV with
        a single ``Error`` row describing the failure instead.
    """
    try:
        # Load and parse the PDF document (one document per page).
        pdf_path = _resolve_pdf_path(pdf_file)
        documents = PyMuPDFLoader(pdf_path).load()

        page_texts = [doc.page_content for doc in documents]
        # Page text is only emitted at DEBUG level so document contents are
        # not dumped to the console in normal operation.
        for text in page_texts:
            logger.debug("Extracted page content:\n%s", text)

        rows = _extract_rows(page_texts)
        if rows:
            df = pd.DataFrame(rows)
        else:
            # Still return a CSV so the user gets an explanation to download.
            df = pd.DataFrame([{"Message": "No matching data found in the PDF."}])

        return _write_csv(df, "extracted_po_data.csv")

    except Exception as e:
        # Top-level boundary for the UI callback: log the failure and hand
        # back a CSV containing the error message rather than crashing.
        logger.exception("Error processing PDF: %s", e)
        error_df = pd.DataFrame([{"Error": str(e)}])
        return _write_csv(error_df, "error_message.csv")


# Gradio interface for uploading PDF and downloading CSV
interface = gr.Interface(
    fn=extract_data_from_pdf,
    inputs="file",
    outputs="file",
    title="PO PDF to CSV Converter",
    description="Upload a Purchase Order PDF to extract fields into a CSV file.",
)

if __name__ == "__main__":
    # Enable queueing and set a shareable link
    interface.queue(max_size=10).launch(share=True)