# Spaces: Sleeping
import os
import re
import tempfile

import gradio as gr
import pandas as pd
from langchain_community.document_loaders import PyMuPDFLoader
# Pattern for one purchase-order line item, assembled from one fragment per
# field so each piece can be read on its own. Seven groups are captured, in
# order: serial number, description (uppercase letters and whitespace only),
# material number, quantity, delivered quantity, unit rate and value.
# The HSN code and IGST percentage are matched but not captured.
_ITEM_PATTERN = "".join(
    (
        r"(\d+)\s+",                      # serial number
        r"([A-Z\s]+)\s+",                 # material description
        r"Material Number:\s+(\d+)\s+",   # material number
        r"HSN Code:\s*\d+\s*",            # HSN code (not captured)
        r"IGST :\s*\d+\s*%\s*",           # IGST percentage (not captured)
        r"NO\s*(\d+)\s+",                 # quantity
        r"(\d+)\s+",                      # delivered quantity
        r"([\d.]+)\s+",                   # unit rate
        r"([\d.]+)",                      # value
    )
)
item_regex = re.compile(_ITEM_PATTERN)
# Output column names, in the same order as the capture groups of item_regex.
_PO_COLUMNS = (
    "Sl No",
    "Material Description",
    "Material Number",
    "Quantity",
    "Dely Qty",
    "Unit Rate",
    "Value",
)


def _resolve_pdf_path(pdf_file):
    """Return the filesystem path of an upload given as a path or a file object."""
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    # Some Gradio versions hand the upload over as a plain path string,
    # others as a file-like wrapper exposing the path via ``.name``.
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)
    return pdf_file.name


def _write_csv(df, filename):
    """Write *df* as *filename* inside a fresh temporary directory; return the path."""
    # A new directory per call keeps concurrent requests from overwriting each
    # other's output while preserving the download file name.
    # NOTE(review): these directories are not removed here; confirm the host
    # cleans its temp area or add cleanup if disk usage matters.
    out_dir = tempfile.mkdtemp(prefix="po_extract_")
    csv_path = os.path.join(out_dir, filename)
    df.to_csv(csv_path, index=False)
    return csv_path


def extract_data_from_pdf(pdf_file):
    """Extract purchase-order line items from a PDF and save them as a CSV.

    Every page loaded from the PDF is searched with ``item_regex``; each match
    becomes one row with the columns listed in ``_PO_COLUMNS``.

    Args:
        pdf_file: The uploaded PDF, either as a filesystem path (``str`` or
            ``os.PathLike``) or as an object whose ``.name`` attribute is the
            path.

    Returns:
        Path to a CSV file. On success it is ``extracted_po_data.csv`` holding
        the extracted rows, or a single ``Message`` row when nothing matched.
        If any step fails, it is ``error_message.csv`` with a single ``Error``
        row holding the exception text. Each call writes into its own
        temporary directory.
    """
    try:
        pdf_path = _resolve_pdf_path(pdf_file)
        documents = PyMuPDFLoader(pdf_path).load()

        # Dump the raw page text to stdout to help tune the regex.
        print("Extracted Document Content for Debugging:")
        data = []
        for doc in documents:
            print(doc.page_content)
            for match in item_regex.findall(doc.page_content):
                row = dict(zip(_PO_COLUMNS, match))
                row["Material Description"] = row["Material Description"].strip()
                data.append(row)

        if data:
            df = pd.DataFrame(data)
        else:
            # Still hand back a CSV so the user gets an explicit message.
            df = pd.DataFrame([{"Message": "No matching data found in the PDF."}])
        return _write_csv(df, "extracted_po_data.csv")
    except Exception as e:
        # Top-level UI boundary: report the failure as a downloadable CSV
        # containing the error text instead of raising into Gradio.
        print(f"Error processing PDF: {e}")
        return _write_csv(pd.DataFrame([{"Error": str(e)}]), "error_message.csv")
# Gradio UI definition: a single file upload goes in, the generated CSV comes
# back as a downloadable file.
_interface_options = {
    "fn": extract_data_from_pdf,
    "inputs": "file",
    "outputs": "file",
    "title": "PO PDF to CSV Converter",
    "description": "Upload a Purchase Order PDF to extract fields into a CSV file.",
}
interface = gr.Interface(**_interface_options)
if __name__ == "__main__":
    # Turn on request queueing (at most 10 pending requests), then start the
    # app with a public share link.
    queued_interface = interface.queue(max_size=10)
    queued_interface.launch(share=True)