Spaces:

jithenderchoudary
/

Langch

Sleeping

App Files Files Community

jithenderchoudary committed on Nov 14, 2024

Commit

0e57005

verified ·

1 Parent(s): ec527ae

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -0

app.py CHANGED Viewed

	@@ -0,0 +1,51 @@

+import re
+import pandas as pd
+from langchain.document_loaders import PyMuPDFLoader
+import gradio as gr
+# Compiled pattern matching one purchase-order line item in a page's text.
+# Capture groups, in order:
+#   1. leading run of digits (stored downstream as "Sl No")
+#   2. run of uppercase letters/whitespace up to "Material Number:"
+#      (stored as "Material Description"; may include line breaks)
+#   3. digits following "Material Number:"
+#   4, 5. two digit-only fields after the literal "NO"
+#         (stored as "Quantity" and "Dely Qty")
+#   6, 7. two digit-and-dot fields (stored as "Unit Rate" and "Value")
+# The "HSN Code:" and "IGST :" values must be present for a match but are
+# not captured.
+item_regex = re.compile(
+    r"(\d+)\s+([A-Z\s]+)\s+Material Number:\s+(\d+)\s+HSN Code:\d+\s+IGST :\s+\d+ %\s+NO\s+(\d+)\s+(\d+)\s+([\d.]+)\s+([\d.]+)"
+)
+def extract_data_from_pdf(pdf_file):
+    # Load and parse the PDF document
+    loader = PyMuPDFLoader(pdf_file.name)
+    documents = loader.load()
+    # Initialize list to store extracted data
+    data = []
+    # Iterate over each document page and search for items using regex
+    for doc in documents:
+        matches = item_regex.findall(doc.page_content)
+        for match in matches:
+            data.append({
+                "Sl No": match[0],
+                "Material Description": match[1],
+                "Material Number": match[2],
+                "Quantity": match[3],
+                "Dely Qty": match[4],
+                "Unit Rate": match[5],
+                "Value": match[6]
+            })
+    # Create a DataFrame
+    df = pd.DataFrame(data)
+    # Save to Excel
+    excel_path = "/tmp/extracted_po_data.xlsx"
+    df.to_excel(excel_path, index=False)
+    return excel_path
+# Gradio interface for uploading PDF and downloading Excel
+interface = gr.Interface(
+    fn=extract_data_from_pdf,
+    inputs="file",
+    outputs="file",
+    title="PO PDF to Excel Converter",
+    description="Upload a Purchase Order PDF to extract fields into an Excel file."
+)
+if __name__ == "__main__":
+    interface.launch()