jithenderchoudary committed on
Commit
0e57005
·
verified ·
1 Parent(s): ec527ae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -0
app.py CHANGED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import pandas as pd
3
+ from langchain.document_loaders import PyMuPDFLoader
4
+ import gradio as gr
5
+
6
# Pattern for one purchase-order line item in the text extracted from the PDF.
# Capture groups, in order:
#   1. serial number
#   2. material description (upper-case letters and whitespace)
#   3. material number
#   4. quantity
#   5. delivered quantity
#   6. unit rate
#   7. value
# The HSN code and the IGST percentage are matched but not captured.
_ITEM_PATTERN = r"(\d+)\s+([A-Z\s]+)\s+Material Number:\s+(\d+)\s+HSN Code:\d+\s+IGST :\s+\d+ %\s+NO\s+(\d+)\s+(\d+)\s+([\d.]+)\s+([\d.]+)"

# Compiled once at import time so every page scan reuses the same object.
item_regex = re.compile(_ITEM_PATTERN)
10
+
11
def extract_data_from_pdf(pdf_file):
    """Extract purchase-order line items from a PDF into an Excel workbook.

    Each page of the PDF is loaded with ``PyMuPDFLoader`` and scanned with the
    module-level ``item_regex``; every match becomes one spreadsheet row.

    Args:
        pdf_file: The uploaded PDF, given either as a filesystem path
            (``str`` / ``os.PathLike``) or as a file-like wrapper that exposes
            its path through a ``.name`` attribute.

    Returns:
        Path of the generated ``extracted_po_data.xlsx`` file. The workbook
        always contains the header row, even when no line item was found.

    Raises:
        ValueError: If ``pdf_file`` is ``None`` (nothing was uploaded).
    """
    import os
    import tempfile

    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    # The upload may arrive as a plain path or as a tempfile-style object.
    # Check for path-likes first: pathlib.Path also has ``.name``, but there
    # it is only the base name, not the full path.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    # Load and parse the PDF document (one document per page).
    documents = PyMuPDFLoader(pdf_path).load()

    # Column order mirrors the order of the capture groups in ``item_regex``.
    columns = [
        "Sl No",
        "Material Description",
        "Material Number",
        "Quantity",
        "Dely Qty",
        "Unit Rate",
        "Value",
    ]

    rows = []
    for doc in documents:
        for match in item_regex.findall(doc.page_content):
            row = dict(zip(columns, match))
            # The description group also matches whitespace, so it can carry
            # line breaks or trailing blanks from the PDF text; collapse them.
            row["Material Description"] = " ".join(
                row["Material Description"].split()
            )
            rows.append(row)

    # Passing ``columns`` keeps the header row when ``rows`` is empty.
    df = pd.DataFrame(rows, columns=columns)

    # Write into a fresh directory per call so concurrent requests never
    # overwrite each other's output, while keeping the download file name.
    output_dir = tempfile.mkdtemp(prefix="po_extract_")
    excel_path = os.path.join(output_dir, "extracted_po_data.xlsx")
    df.to_excel(excel_path, index=False)
    return excel_path
40
+
41
# Web UI: one uploaded file goes in, one downloadable file comes out.
# The first three positional arguments are fn, inputs and outputs.
interface = gr.Interface(
    extract_data_from_pdf,
    "file",
    "file",
    title="PO PDF to Excel Converter",
    description="Upload a Purchase Order PDF to extract fields into an Excel file.",
)

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    interface.launch()