jithenderchoudary committed on
Commit
b1003d3
·
verified ·
1 Parent(s): 1af2170

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -12
app.py CHANGED
@@ -1,15 +1,59 @@
1
  import fitz # PyMuPDF
2
  import pandas as pd
 
 
 
3
 
4
- # Load PDF and read text
5
- pdf_file = 'path_to_your_pdf.pdf'
6
- data = []
7
- with fitz.open(pdf_file) as pdf:
8
- for page in pdf:
9
- text = page.get_text()
10
- # Parsing logic here for PO items and details
11
- # Append parsed data as dict to data list
12
-
13
- # Convert to DataFrame
14
- df = pd.DataFrame(data)
15
- df.to_excel('output_po_data.xlsx', index=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import fitz # PyMuPDF
2
  import pandas as pd
3
+ import gradio as gr
4
+ import tempfile
5
+ import os
6
 
7
def extract_po_to_excel(pdf_file):
    """Extract purchase-order lines from a PDF and save them as an Excel file.

    Every text line containing ``"Pos."`` or ``"Item Code"`` is collected
    from all pages, split on whitespace, and its first two tokens are stored
    as the ``Position`` and ``Item Code`` columns. This is sample parsing
    logic; adjust the keywords and the field mapping to the real PO layout.

    Args:
        pdf_file: Either a filesystem path (``str`` or ``os.PathLike``) or an
            object that exposes the path through a ``name`` attribute.

    Returns:
        Path of a newly created temporary ``.xlsx`` file. The file is created
        with ``delete=False``, so it is not removed automatically.

    Raises:
        ValueError: If ``pdf_file`` is ``None`` (nothing was supplied).
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    # Accept plain paths as well as objects carrying the path in ``.name``.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    # Collect candidate lines from every page of the document.
    matched_lines = []
    with fitz.open(pdf_path) as pdf:
        for page in pdf:
            for line in page.get_text("text").splitlines():
                # Only keep lines with known keywords (sample logic).
                if "Pos." in line or "Item Code" in line:
                    matched_lines.append(line)

    # Basic whitespace split; lines with fewer than two tokens are skipped.
    rows = []
    for line in matched_lines:
        parts = line.split()
        if len(parts) > 1:
            rows.append({
                "Position": parts[0],
                "Item Code": parts[1],
                # Extract other fields as needed
            })

    # Explicit columns keep the header row even when nothing matched.
    df = pd.DataFrame(rows, columns=["Position", "Item Code"])

    # Reserve a unique path, then release the handle *before* pandas opens
    # the same path for writing.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        excel_path = temp_file.name

    try:
        df.to_excel(excel_path, index=False)
    except Exception:
        # Do not leave a half-written temp file behind if the export fails.
        os.unlink(excel_path)
        raise

    return excel_path
44
+
45
def main(pdf_file):
    """Convert the given PO PDF and return the path of the Excel file.

    Used as the ``fn`` callback of the Gradio interface; it simply delegates
    to ``extract_po_to_excel``.
    """
    return extract_po_to_excel(pdf_file)
48
+
49
# Gradio interface: one PDF upload in, one downloadable Excel file out.
# Components come from the top-level namespace (gr.File) because the legacy
# gr.inputs / gr.outputs namespaces were deprecated in Gradio 3 and removed
# in Gradio 4.
interface = gr.Interface(
    fn=main,
    inputs=gr.File(label="Upload PO PDF"),
    outputs=gr.File(label="Download Excel File"),
    title="PDF to Excel Converter",
    description="Upload a PO PDF file, and download it as an Excel sheet.",
)

# Only start the web server when run as a script, not when imported.
if __name__ == "__main__":
    interface.launch()