chmawia committed on
Commit
89fdc36
·
verified ·
1 Parent(s): d9a1982

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -35
app.py CHANGED
@@ -1,9 +1,8 @@
1
  import streamlit as st
2
  import fitz # PyMuPDF
3
- import pandas as pd
4
  import os
5
 
6
- # Streamlit UI
7
  st.set_page_config(page_title="PDF to Structured Data", layout="centered")
8
 
9
  # Custom Styling
@@ -21,15 +20,23 @@ st.markdown(
21
  unsafe_allow_html=True,
22
  )
23
 
 
24
  st.markdown("<h1 style='text-align: center;'>πŸ“„ PDF to Structured Data</h1>", unsafe_allow_html=True)
25
- st.markdown("<p style='text-align: center; color: gray;'>powered by Google DeepMind Gemini 2.0 Flash</p>", unsafe_allow_html=True)
26
 
27
  # File uploader
28
- uploaded_file = st.file_uploader("Drop your PDF here or click to browse", type=["pdf"], help="Maximum file size: 100MB")
 
 
 
 
29
 
30
- # Text input for structure description
31
  st.markdown("<h4 style='color: #4A90E2;'>Describe the structure and type of data you want to extract from the PDF.</h4>", unsafe_allow_html=True)
32
- data_description = st.text_area("Example: Extract all invoice details including invoice number, date, items, prices, and total amount...", "")
 
 
 
33
 
34
  # Function to extract structured data
35
  def extract_text_and_structure(pdf_path):
@@ -37,18 +44,33 @@ def extract_text_and_structure(pdf_path):
37
  structured_data = {"title": "", "headings": [], "paragraphs": []}
38
 
39
  for page in doc:
40
- text = page.get_text("text")
41
  blocks = page.get_text("blocks")
42
 
43
  for block in blocks:
44
  content = block[4].strip()
45
  if not content:
46
- continue
47
-
48
- font_size = page.get_text("dict")["blocks"][blocks.index(block)]["lines"][0]["spans"][0]["size"]
49
- if font_size > 14:
50
- structured_data["headings"].append(content)
51
- else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  structured_data["paragraphs"].append(content)
53
 
54
  if structured_data["headings"]:
@@ -57,25 +79,31 @@ def extract_text_and_structure(pdf_path):
57
  return structured_data
58
 
59
  # Extract Data Button
60
- extract_button = st.button("Extract Data")
61
-
62
- if extract_button and uploaded_file:
63
- with open("temp.pdf", "wb") as f:
64
- f.write(uploaded_file.getbuffer())
65
-
66
- st.success("PDF Uploaded Successfully!")
67
-
68
- extracted_data = extract_text_and_structure("temp.pdf")
69
-
70
- st.subheader("Title")
71
- st.write(extracted_data["title"])
72
-
73
- st.subheader("Headings")
74
- for heading in extracted_data["headings"]:
75
- st.write(f"- {heading}")
76
-
77
- st.subheader("Paragraphs")
78
- for para in extracted_data["paragraphs"]:
79
- st.write(para)
80
-
81
- os.remove("temp.pdf")
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import fitz # PyMuPDF
 
3
  import os
4
 
5
+ # Configure Streamlit page
6
  st.set_page_config(page_title="PDF to Structured Data", layout="centered")
7
 
8
  # Custom Styling
 
20
  unsafe_allow_html=True,
21
  )
22
 
# App title and subtitle, rendered as raw HTML so they can be centered.
st.markdown("<h1 style='text-align: center;'>📄 PDF to Structured Data</h1>", unsafe_allow_html=True)
st.markdown("<p style='text-align: center; color: gray;'>powered by AI</p>", unsafe_allow_html=True)

# File uploader, restricted to PDF files.
# NOTE(review): the help text only *states* a 100MB limit; the effective limit
# comes from Streamlit's `server.maxUploadSize` setting — confirm they agree.
uploaded_file = st.file_uploader(
    "Drop your PDF here or click to browse",
    type=["pdf"],
    help="Maximum file size: 100MB",
)

# User input for data structure description.
# NOTE(review): `data_description` is not referenced by the extraction code
# visible in this change — confirm whether it is consumed elsewhere.
st.markdown("<h4 style='color: #4A90E2;'>Describe the structure and type of data you want to extract from the PDF.</h4>", unsafe_allow_html=True)
data_description = st.text_area(
    "Example: Extract all invoice details including invoice number, date, items, prices, and total amount...",
    "",
)
40
 
41
  # Function to extract structured data
42
  def extract_text_and_structure(pdf_path):
 
44
  structured_data = {"title": "", "headings": [], "paragraphs": []}
45
 
46
  for page in doc:
 
47
  blocks = page.get_text("blocks")
48
 
49
  for block in blocks:
50
  content = block[4].strip()
51
  if not content:
52
+ continue # Skip empty blocks
53
+
54
+ # Get font size safely
55
+ try:
56
+ text_dict = page.get_text("dict")
57
+ block_index = blocks.index(block)
58
+
59
+ block_data = text_dict["blocks"][block_index] if "blocks" in text_dict and block_index < len(text_dict["blocks"]) else {}
60
+
61
+ if "lines" in block_data and block_data["lines"]:
62
+ if "spans" in block_data["lines"][0] and block_data["lines"][0]["spans"]:
63
+ font_size = block_data["lines"][0]["spans"][0]["size"]
64
+ if font_size > 14:
65
+ structured_data["headings"].append(content)
66
+ else:
67
+ structured_data["paragraphs"].append(content)
68
+ else:
69
+ structured_data["paragraphs"].append(content)
70
+ else:
71
+ structured_data["paragraphs"].append(content)
72
+
73
+ except Exception as e:
74
  structured_data["paragraphs"].append(content)
75
 
76
  if structured_data["headings"]:
 
79
  return structured_data
80
 
# Extract Data Button
#
# On click: persist the uploaded PDF to a temporary file, run
# `extract_text_and_structure` on it, then render whichever of the
# title / headings / paragraphs sections came back non-empty.
if st.button("Extract Data", use_container_width=True):
    if uploaded_file is None:
        # Guard clause: nothing to process without an upload.
        st.warning("⚠️ Please upload a PDF file before extracting data.")
    else:
        # Imported here so this block stays self-contained.
        import tempfile

        with st.spinner("Processing your PDF..."):
            # A uniquely named temp file avoids collisions between concurrent
            # sessions that would otherwise all write to one shared "temp.pdf".
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                tmp.write(uploaded_file.getbuffer())
                temp_path = tmp.name

            # The file is closed before extraction so it can be reopened by
            # path; `finally` guarantees cleanup even if extraction raises.
            try:
                extracted_data = extract_text_and_structure(temp_path)
            finally:
                os.remove(temp_path)

        # Display extracted data
        st.success("✅ Extraction Complete!")

        if extracted_data["title"]:
            st.subheader("📌 Title")
            st.write(extracted_data["title"])

        if extracted_data["headings"]:
            st.subheader("📑 Headings")
            for heading in extracted_data["headings"]:
                st.write(f"- {heading}")

        if extracted_data["paragraphs"]:
            st.subheader("📖 Paragraphs")
            for para in extracted_data["paragraphs"]:
                st.write(para)