chmawia committed on
Commit
721d64b
·
verified ·
1 Parent(s): 89fdc36

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -64
app.py CHANGED
@@ -1,9 +1,10 @@
1
  import streamlit as st
2
  import fitz # PyMuPDF
3
  import os
 
4
 
5
  # Configure Streamlit page
6
- st.set_page_config(page_title="PDF to Structured Data", layout="centered")
7
 
8
  # Custom Styling
9
  st.markdown(
@@ -12,98 +13,69 @@ st.markdown(
12
  .stFileUploader, .stTextArea, .stButton button {
13
  width: 100% !important;
14
  }
15
- .stTextArea textarea {
16
- height: 80px !important;
17
- }
18
  </style>
19
  """,
20
  unsafe_allow_html=True,
21
  )
22
 
23
  # App title
24
- st.markdown("<h1 style='text-align: center;'>📄 PDF to Structured Data</h1>", unsafe_allow_html=True)
25
- st.markdown("<p style='text-align: center; color: gray;'>powered by AI</p>", unsafe_allow_html=True)
26
 
27
  # File uploader
28
- uploaded_file = st.file_uploader(
29
- "Drop your PDF here or click to browse",
30
- type=["pdf"],
31
- help="Maximum file size: 100MB"
32
- )
33
-
34
- # User input for data structure description
35
- st.markdown("<h4 style='color: #4A90E2;'>Describe the structure and type of data you want to extract from the PDF.</h4>", unsafe_allow_html=True)
36
- data_description = st.text_area(
37
- "Example: Extract all invoice details including invoice number, date, items, prices, and total amount...",
38
- ""
39
- )
40
 
41
- # Function to extract structured data
42
- def extract_text_and_structure(pdf_path):
43
  doc = fitz.open(pdf_path)
44
- structured_data = {"title": "", "headings": [], "paragraphs": []}
 
45
 
46
- for page in doc:
47
- blocks = page.get_text("blocks")
48
-
49
- for block in blocks:
50
- content = block[4].strip()
51
- if not content:
52
- continue # Skip empty blocks
53
 
54
- # Get font size safely
55
- try:
56
- text_dict = page.get_text("dict")
57
- block_index = blocks.index(block)
58
-
59
- block_data = text_dict["blocks"][block_index] if "blocks" in text_dict and block_index < len(text_dict["blocks"]) else {}
60
 
61
- if "lines" in block_data and block_data["lines"]:
62
- if "spans" in block_data["lines"][0] and block_data["lines"][0]["spans"]:
63
- font_size = block_data["lines"][0]["spans"][0]["size"]
64
- if font_size > 14:
65
- structured_data["headings"].append(content)
66
- else:
67
- structured_data["paragraphs"].append(content)
68
- else:
69
- structured_data["paragraphs"].append(content)
70
- else:
71
- structured_data["paragraphs"].append(content)
72
 
73
- except Exception as e:
74
- structured_data["paragraphs"].append(content)
 
75
 
76
- if structured_data["headings"]:
77
- structured_data["title"] = structured_data["headings"][0]
 
78
 
79
- return structured_data
80
 
81
  # Extract Data Button
82
- if st.button("Extract Data", use_container_width=True):
83
  if uploaded_file is not None:
84
  with st.spinner("Processing your PDF..."):
85
  temp_path = "temp.pdf"
86
  with open(temp_path, "wb") as f:
87
  f.write(uploaded_file.getbuffer())
88
 
89
- extracted_data = extract_text_and_structure(temp_path)
90
  os.remove(temp_path)
91
 
92
  # Display extracted data
93
  st.success("✅ Extraction Complete!")
94
 
95
- if extracted_data["title"]:
96
- st.subheader("📌 Title")
97
- st.write(extracted_data["title"])
 
98
 
99
- if extracted_data["headings"]:
100
- st.subheader("📑 Headings")
101
- for heading in extracted_data["headings"]:
102
- st.write(f"- {heading}")
103
 
104
- if extracted_data["paragraphs"]:
105
- st.subheader("📖 Paragraphs")
106
- for para in extracted_data["paragraphs"]:
107
- st.write(para)
108
  else:
109
- st.warning("⚠️ Please upload a PDF file before extracting data.")
 
1
  import streamlit as st
2
  import fitz # PyMuPDF
3
  import os
4
+ import re
5
 
6
  # Configure Streamlit page
7
+ st.set_page_config(page_title="PDF to Important Info", layout="centered")
8
 
9
  # Custom Styling
10
  st.markdown(
 
13
  .stFileUploader, .stTextArea, .stButton button {
14
  width: 100% !important;
15
  }
 
 
 
16
  </style>
17
  """,
18
  unsafe_allow_html=True,
19
  )
20
 
21
  # App title
22
+ st.markdown("<h1 style='text-align: center;'>📄 Extract Key Points & MCQs</h1>", unsafe_allow_html=True)
 
23
 
24
  # File uploader
25
+ uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
 
 
 
 
 
 
 
 
 
 
 
26
 
27
+ # Function to extract text and filter key points
28
def extract_key_info(pdf_path):
    """Collect question-like lines and short bullet-style lines from a PDF.

    The plain text of every page is split on newlines and each stripped line
    is classified:

    * lines starting with a number and a dot, a question/instruction word,
      or an ``A)``-``D)`` option label are collected as MCQs/questions;
    * any other line shorter than 150 characters that contains a bullet
      marker (``•``, ``-`` or ``*``) is collected as a key point.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        A ``(key_points, mcqs)`` tuple of two lists of stripped strings, each
        in document order.
    """
    # Compiled once instead of being re-resolved by ``re.match`` for every
    # line. IGNORECASE is kept from the original, so "what ..." and "a)"
    # match as well.
    question_patterns = [
        re.compile(r"^\d+\.", re.IGNORECASE),  # Numbered questions (e.g., 1. What is AI?)
        re.compile(
            r"^(What|Which|How|When|Why|Who|Where|Describe|Explain)\b",
            re.IGNORECASE,
        ),  # Question words
        re.compile(r"^[A-D]\)", re.IGNORECASE),  # MCQ answer choices (A) Option 1
    ]

    # The original tested `"" in line`, which is true for every string and so
    # turned every short line (blank ones included) into a "key point". The
    # bullet glyph was evidently lost; "•" is restored as the presumed marker.
    bullet_markers = ("•", "-", "*")

    key_points = []
    mcqs = []

    # Close the document deterministically so the caller can delete the
    # file afterwards (an open handle blocks deletion on Windows).
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for raw_line in page.get_text("text").split("\n"):
                line = raw_line.strip()
                if not line:
                    continue  # blank lines carry no information

                # Extract MCQs & Questions
                if any(pattern.match(line) for pattern in question_patterns):
                    mcqs.append(line)

                # Extract Key Points (short sentences or bullet points)
                elif len(line) < 150 and any(marker in line for marker in bullet_markers):
                    key_points.append(line)

    return key_points, mcqs
55
 
56
  # Extract Data Button
57
# Run the extraction when the button is pressed and render the results.
if st.button("Extract Important Info"):
    if uploaded_file is not None:
        # Local import: only this code path needs it.
        import tempfile

        with st.spinner("Processing your PDF..."):
            # A unique temp file per run: a fixed "temp.pdf" in the working
            # directory would be shared (and clobbered) by concurrent sessions.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                tmp.write(uploaded_file.getbuffer())
                temp_path = tmp.name

            try:
                key_points, mcqs = extract_key_info(temp_path)
            finally:
                # Always clean up, even if the PDF could not be parsed.
                os.remove(temp_path)

        # Display extracted data
        st.success("✅ Extraction Complete!")

        if key_points:
            st.subheader("📌 Key Points")
            for point in key_points:
                st.write(f"- {point}")

        if mcqs:
            # The original emoji here was mis-encoded beyond recovery;
            # a notepad glyph is used in its place.
            st.subheader("📝 MCQs & Important Questions")
            for question in mcqs:
                st.write(f"- {question}")

        if not key_points and not mcqs:
            # Avoid showing a bare success banner with nothing underneath.
            st.info("No key points or questions were found in this PDF.")
    else:
        st.warning("⚠️ Please upload a PDF file first.")