Spaces:

chmawia
/

pdf_data_analyzer

Sleeping

App Files Files Community

chmawia committed on Mar 4, 2025

Commit

721d64b

verified ·

1 Parent(s): 89fdc36

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -64

app.py CHANGED Viewed

@@ -1,9 +1,10 @@
 import streamlit as st
 import fitz  # PyMuPDF
 import os
 # Configure Streamlit page
-st.set_page_config(page_title="PDF to Structured Data", layout="centered")
 # Custom Styling
 st.markdown(
@@ -12,98 +13,69 @@ st.markdown(
     .stFileUploader, .stTextArea, .stButton button {
         width: 100% !important;
     }
-    .stTextArea textarea {
-        height: 80px !important;
-    }
     </style>
     """,
     unsafe_allow_html=True,
 )
 # App title
-st.markdown("<h1 style='text-align: center;'>📄 PDF to Structured Data</h1>", unsafe_allow_html=True)
-st.markdown("<p style='text-align: center; color: gray;'>powered by AI</p>", unsafe_allow_html=True)
 # File uploader
-uploaded_file = st.file_uploader(
-    "Drop your PDF here or click to browse",
-    type=["pdf"],
-    help="Maximum file size: 100MB"
-)
-# User input for data structure description
-st.markdown("<h4 style='color: #4A90E2;'>Describe the structure and type of data you want to extract from the PDF.</h4>", unsafe_allow_html=True)
-data_description = st.text_area(
-    "Example: Extract all invoice details including invoice number, date, items, prices, and total amount...",
-    ""
-)
-# Function to extract structured data
def _first_span_size(dict_blocks, index):
    """Return the font size of the first span of block *index*, or None.

    None is returned when the block does not exist or carries no lines or
    spans (for example a block without text lines).
    """
    if index >= len(dict_blocks):
        return None
    lines = dict_blocks[index].get("lines")
    if not lines:
        return None
    spans = lines[0].get("spans")
    if not spans:
        return None
    return spans[0].get("size")


def extract_text_and_structure(pdf_path):
    """Split the text blocks of a PDF into headings and paragraphs.

    A block whose first span has a font size above 14 is recorded as a
    heading; every other non-empty block is recorded as a paragraph. The
    first heading found, if any, is also used as the title.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict with the keys "title" (str), "headings" (list of str) and
        "paragraphs" (list of str).
    """
    structured_data = {"title": "", "headings": [], "paragraphs": []}
    # The context manager closes the document, so the caller can delete the
    # file afterwards even on platforms that lock open files.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            blocks = page.get_text("blocks")
            # Fetch the span-level layout once per page rather than once per
            # block. Failure is tolerated: blocks then count as paragraphs.
            try:
                dict_blocks = page.get_text("dict").get("blocks", [])
            except Exception:  # best effort, font sizes are optional
                dict_blocks = []
            # enumerate() gives each block its own position; a value search
            # would return the first match for duplicated blocks.
            for block_index, block in enumerate(blocks):
                content = block[4].strip()
                if not content:
                    continue  # Skip empty blocks
                font_size = _first_span_size(dict_blocks, block_index)
                if isinstance(font_size, (int, float)) and font_size > 14:
                    structured_data["headings"].append(content)
                else:
                    structured_data["paragraphs"].append(content)
    if structured_data["headings"]:
        structured_data["title"] = structured_data["headings"][0]
    return structured_data
 # Extract Data Button
# Runs only on the rerun triggered by a click on the button.
if st.button("Extract Data", use_container_width=True):
    if uploaded_file is None:
        st.warning("⚠️ Please upload a PDF file before extracting data.")
    else:
        import tempfile  # local import: only this handler needs it

        with st.spinner("Processing your PDF..."):
            # A unique file per run; a fixed name such as "temp.pdf" would be
            # shared, and overwritten, by sessions running at the same time.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
                f.write(uploaded_file.getbuffer())
                temp_path = f.name
            try:
                extracted_data = extract_text_and_structure(temp_path)
            finally:
                # Remove the upload even when extraction fails.
                os.remove(temp_path)
            # Display extracted data
            st.success("✅ Extraction Complete!")
            if extracted_data["title"]:
                st.subheader("📌 Title")
                st.write(extracted_data["title"])
            if extracted_data["headings"]:
                st.subheader("📑 Headings")
                for heading in extracted_data["headings"]:
                    st.write(f"- {heading}")
            if extracted_data["paragraphs"]:
                st.subheader("📖 Paragraphs")
                for para in extracted_data["paragraphs"]:
                    st.write(para)

 import streamlit as st
 import fitz  # PyMuPDF
 import os
+import re
 # Configure Streamlit page
+st.set_page_config(page_title="PDF to Important Info", layout="centered")
 # Custom Styling
 st.markdown(
     .stFileUploader, .stTextArea, .stButton button {
         width: 100% !important;
     }
     </style>
     """,
     unsafe_allow_html=True,
 )
 # App title
+st.markdown("<h1 style='text-align: center;'>📄 Extract Key Points & MCQs</h1>", unsafe_allow_html=True)
 # File uploader
+uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
+# Function to extract text and filter key points
# Compiled once at import time instead of being re-matched from source
# strings for every line of every page.
_QUESTION_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"^\d+\.",  # Numbered questions (e.g., 1. What is AI?)
        r"^(What|Which|How|When|Why|Who|Where|Describe|Explain)\b",  # Question words
        r"^[A-D]\)",  # MCQ answer choices (A) Option 1
    )
)
# A line containing any of these characters is treated as a key point.
_BULLET_MARKERS = ("•", "-", "*")
# Lines of this length or longer are never treated as key points.
_MAX_KEY_POINT_LENGTH = 150


def extract_key_info(pdf_path):
    """Collect question-like lines and short bullet-like lines from a PDF.

    Every line of every page is stripped and classified. A line that starts
    with a number followed by a dot, with a question word, or with an answer
    label such as "A)" is recorded as a question. Otherwise, a line shorter
    than 150 characters that contains "•", "-" or "*" is recorded as a key
    point. All other lines are ignored.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A tuple (key_points, mcqs) of two lists of strings, in document order.
    """
    key_points = []
    mcqs = []
    # The context manager closes the document, so the caller can delete the
    # file afterwards even on platforms that lock open files.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for raw_line in page.get_text("text").split("\n"):
                line = raw_line.strip()
                # Extract MCQs & Questions
                if any(pattern.match(line) for pattern in _QUESTION_PATTERNS):
                    mcqs.append(line)
                # Extract Key Points (short sentences or bullet points)
                elif len(line) < _MAX_KEY_POINT_LENGTH and any(
                    marker in line for marker in _BULLET_MARKERS
                ):
                    key_points.append(line)
    return key_points, mcqs
 # Extract Data Button
# Runs only on the rerun triggered by a click on the button.
if st.button("Extract Important Info"):
    if uploaded_file is None:
        st.warning("⚠️ Please upload a PDF file first.")
    else:
        import tempfile  # local import: only this handler needs it

        with st.spinner("Processing your PDF..."):
            # A unique file per run; a fixed name such as "temp.pdf" would be
            # shared, and overwritten, by sessions running at the same time.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
                f.write(uploaded_file.getbuffer())
                temp_path = f.name
            try:
                key_points, mcqs = extract_key_info(temp_path)
            finally:
                # Remove the upload even when extraction fails.
                os.remove(temp_path)
            # Display extracted data
            st.success("✅ Extraction Complete!")
            if key_points:
                st.subheader("📌 Key Points")
                for point in key_points:
                    st.write(f"- {point}")
            if mcqs:
                st.subheader("❓ MCQs & Important Questions")
                for question in mcqs:
                    st.write(f"- {question}")