Spaces:

chmawia
/

pdf_data_analyzer

Sleeping

App Files Files Community

chmawia committed on Mar 4, 2025

Commit

e76110f

verified ·

1 Parent(s): 7196622

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -42

app.py CHANGED Viewed

@@ -2,9 +2,10 @@ import streamlit as st
 import fitz  # PyMuPDF
 import os
 import re
 # Configure Streamlit page
-st.set_page_config(page_title="Extract Key Info", layout="centered")
 # Custom Styling
 st.markdown(
@@ -26,14 +27,32 @@ st.markdown(
 )
 # Page title
-st.markdown("<h1 style='text-align: center;'>📄 Extract Key Points & MCQs</h1>", unsafe_allow_html=True)
 # File uploader
 uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
-# Function to extract only key points, MCQs, and important questions
-def extract_relevant_info(pdf_path):
     doc = fitz.open(pdf_path)
     key_points = []
     mcqs = []
     important_questions = []
@@ -43,61 +62,65 @@ def extract_relevant_info(pdf_path):
     question_pattern = r"^(What|Which|How|Why|When|Who|Where|Explain|Describe)\b"
     bullet_point_pattern = r"^(•|-|\*)\s"
-    for page in doc:
-        text = page.get_text("text")
-        lines = text.split("\n")
-        for line in lines:
-            line = line.strip()
-            # Extract MCQs
-            if re.match(mcq_pattern, line):
-                mcqs.append(line)
-            # Extract Important Questions
-            elif re.match(question_pattern, line, re.IGNORECASE):
-                important_questions.append(line)
-            # Extract Key Points (Bullets or Short Sentences)
-            elif re.match(bullet_point_pattern, line) or (len(line) < 150 and "." in line):
-                key_points.append(line)
     return key_points, mcqs, important_questions
-# Extract Data Button (Disabled until file is uploaded)
 if uploaded_file:
-    extract_button = st.button("🚀 Extract Important Info", use_container_width=True)
     if extract_button:
         with st.spinner("Processing your PDF..."):
             temp_path = "temp.pdf"
             with open(temp_path, "wb") as f:
                 f.write(uploaded_file.getbuffer())
-            key_points, mcqs, important_questions = extract_relevant_info(temp_path)
             os.remove(temp_path)
-            # Display extracted data
-            st.success("✅ Extraction Complete!")
-            col1, col2 = st.columns(2)
-            with col1:
-                if key_points:
-                    st.subheader("📌 Key Points")
-                    for point in key_points:
-                        st.write(f"- {point}")
-            with col2:
-                if mcqs:
-                    st.subheader("❓ MCQs")
-                    for question in mcqs:
-                        st.write(f"- {question}")
-                if important_questions:
-                    st.subheader("❓ Important Questions")
-                    for question in important_questions:
-                        st.write(f"- {question}")
 else:
     st.warning("⚠️ Please upload a PDF file first.")

 import fitz  # PyMuPDF
 import os
 import re
+from transformers import pipeline
 # Configure Streamlit page
+st.set_page_config(page_title="PDF Extractor", layout="centered")
 # Custom Styling
 st.markdown(
 )
 # Page title
+st.markdown("<h1 style='text-align: center;'>📄 PDF Extractor</h1>", unsafe_allow_html=True)
 # File uploader
 uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
+# Selection: Summarize or Generate MCQs & Key Points
+task = st.radio("Select Task:", ["Summarize PDF", "Generate MCQs, Key Points, and Important Questions"])
+# Open the PDF at pdf_path with PyMuPDF and return the text of all its pages as one stripped string
+def extract_text_from_pdf(pdf_path):
     doc = fitz.open(pdf_path)  # NOTE(review): doc is never closed inside this function
+    text = ""
+    for page in doc:
+        text += page.get_text("text") + "\n"  # a newline is appended after each page's text
+    return text.strip()  # strip() drops leading/trailing whitespace, including the final "\n"
+# Summarize text with the facebook/bart-large-cnn summarization pipeline and return the summary string
+def summarize_text(text):
+    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")  # the pipeline is constructed on every call
+    summary = summarizer(text, max_length=200, min_length=50, do_sample=False)  # NOTE(review): text is passed whole; confirm the model accepts inputs this long
+    return summary[0]['summary_text']  # 'summary_text' field of the first result
+# Function to extract only key points, MCQs, and important questions
+def extract_relevant_info(text):
     key_points = []
     mcqs = []
     important_questions = []
     question_pattern = r"^(What|Which|How|Why|When|Who|Where|Explain|Describe)\b"
     bullet_point_pattern = r"^(•|-|\*)\s"
+    lines = text.split("\n")
+    for line in lines:
+        line = line.strip()
+        # Extract MCQs
+        if re.match(mcq_pattern, line):
+            mcqs.append(line)
+        # Extract Important Questions
+        elif re.match(question_pattern, line, re.IGNORECASE):
+            important_questions.append(line)
+        # Extract Key Points (Bullets or Short Sentences)
+        elif re.match(bullet_point_pattern, line) or (len(line) < 150 and "." in line):
+            key_points.append(line)
     return key_points, mcqs, important_questions
+# Extract Data button; this branch only runs once a file has been uploaded
 if uploaded_file:
+    extract_button = st.button("🚀 Extract Data", use_container_width=True)
     if extract_button:
         with st.spinner("Processing your PDF..."):
             temp_path = "temp.pdf"  # fixed relative filename used for the temporary copy
             with open(temp_path, "wb") as f:
                 f.write(uploaded_file.getbuffer())  # write the uploaded bytes to disk so the PDF can be opened by path
+            extracted_text = extract_text_from_pdf(temp_path)
             os.remove(temp_path)  # NOTE(review): not reached if extraction raises, so temp.pdf would be left behind
+            # Run whichever task is selected in the "Select Task:" radio
+            if task == "Summarize PDF":
+                st.subheader("📖 Summary")
+                summary = summarize_text(extracted_text)
+                st.write(summary)
+            elif task == "Generate MCQs, Key Points, and Important Questions":
+                key_points, mcqs, important_questions = extract_relevant_info(extracted_text)
+                col1, col2 = st.columns(2)
+                with col1:
+                    if key_points:
+                        st.subheader("📌 Key Points")
+                        for point in key_points:
+                            st.write(f"- {point}")
+                with col2:
+                    if mcqs:
+                        st.subheader("❓ MCQs")
+                        for question in mcqs:
+                            st.write(f"- {question}")
+                    if important_questions:
+                        st.subheader("❓ Important Questions")
+                        for question in important_questions:
+                            st.write(f"- {question}")
 else:
     st.warning("⚠️ Please upload a PDF file first.")