# chmawia's picture
# Update app.py
# 78064e5 verified
import os
import re
import tempfile

import fitz  # PyMuPDF
import streamlit as st
# Configure the Streamlit page: browser-tab title and a centered layout.
st.set_page_config(page_title="PDF Extractor", layout="centered")

# Custom styling: full-width blue buttons and a dashed border around the
# file uploader. The CSS is injected as raw HTML, hence unsafe_allow_html.
# The string is deliberately left unindented because it is passed through the
# Markdown renderer.
st.markdown(
    """
<style>
.stButton button {
width: 100% !important;
background-color: #1E90FF;
color: white;
font-size: 18px;
}
.stFileUploader {
border: 2px dashed #1E90FF;
padding: 10px;
}
</style>
""",
    unsafe_allow_html=True,
)

# Page title (centered heading). The emoji was previously stored as
# mis-decoded bytes; it is restored to the intended character here.
st.markdown(
    "<h1 style='text-align: center;'>📄 PDF Extractor</h1>",
    unsafe_allow_html=True,
)

# File uploader restricted to PDF files; evaluates falsy until a file is chosen.
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

# Task selection: these exact option strings are compared against later when
# deciding what to render, so they must stay in sync with that code.
task = st.radio(
    "Select Task:",
    ["Summarize PDF", "Generate MCQs, Key Points, and Important Questions"],
)
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of the PDF at ``pdf_path``.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The text of all pages joined with newlines, with leading and
        trailing whitespace stripped. Empty string if no text is found.
    """
    # Open via the context manager so the document (and its file handle) is
    # closed even if extracting a page raises.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]
    # Join once instead of growing a string with += inside the loop.
    return "\n".join(page_texts).strip()
def simple_summarize(text, max_sentences=5):
    """Return a naive summary made of the first sentences of ``text``.

    Sentences are delimited by the literal separator ``". "``; no other
    punctuation or line breaks are treated as sentence boundaries.

    Args:
        text: The text to summarize.
        max_sentences: How many leading sentences to keep (default 5).

    Returns:
        The first ``max_sentences`` sentences joined with ``". "``. If the
        text contained more sentences than that, ``"..."`` is appended to
        signal truncation; otherwise the text is returned unchanged.

    Raises:
        ValueError: If ``max_sentences`` is less than 1.
    """
    if max_sentences < 1:
        raise ValueError("max_sentences must be at least 1")

    # Split at most ``max_sentences`` times: the first ``max_sentences``
    # pieces are the sentences to keep, and an extra trailing piece (if any)
    # tells us the text was longer without splitting the whole document.
    pieces = text.split(". ", max_sentences)
    summary = ". ".join(pieces[:max_sentences])
    if len(pieces) > max_sentences:
        return summary + "..."
    return summary
def extract_relevant_info(text):
    """Sort the lines of ``text`` into key points, MCQ options and questions.

    Each line is stripped and tested against the rules below in order; the
    first rule that matches decides where the line goes:

    1. MCQ option: the line starts with ``A)``, ``B)``, ``C)`` or ``D)``.
    2. Important question: the line starts (case-insensitively) with one of
       What, Which, How, Why, When, Who, Where, Explain, Describe.
    3. Key point: the line starts with a bullet marker (``•``, ``-`` or
       ``*``) followed by whitespace, or it is shorter than 150 characters
       and contains a period.

    Lines matching none of the rules are dropped.

    Args:
        text: Newline-separated text to scan.

    Returns:
        A tuple ``(key_points, mcqs, important_questions)`` of lists of
        stripped lines, each in order of appearance.
    """
    # Compile once, outside the loop.
    mcq_pattern = re.compile(r"^[A-D]\)")  # Example: A) Option 1
    question_pattern = re.compile(
        r"^(What|Which|How|Why|When|Who|Where|Explain|Describe)\b",
        re.IGNORECASE,
    )
    # \u2022 is the bullet character "•". It is written as an escape so the
    # pattern cannot be corrupted by a wrong file encoding, which previously
    # left mis-decoded bytes here and made real bullets never match.
    bullet_point_pattern = re.compile(r"^(\u2022|-|\*)\s")

    key_points = []
    mcqs = []
    important_questions = []

    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if mcq_pattern.match(line):
            mcqs.append(line)
        elif question_pattern.match(line):
            important_questions.append(line)
        elif bullet_point_pattern.match(line) or (len(line) < 150 and "." in line):
            key_points.append(line)

    return key_points, mcqs, important_questions
# Main flow: once a PDF is uploaded, offer the extract button and, when it is
# clicked, run the selected task on the extracted text.
if uploaded_file:
    extract_button = st.button("🚀 Extract Data", use_container_width=True)

    if extract_button:
        with st.spinner("Processing your PDF..."):
            # Write the upload to a uniquely named temporary file. A fixed
            # name such as "temp.pdf" would be shared by concurrent sessions,
            # letting one user's upload overwrite another's.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
                tmp_file.write(uploaded_file.getbuffer())
                temp_path = tmp_file.name
            try:
                extracted_text = extract_text_from_pdf(temp_path)
            finally:
                # Always remove the temporary file, even if extraction fails.
                os.remove(temp_path)

        # Perform the selected task.
        if task == "Summarize PDF":
            st.subheader("📖 Summary")
            summary = simple_summarize(extracted_text)
            st.write(summary)

        elif task == "Generate MCQs, Key Points, and Important Questions":
            key_points, mcqs, important_questions = extract_relevant_info(extracted_text)

            # Key points on the left; MCQs and questions on the right.
            # Sections with no content are omitted entirely.
            col1, col2 = st.columns(2)

            with col1:
                if key_points:
                    st.subheader("📌 Key Points")
                    for point in key_points:
                        st.write(f"- {point}")

            with col2:
                if mcqs:
                    st.subheader("❓ MCQs")
                    for question in mcqs:
                        st.write(f"- {question}")

                if important_questions:
                    st.subheader("❓ Important Questions")
                    for question in important_questions:
                        st.write(f"- {question}")
else:
    st.warning("⚠️ Please upload a PDF file first.")