Spaces:

chmawia
/

pdf_data_analyzer

Sleeping

App Files Files Community

pdf_data_analyzer / app.py

chmawia

Update app.py

78064e5 verified 10 months ago

raw

history blame contribute delete

3.83 kB

	import os
	import re
	import tempfile

	import fitz  # PyMuPDF
	import streamlit as st

	# --- Page setup -------------------------------------------------------
	# Browser-tab title and a centered (narrow) layout.
	st.set_page_config(page_title="PDF Extractor", layout="centered")

	# Stylesheet injected as raw HTML: full-width blue buttons and a dashed
	# blue frame around the file uploader.
	_PAGE_CSS = """
	<style>
	.stButton button {
	width: 100% !important;
	background-color: #1E90FF;
	color: white;
	font-size: 18px;
	}
	.stFileUploader {
	border: 2px dashed #1E90FF;
	padding: 10px;
	}
	</style>
	"""
	st.markdown(_PAGE_CSS, unsafe_allow_html=True)

	# Centered page heading (raw HTML, hence unsafe_allow_html).
	_PAGE_TITLE_HTML = "<h1 style='text-align: center;'>📄 PDF Extractor</h1>"
	st.markdown(_PAGE_TITLE_HTML, unsafe_allow_html=True)

	# --- Inputs -----------------------------------------------------------
	# Upload widget restricted to PDF files.
	uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

	# The two tasks the user can choose between; the selected label is
	# compared verbatim further down the script.
	_TASK_CHOICES = [
	    "Summarize PDF",
	    "Generate MCQs, Key Points, and Important Questions",
	]
	task = st.radio("Select Task:", _TASK_CHOICES)

	def extract_text_from_pdf(pdf_path):
	    """Return the plain text of every page of the PDF at *pdf_path*.

	    Each page's text is followed by a newline, and leading/trailing
	    whitespace is stripped from the combined result.

	    Args:
	        pdf_path: Filesystem path of the PDF to read.

	    Returns:
	        The concatenated page text as a single string (empty if the
	        document yields no text).
	    """
	    # Open the document as a context manager so it is closed even if a
	    # page fails to extract; an open handle can block deleting the file.
	    with fitz.open(pdf_path) as doc:
	        page_texts = [page.get_text("text") + "\n" for page in doc]
	    return "".join(page_texts).strip()

	def simple_summarize(text):
	    """Build a naive summary from the first five sentences of *text*.

	    Sentences are delimited by the literal separator ". ". When the text
	    has more than five such pieces, the summary ends with "..." to show
	    that it was cut short; otherwise the text is returned unchanged.
	    """
	    lead_count = 5
	    # Splitting at most `lead_count` times is enough to know both the
	    # leading sentences and whether anything follows them.
	    pieces = text.split(". ", lead_count)
	    head = ". ".join(pieces[:lead_count])
	    if len(pieces) > lead_count:
	        return head + "..."
	    return head

	# Line classifiers, compiled once. The alternations use a bare "|":
	# inside a regex, "\|" matches a literal pipe character instead.
	_MCQ_OPTION_RE = re.compile(r"^[A-D]\)")  # e.g. "A) Option 1"
	_QUESTION_RE = re.compile(
	    r"^(What|Which|How|Why|When|Who|Where|Explain|Describe)\b",
	    re.IGNORECASE,
	)
	_BULLET_RE = re.compile(r"^(•|-|\*)\s")
	_MAX_KEY_POINT_LENGTH = 150


	def extract_relevant_info(text):
	    """Sort the lines of *text* into key points, MCQ options and questions.

	    Each line is stripped and assigned to the first category it matches:

	    1. MCQ option: starts with "A)", "B)", "C)" or "D)".
	    2. Important question: starts with a question/instruction word such
	       as "What", "How" or "Explain" (case-insensitive).
	    3. Key point: starts with a bullet marker ("•", "-" or "*") followed
	       by whitespace, or is shorter than 150 characters and contains a
	       period.

	    Lines matching none of these are dropped.

	    Args:
	        text: Newline-separated text to classify.

	    Returns:
	        A tuple ``(key_points, mcqs, important_questions)`` of lists of
	        stripped lines, each in original document order.
	    """
	    key_points = []
	    mcqs = []
	    important_questions = []

	    for raw_line in text.split("\n"):
	        line = raw_line.strip()

	        if _MCQ_OPTION_RE.match(line):
	            mcqs.append(line)
	        elif _QUESTION_RE.match(line):
	            important_questions.append(line)
	        elif _BULLET_RE.match(line) or (
	            len(line) < _MAX_KEY_POINT_LENGTH and "." in line
	        ):
	            key_points.append(line)

	    return key_points, mcqs, important_questions

	# Extraction workflow: the button is only offered once a PDF is uploaded;
	# otherwise the user is prompted to upload one.
	if uploaded_file:
	    extract_button = st.button("🚀 Extract Data", use_container_width=True)

	    if extract_button:
	        extracted_text = None
	        read_error = None

	        with st.spinner("Processing your PDF..."):
	            # Write the upload to a uniquely named temp file: a fixed
	            # name would be shared (and clobbered) by concurrent sessions.
	            with tempfile.NamedTemporaryFile(
	                suffix=".pdf", delete=False
	            ) as tmp_file:
	                tmp_file.write(uploaded_file.getbuffer())
	                temp_path = tmp_file.name

	            try:
	                extracted_text = extract_text_from_pdf(temp_path)
	            except (RuntimeError, ValueError) as exc:
	                # Keep the exception for display after the spinner closes.
	                read_error = exc
	            finally:
	                # Always remove the temp file, even when extraction fails.
	                os.remove(temp_path)

	        # Perform the selected task on the extracted text.
	        if read_error is not None:
	            st.error(f"❌ Could not read this PDF: {read_error}")

	        elif task == "Summarize PDF":
	            st.subheader("📖 Summary")
	            summary = simple_summarize(extracted_text)
	            st.write(summary)

	        elif task == "Generate MCQs, Key Points, and Important Questions":
	            key_points, mcqs, important_questions = extract_relevant_info(
	                extracted_text
	            )

	            col1, col2 = st.columns(2)

	            with col1:
	                if key_points:
	                    st.subheader("📌 Key Points")
	                    for point in key_points:
	                        st.write(f"- {point}")

	            with col2:
	                if mcqs:
	                    st.subheader("❓ MCQs")
	                    for question in mcqs:
	                        st.write(f"- {question}")

	            # NOTE(review): the original indentation was lost, so showing
	            # this section full-width below the columns (rather than
	            # inside the second column) is an assumption; confirm.
	            if important_questions:
	                st.subheader("❓ Important Questions")
	                for question in important_questions:
	                    st.write(f"- {question}")

	else:
	    st.warning("⚠️ Please upload a PDF file first.")