Spaces:
Sleeping
Sleeping
import os
import re
import tempfile

import fitz  # PyMuPDF
import streamlit as st
# Page-level configuration: browser tab title and a centered layout.
st.set_page_config(layout="centered", page_title="PDF Extractor")

# Stylesheet injected into the page: full-width blue buttons and a dashed
# blue frame around the file-upload widget.
_CUSTOM_CSS = """
<style>
.stButton button {
width: 100% !important;
background-color: #1E90FF;
color: white;
font-size: 18px;
}
.stFileUploader {
border: 2px dashed #1E90FF;
padding: 10px;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# Centered page heading.
# NOTE(review): the glyph before "PDF Extractor" looks like a mis-decoded
# emoji -- confirm the intended character against the original file.
_PAGE_TITLE_HTML = "<h1 style='text-align: center;'>π PDF Extractor</h1>"
st.markdown(_PAGE_TITLE_HTML, unsafe_allow_html=True)

# Upload widget restricted to PDF files; falsy until the user picks a file.
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

# Task selector; the chosen label is compared verbatim further down the script.
_TASK_CHOICES = ["Summarize PDF", "Generate MCQs, Key Points, and Important Questions"]
task = st.radio("Select Task:", _TASK_CHOICES)
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of the PDF at *pdf_path*.

    Pages are read in document order and separated by a newline; leading
    and trailing whitespace of the combined text is stripped.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The concatenated page text as a single string (empty when no page
        yields any text).
    """
    # Use the document as a context manager so it is closed even if a page
    # fails to parse; an open handle would otherwise leak and can prevent
    # the caller from deleting the file afterwards (notably on Windows).
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]
    return "\n".join(page_texts).strip()
# Function for simple text summarization (basic method)
def simple_summarize(text, max_sentences=5):
    """Return the first few sentences of *text* as a naive summary.

    A sentence boundary is a period followed by any whitespace, so sentences
    that end at a line break (the usual case in text extracted from a PDF)
    are recognised as well as those followed by a space.

    Args:
        text: The text to summarise.
        max_sentences: Maximum number of leading sentences to keep
            (default 5).

    Returns:
        The leading sentences joined with ". ". When the text holds more
        than *max_sentences* sentences, "..." is appended to signal the
        truncation; otherwise the (stripped) text is returned unshortened.

    Raises:
        ValueError: If *max_sentences* is smaller than 1.
    """
    if max_sentences < 1:
        raise ValueError("max_sentences must be at least 1")

    # Strip first so trailing whitespace after the final period does not
    # produce an empty pseudo-sentence that would trigger a spurious "...".
    sentences = re.split(r"\.\s+", text.strip())
    summary = ". ".join(sentences[:max_sentences])
    if len(sentences) > max_sentences:
        summary += "..."
    return summary
# Function to extract only key points, MCQs, and important questions
def extract_relevant_info(text):
    """Sort the lines of *text* into key points, MCQ options and questions.

    Every line is stripped and tested against the rules below; the first
    rule that matches decides the category, and lines matching none are
    dropped:

    1. MCQ option: starts with a capital letter A-D followed by ")".
    2. Important question: starts with one of the words What, Which, How,
       Why, When, Who, Where, Explain or Describe (case-insensitive).
    3. Key point: starts with a bullet marker (U+2022 bullet, "-" or "*")
       followed by whitespace, or is shorter than 150 characters and
       contains a period.

    Args:
        text: Multi-line text, one candidate item per line.

    Returns:
        A tuple ``(key_points, mcqs, important_questions)`` of lists holding
        the stripped lines in their original order.
    """
    # Compile once per call instead of re-resolving the pattern per line.
    mcq_re = re.compile(r"^[A-D]\)")  # Example: A) Option 1
    question_re = re.compile(
        r"^(What|Which|How|Why|When|Who|Where|Explain|Describe)\b",
        re.IGNORECASE,
    )
    # The bullet character is written as an escape so the pattern survives
    # being saved or transferred with the wrong text encoding.
    bullet_re = re.compile(r"^(\u2022|-|\*)\s")

    key_points = []
    mcqs = []
    important_questions = []

    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if mcq_re.match(line):
            mcqs.append(line)
        elif question_re.match(line):
            important_questions.append(line)
        # Bullets, or short lines that look like a sentence.
        elif bullet_re.match(line) or (len(line) < 150 and "." in line):
            key_points.append(line)

    return key_points, mcqs, important_questions
# Extract Data Button
# NOTE(review): several UI labels below start with glyphs that look like
# mis-decoded emoji; they are kept unchanged because the intended characters
# cannot be determined from this file -- restore them from the original.
if uploaded_file:
    extract_button = st.button("π Extract Data", use_container_width=True)

    if extract_button:
        with st.spinner("Processing your PDF..."):
            # Write the upload to a uniquely named temporary file: a fixed
            # name in the working directory would be shared by every
            # concurrent session, letting one user's upload overwrite or
            # delete another's mid-extraction.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                tmp.write(uploaded_file.getbuffer())
                temp_path = tmp.name
            try:
                extracted_text = extract_text_from_pdf(temp_path)
            finally:
                # Always clean up, even when extraction raises.
                os.remove(temp_path)

        # Perform selected task
        if not extracted_text:
            # E.g. a scanned PDF with no text layer: say so rather than
            # rendering an empty result.
            st.warning("No extractable text was found in this PDF.")
        elif task == "Summarize PDF":
            st.subheader("π Summary")
            summary = simple_summarize(extracted_text)
            st.write(summary)
        elif task == "Generate MCQs, Key Points, and Important Questions":
            key_points, mcqs, important_questions = extract_relevant_info(extracted_text)

            col1, col2 = st.columns(2)

            with col1:
                if key_points:
                    st.subheader("π Key Points")
                    for point in key_points:
                        st.write(f"- {point}")

            # NOTE(review): the original indentation was lost; placing the
            # "Important Questions" section inside the second column is an
            # assumption -- confirm against the original layout.
            with col2:
                if mcqs:
                    st.subheader("β MCQs")
                    for question in mcqs:
                        st.write(f"- {question}")

                if important_questions:
                    st.subheader("β Important Questions")
                    for question in important_questions:
                        st.write(f"- {question}")
else:
    st.warning("⚠️ Please upload a PDF file first.")