import io
import re

import fitz  # PyMuPDF
import pandas as pd
import streamlit as st

# Session-state slot that keeps the processed results alive across the reruns
# Streamlit performs on every widget interaction (e.g. a download click).
_RESULTS_STATE_KEY = "section_results"


# --- Core PDF Processing Functions ---

def find_sections(pdf_bytes, marker_pattern):
    """
    Scan a PDF and find the start and end pages of sections based on a pattern.

    Args:
        pdf_bytes: Raw bytes of the PDF to scan.
        marker_pattern: Regular expression (string) matched against each page's
            text with IGNORECASE | MULTILINE. It must contain at least one
            capturing group. When it has two or more groups the section number
            is read from group 2 (e.g. r"^(Question|Q)\\s*(\\d+)"); when it has
            exactly one group the number is read from group 1
            (e.g. r"^Question\\s*(\\d+)").

    Returns:
        A dictionary keyed by 'Q<number>' in order of first appearance, like
        {'Q1': {'start': 0, 'end': 1}, 'Q2': {'start': 2, 'end': 3}}.
        Page numbers are 0-based and inclusive. Empty if nothing matched.

    Raises:
        ValueError: If marker_pattern has no capturing group.
    """
    pattern = re.compile(marker_pattern, re.IGNORECASE | re.MULTILINE)
    if pattern.groups < 1:
        raise ValueError(
            "marker_pattern must contain a capturing group for the section number"
        )
    # Two-group patterns keep the number in group 2; single-group patterns
    # (like the one the UI builds) only have group 1.
    number_group = 2 if pattern.groups >= 2 else 1

    # Maps section key -> page of its first occurrence. Dicts preserve
    # insertion order, so this also records the order sections appear in.
    first_pages = {}
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        last_page = doc.page_count - 1
        for page_num, page in enumerate(doc):
            text = page.get_text("text")
            for match in pattern.finditer(text):
                question_num = match.group(number_group)
                if question_num is None:
                    # The number group did not participate in this match.
                    continue
                # Only the first occurrence of each key defines its start page.
                first_pages.setdefault(f"Q{question_num}", page_num)

    ordered = list(first_pages.items())
    sections = {}
    for index, (key, start_page) in enumerate(ordered):
        if index + 1 < len(ordered):
            # A section ends on the page before the next section starts, but
            # never before its own start (two sections on the same page).
            end_page = max(start_page, ordered[index + 1][1] - 1)
        else:
            end_page = last_page  # The last section runs to the end of the doc
        sections[key] = {'start': start_page, 'end': end_page}
    return sections


def extract_section_pdf(pdf_bytes, start_page, end_page):
    """
    Extract a range of pages from a PDF and return it as new PDF bytes.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        start_page: 0-based index of the first page to copy.
        end_page: 0-based index of the last page to copy (inclusive).

    Returns:
        The bytes of a new PDF containing only the requested pages.
    """
    # Both documents are closed on exit, including when insert_pdf raises.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as source_doc, \
            fitz.open() as new_doc:
        new_doc.insert_pdf(source_doc, from_page=start_page, to_page=end_page)
        return new_doc.tobytes()


# --- UI helpers ---

def _build_marker_pattern(marker_text):
    """Build the section regex: the literal marker, optional space, then digits."""
    # Escape the marker so characters like '.' or '(' are matched literally
    # instead of being interpreted as regex syntax.
    return rf"^{re.escape(marker_text)}\s*(\d+)"


def _collect_results(q_bytes, r_bytes, s_bytes, marker_pattern):
    """Split the three PDFs into sections and pair them up by question key.

    Returns a list (sorted by question number) of dicts with the keys 'key',
    'question', 'rubric' and 'solution'; a document's entry is None when the
    section was not found in that document.
    """
    sources = {
        'question': (q_bytes, find_sections(q_bytes, marker_pattern)),
        'rubric': (r_bytes, find_sections(r_bytes, marker_pattern)),
        'solution': (s_bytes, find_sections(s_bytes, marker_pattern)),
    }
    # Unique keys across all documents, sorted numerically (Q1, Q2, Q10).
    all_keys = sorted(
        set().union(*(sections for _, sections in sources.values())),
        key=lambda section_key: int(section_key[1:]),
    )

    results = []
    for key in all_keys:
        entry = {'key': key}
        for kind, (pdf_bytes, sections) in sources.items():
            span = sections.get(key)
            entry[kind] = (
                extract_section_pdf(pdf_bytes, span['start'], span['end'])
                if span is not None
                else None
            )
        results.append(entry)
    return results


def _render_section_column(title, pdf_data, file_name):
    """Render one result column: heading, first-page preview, download button."""
    st.markdown(f"#### {title}")
    if not pdf_data:
        st.warning("Not found")
        return

    with st.expander("👁️ Preview"):
        # Rasterize the first page, then release the document right away.
        with fitz.open(stream=pdf_data, filetype="pdf") as preview_doc:
            preview_image = preview_doc[0].get_pixmap().tobytes()
        st.image(preview_image)
    st.download_button(
        label="⬇️ Download PDF",
        data=pdf_data,
        file_name=file_name,
        mime="application/pdf",
        # Every button shares the same label, so give each an explicit key.
        key=f"download_{file_name}",
    )


# --- Streamlit UI ---

st.set_page_config(layout="wide")
st.title("📄 PDF Section Splitter & Mapper")
st.info("Upload your question, rubric, and solution PDFs. The tool will find sections (Q1, Q2...) and map them together.")

# File Uploaders
col1, col2, col3 = st.columns(3)
with col1:
    q_file = st.file_uploader("1. Upload Questions PDF", type="pdf")
with col2:
    r_file = st.file_uploader("2. Upload Rubrics PDF", type="pdf")
with col3:
    s_file = st.file_uploader("3. Upload Solutions PDF", type="pdf")

# User-defined marker
marker_text = st.text_input(
    "Enter the text marker for questions (e.g., 'Question', 'Q', 'Problem')",
    value="Question"
)

all_files_uploaded = all([q_file, r_file, s_file])
if not all_files_uploaded:
    # Drop results that belong to a previous set of uploads.
    st.session_state.pop(_RESULTS_STATE_KEY, None)

if st.button("🚀 Process PDFs", disabled=(not all_files_uploaded)):
    marker_pattern = _build_marker_pattern(marker_text)
    with st.spinner("Processing documents... This might take a moment."):
        # Persist the results: st.button is True only for the single rerun
        # caused by its click, and a later download click triggers another
        # rerun in which the results would otherwise disappear.
        st.session_state[_RESULTS_STATE_KEY] = _collect_results(
            q_file.getvalue(),
            r_file.getvalue(),
            s_file.getvalue(),
            marker_pattern,
        )

results = st.session_state.get(_RESULTS_STATE_KEY)
if results is not None:
    if not results:
        st.error("Could not find any sections with the provided marker. Please check your PDFs or refine the marker text.")
    else:
        st.success(f"Found {len(results)} unique sections! Displaying results below.")

        # Display results
        for item in results:
            st.markdown("---")
            st.subheader(f"Section: {item['key']}")
            c1, c2, c3 = st.columns(3)
            with c1:
                _render_section_column(
                    "Question", item['question'], f"{item['key']}_question.pdf"
                )
            with c2:
                _render_section_column(
                    "Rubric", item['rubric'], f"{item['key']}_rubric.pdf"
                )
            with c3:
                _render_section_column(
                    "Solution", item['solution'], f"{item['key']}_solution.pdf"
                )