Spaces:

akashmishra358
/

pdfcutter

Runtime error

File size: 7,077 Bytes

568c150

import streamlit as st
import fitz  # PyMuPDF
import pandas as pd
import re
import io

# --- Core PDF Processing Functions ---

def find_sections(pdf_bytes, marker_pattern):
    """
    Scans a PDF and finds the start and end pages of sections based on a pattern.

    Args:
        pdf_bytes: Raw bytes of the PDF to scan.
        marker_pattern: Regular expression (string) applied to each page's
            text with IGNORECASE and MULTILINE. The question number is read
            from the second capturing group when the pattern has two or more
            groups, from the only group when it has exactly one, and from the
            first run of digits in the whole match when it has none.

    Returns:
        A dictionary like {'Q1': {'start': 0, 'end': 1}, 'Q2': {'start': 2, 'end': 3}}
        in order of first appearance. Page indices are 0-based and inclusive.
        Empty when the pattern never matches.
    """
    # Compile once instead of re-resolving the pattern for every page.
    pattern = re.compile(marker_pattern, re.IGNORECASE | re.MULTILINE)

    # Two-group patterns such as r"^(Question|Q)\s*(\d+)" keep using group 2;
    # single-group patterns such as r"^Question\s*(\d+)" used to raise
    # IndexError("no such group") and now fall back to group 1.
    number_group = min(2, pattern.groups)

    found_items = []
    seen_keys = set()  # O(1) duplicate check; only the first occurrence counts

    # The context manager closes the document even if text extraction fails.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        page_count = doc.page_count
        for page_num, page in enumerate(doc):
            text = page.get_text("text")
            for match in pattern.finditer(text):
                question_num = match.group(number_group)
                if number_group == 0 and question_num:
                    # No capturing group at all: pull the digits out of the match.
                    digits = re.search(r"\d+", question_num)
                    question_num = digits.group(0) if digits else None
                if not question_num:
                    # Optional group did not participate, or no number present.
                    continue
                # We use a key like 'Q1', 'Q2' for consistency
                key = f"Q{question_num}"
                if key not in seen_keys:
                    seen_keys.add(key)
                    found_items.append({'key': key, 'page': page_num})

    # Determine page ranges: each section runs up to the page before the next
    # section starts; the last one runs to the end of the document.
    sections = {}
    for i, item in enumerate(found_items):
        start_page = item['page']
        if i + 1 < len(found_items):
            end_page = found_items[i + 1]['page'] - 1
        else:
            end_page = page_count - 1

        # Two markers on the same page would put the end before the start.
        sections[item['key']] = {'start': start_page, 'end': max(start_page, end_page)}

    return sections

def extract_section_pdf(pdf_bytes, start_page, end_page):
    """
    Extracts a range of pages from a PDF and returns it as new PDF bytes.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        start_page: First page to copy, passed to insert_pdf as from_page.
        end_page: Last page to copy, passed to insert_pdf as to_page.

    Returns:
        The serialized bytes of a new PDF holding only the requested pages.
    """
    # Both documents are closed on exit, including when insert_pdf raises;
    # previously each call leaked two open Document objects.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as source_doc, \
            fitz.open() as new_doc:  # fitz.open() with no args: new empty PDF
        new_doc.insert_pdf(source_doc, from_page=start_page, to_page=end_page)
        return new_doc.tobytes()

# --- Streamlit UI ---

st.set_page_config(layout="wide")
st.title("📄 PDF Section Splitter & Mapper")

st.info("Upload your question, rubric, and solution PDFs. The tool will find sections (Q1, Q2...) and map them together.")

# Three side-by-side uploaders, one per document type. Each widget is created
# directly on its column object rather than inside a `with` block.
questions_col, rubrics_col, solutions_col = st.columns(3)
q_file = questions_col.file_uploader("1. Upload Questions PDF", type="pdf")
r_file = rubrics_col.file_uploader("2. Upload Rubrics PDF", type="pdf")
s_file = solutions_col.file_uploader("3. Upload Solutions PDF", type="pdf")

# Text that introduces each section in the PDFs; the section number follows it.
marker_label = "Enter the text marker for questions (e.g., 'Question', 'Q', 'Problem')"
marker_text = st.text_input(marker_label, value="Question")

def _extract_or_none(pdf_bytes, sections, key):
    """Return the extracted PDF bytes for `key`, or None if `sections` lacks it."""
    page_range = sections.get(key)
    if page_range is None:
        return None
    # find_sections stores ranges under 'start'/'end', while extract_section_pdf
    # names its parameters start_page/end_page, so pass them positionally
    # (unpacking with ** raised TypeError for the unexpected keywords).
    return extract_section_pdf(pdf_bytes, page_range['start'], page_range['end'])


def _render_section_column(column, title, pdf_bytes, key, suffix):
    """Render heading, first-page preview and download button for one document type."""
    with column:
        st.markdown(f"#### {title}")
        if not pdf_bytes:
            st.warning("Not found")
            return
        with st.expander("👁️ Preview"):
            # Render the first page to image bytes, then release the document.
            with fitz.open(stream=pdf_bytes, filetype="pdf") as preview_doc:
                image_bytes = preview_doc[0].get_pixmap().tobytes()
            st.image(image_bytes)
        st.download_button(
            label="⬇️ Download PDF",
            data=pdf_bytes,
            file_name=f"{key}_{suffix}.pdf",
            mime="application/pdf",
            # Explicit key: every button shares the same label.
            key=f"{key}_{suffix}_download",
        )


if st.button("🚀 Process PDFs", disabled=(not all([q_file, r_file, s_file]))):
    results = []
    marker = marker_text.strip()

    if not marker:
        st.error("Could not find any sections with the provided marker. Please check your PDFs or refine the marker text.")
    else:
        # The marker is literal text, so escape it before building the regex.
        # Group 1 is the marker and group 2 the digits, which is the layout
        # find_sections reads the question number from.
        marker_pattern = rf"^({re.escape(marker)})\s*(\d+)"

        with st.spinner("Processing documents... This might take a moment."):
            # Read file bytes
            q_bytes = q_file.getvalue()
            r_bytes = r_file.getvalue()
            s_bytes = s_file.getvalue()

            # Find sections in all three documents
            q_sections = find_sections(q_bytes, marker_pattern)
            r_sections = find_sections(r_bytes, marker_pattern)
            s_sections = find_sections(s_bytes, marker_pattern)

            # Unique keys across all documents, sorted numerically (Q1, Q2, Q10)
            all_keys = sorted(
                q_sections.keys() | r_sections.keys() | s_sections.keys(),
                key=lambda x: int(x[1:]),
            )

            # Extract the PDF section for each type if it exists
            results = [
                {
                    'key': key,
                    'question': _extract_or_none(q_bytes, q_sections, key),
                    'rubric': _extract_or_none(r_bytes, r_sections, key),
                    'solution': _extract_or_none(s_bytes, s_sections, key),
                }
                for key in all_keys
            ]

        if not results:
            st.error("Could not find any sections with the provided marker. Please check your PDFs or refine the marker text.")
        else:
            st.success(f"Found {len(results)} unique sections! Displaying results below.")

    # st.button is True only on the run triggered by the click. Clicking a
    # download button reruns the script, so the results are kept in session
    # state and rendered outside the `if` to keep them on screen.
    st.session_state["section_results"] = results

# Display results (from this run or a previous processing run)
for item in st.session_state.get("section_results", []):
    st.markdown("---")
    st.subheader(f"Section: {item['key']}")

    c1, c2, c3 = st.columns(3)
    _render_section_column(c1, "Question", item['question'], item['key'], "question")
    _render_section_column(c2, "Rubric", item['rubric'], item['key'], "rubric")
    _render_section_column(c3, "Solution", item['solution'], item['key'], "solution")