Spaces:

akashmishra358
/

pdfcut

Sleeping

App Files Files Community

akashmishra358 committed on Sep 17, 2025

Commit

55cd5f6

verified ·

1 Parent(s): c027746

Upload app.py

Browse files

Files changed (1) hide show

app.py +184 -0

app.py ADDED Viewed

	@@ -0,0 +1,184 @@

+import streamlit as st
+import fitz  # PyMuPDF
+import pandas as pd
+import re
+import io
+# --- Core PDF Processing Functions ---
+def find_sections(pdf_bytes, marker_pattern):
+    """
+    Scans a PDF and finds the start and end pages of sections based on a pattern.
+    Returns a dictionary like {'Q1': {'start': 0, 'end': 1}, 'Q2': {'start': 2, 'end': 3}}
+    """
+    sections = {}
+    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+    # Find all occurrences of the pattern
+    found_items = []
+    for page_num, page in enumerate(doc):
+        text = page.get_text("text")
+        # Example pattern: r"^(Question|Q)\s*(\d+)"
+        matches = re.finditer(marker_pattern, text, re.IGNORECASE | re.MULTILINE)
+        for match in matches:
+            # We use the raw number found for sorting later
+            question_num_str = match.group(1)
+            key = f"Q{question_num_str}"
+            if key not in [item['key'] for item in found_items]:
+                 found_items.append({'key': key, 'page': page_num, 'num': int(question_num_str)})
+    if not found_items:
+        return {}
+    # Sort items numerically to handle Q1, Q2, Q10 correctly
+    found_items.sort(key=lambda x: x['num'])
+    # Determine page ranges
+    for i, item in enumerate(found_items):
+        key = item['key']
+        start_page = item['page']
+        end_page = doc.page_count - 1 # Default to end of doc
+        if i + 1 < len(found_items):
+            end_page = found_items[i+1]['page'] - 1
+        # Ensure end page is not before start page
+        if end_page < start_page:
+            end_page = start_page
+        sections[key] = {'start': start_page, 'end': end_page}
+    return sections
+def extract_section_pdf(pdf_bytes, start_page, end_page):
+    """
+    Extracts a range of pages from a PDF and returns it as new PDF bytes.
+    """
+    source_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+    new_doc = fitz.open() # Create a new empty PDF
+    new_doc.insert_pdf(source_doc, from_page=start_page, to_page=end_page)
+    return new_doc.tobytes()
+# --- Streamlit UI ---
+st.set_page_config(layout="wide")
+st.title("📄 PDF Section Splitter & Mapper")
+st.info("Upload your question, rubric, and solution PDFs. The tool will find sections (Q1, Q2...) and map them together.")
+# File Uploaders
+col1, col2, col3 = st.columns(3)
+with col1:
+    q_file = st.file_uploader("1. Upload Questions PDF", type="pdf")
+with col2:
+    r_file = st.file_uploader("2. Upload Rubrics PDF", type="pdf")
+with col3:
+    s_file = st.file_uploader("3. Upload Solutions PDF", type="pdf")
+# User-defined marker
+marker_text = st.text_input(
+    "Enter the text marker for questions (e.g., 'Question', 'Q', 'Problem')",
+    value="Question"
+)
+if st.button("🚀 Process PDFs", disabled=(not all([q_file, r_file, s_file]))):
+    # Compile the regex pattern
+    # This pattern looks for the marker, optional space, and then captures the digits
+    marker_pattern = rf"^{marker_text}\s*(\d+)"
+    with st.spinner("Processing documents... This might take a moment."):
+        # Read file bytes
+        q_bytes = q_file.getvalue()
+        r_bytes = r_file.getvalue()
+        s_bytes = s_file.getvalue()
+        # Find sections in all three documents
+        q_sections = find_sections(q_bytes, marker_pattern)
+        r_sections = find_sections(r_bytes, marker_pattern)
+        s_sections = find_sections(s_bytes, marker_pattern)
+        # Get a unique, sorted list of all question keys found (e.g., Q1, Q2, Q10)
+        all_keys = sorted(list(set(q_sections.keys()) | set(r_sections.keys()) | set(s_sections.keys())), key=lambda x: int(re.search(r'\d+', x).group()))
+        if not all_keys:
+            st.error("Could not find any sections with the provided marker. Please check your PDFs or refine the marker text.")
+        else:
+            st.success(f"Found {len(all_keys)} unique sections! Displaying results below.")
+            results = []
+            for key in all_keys:
+                # Extract the PDF section for each type if it exists
+                question_pdf_bytes = extract_section_pdf(q_bytes, **q_sections[key]) if key in q_sections else None
+                rubric_pdf_bytes = extract_section_pdf(r_bytes, **r_sections[key]) if key in r_sections else None
+                solution_pdf_bytes = extract_section_pdf(s_bytes, **s_sections[key]) if key in s_sections else None
+                results.append({
+                    'key': key,
+                    'question': question_pdf_bytes,
+                    'rubric': rubric_pdf_bytes,
+                    'solution': solution_pdf_bytes
+                })
+            # Display results
+            for item in results:
+                st.markdown(f"---")
+                st.subheader(f"Section: {item['key']}")
+                c1, c2, c3 = st.columns(3)
+                with c1:
+                    st.markdown("#### Question")
+                    if item['question']:
+                        with st.expander("👁️ Preview"):
+                            try:
+                                preview_doc = fitz.open(stream=item['question'], filetype="pdf")
+                                pix = preview_doc[0].get_pixmap()
+                                st.image(pix.tobytes())
+                            except Exception as e:
+                                st.error(f"Could not generate preview: {e}")
+                        st.download_button(
+                            label="⬇️ Download PDF",
+                            data=item['question'],
+                            file_name=f"{item['key'].lower().replace(' ','_')}_question.pdf",
+                            mime="application/pdf"
+                        )
+                    else:
+                        st.warning("Not found")
+                with c2:
+                    st.markdown("#### Rubric")
+                    if item['rubric']:
+                        with st.expander("👁️ Preview"):
+                            try:
+                                preview_doc = fitz.open(stream=item['rubric'], filetype="pdf")
+                                pix = preview_doc[0].get_pixmap()
+                                st.image(pix.tobytes())
+                            except Exception as e:
+                                st.error(f"Could not generate preview: {e}")
+                        st.download_button(
+                            label="⬇️ Download PDF",
+                            data=item['rubric'],
+                            file_name=f"{item['key'].lower().replace(' ','_')}_rubric.pdf",
+                            mime="application/pdf"
+                        )
+                    else:
+                        st.warning("Not found")
+                with c3:
+                    st.markdown("#### Solution")
+                    if item['solution']:
+                        with st.expander("👁️ Preview"):
+                            try:
+                                preview_doc = fitz.open(stream=item['solution'], filetype="pdf")
+                                pix = preview_doc[0].get_pixmap()
+                                st.image(pix.tobytes())
+                            except Exception as e:
+                                st.error(f"Could not generate preview: {e}")
+                        st.download_button(
+                            label="⬇️ Download PDF",
+                            data=item['solution'],
+                            file_name=f"{item['key'].lower().replace(' ','_')}_solution.pdf",
+                            mime="application/pdf"
+                        )
+                    else:
+                        st.warning("Not found")