# pdfcut / src / streamlit_app.py
# akashmishra358's picture
# Update src/streamlit_app.py
# c027746 verified
import streamlit as st
import fitz # PyMuPDF
import pandas as pd
import re
import io
# --- Core PDF Processing Functions ---
def find_sections(pdf_bytes, marker_pattern):
    """
    Scan a PDF for section markers and derive each section's page range.

    The text of every page is searched with ``marker_pattern``
    (case-insensitive, multi-line). Capture group 1 of each match must be the
    section number written as digits. The first page on which a given number
    is seen becomes that section's start page; later repeats are ignored.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        marker_pattern: Regular expression whose first capture group is the
            section number.

    Returns:
        A dict keyed ``"Q<number>"`` in ascending numeric order. Each value is
        ``{'start': page, 'end': page}`` with 0-based, inclusive page indices,
        e.g. ``{'Q1': {'start': 0, 'end': 1}, 'Q2': {'start': 2, 'end': 3}}``.
        An empty dict is returned when no marker is found.

    Raises:
        ValueError: If group 1 of a match is not an integer literal.
    """
    # Example pattern: r"^Question\s*(\d+)" -- group 1 must be the digits,
    # because it is fed straight into int() below.
    pattern = re.compile(marker_pattern, re.IGNORECASE | re.MULTILINE)

    found_items = []
    seen_keys = set()  # O(1) duplicate check instead of rescanning found_items

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        page_count = doc.page_count
        for page_num, page in enumerate(doc):
            text = page.get_text("text")
            for match in pattern.finditer(text):
                question_num_str = match.group(1)
                key = f"Q{question_num_str}"
                if key in seen_keys:
                    continue
                seen_keys.add(key)
                # Keep the numeric value so Q2 sorts before Q10.
                found_items.append(
                    {'key': key, 'page': page_num, 'num': int(question_num_str)}
                )

    if not found_items:
        return {}

    # Sort items numerically to handle Q1, Q2, Q10 correctly.
    found_items.sort(key=lambda x: x['num'])

    sections = {}
    for i, item in enumerate(found_items):
        start_page = item['page']
        if i + 1 < len(found_items):
            # A section runs up to the page before the next section starts.
            end_page = found_items[i + 1]['page'] - 1
        else:
            # The last section runs to the end of the document.
            end_page = page_count - 1
        # Two sections starting on the same page (or appearing out of numeric
        # order) would give end < start; clamp to a single-page range.
        end_page = max(end_page, start_page)
        sections[item['key']] = {'start': start_page, 'end': end_page}
    return sections
def extract_section_pdf(pdf_bytes, start_page, end_page):
    """
    Copy an inclusive page range out of a PDF into a new standalone PDF.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        start_page: 0-based index of the first page to copy.
        end_page: 0-based index of the last page to copy (inclusive).

    Returns:
        The bytes of a new PDF containing only the requested pages.
    """
    # Both documents are closed on exit so repeated extractions do not
    # accumulate open PyMuPDF handles.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as source_doc, \
            fitz.open() as new_doc:  # fitz.open() with no args: new empty PDF
        new_doc.insert_pdf(source_doc, from_page=start_page, to_page=end_page)
        return new_doc.tobytes()
# --- Streamlit UI ---
st.set_page_config(layout="wide")
st.title("📄 PDF Section Splitter & Mapper")
st.info("Upload your question, rubric, and solution PDFs. The tool will find sections (Q1, Q2...) and map them together.")

# File uploaders: one PDF per document type, laid out side by side.
col1, col2, col3 = st.columns(3)
with col1:
    q_file = st.file_uploader("1. Upload Questions PDF", type="pdf")
with col2:
    r_file = st.file_uploader("2. Upload Rubrics PDF", type="pdf")
with col3:
    s_file = st.file_uploader("3. Upload Solutions PDF", type="pdf")

# User-defined marker
marker_text = st.text_input(
    "Enter the text marker for questions (e.g., 'Question', 'Q', 'Problem')",
    value="Question"
)

# Session-state slot holding the last processing result. Clicking a download
# button reruns the script with the "Process" button reporting False, so the
# results must outlive a single run to stay on screen.
_RESULTS_STATE_KEY = "pdf_section_results"

# (column heading, result-dict key / file-name suffix) for the three columns.
_SECTION_COLUMNS = (
    ("Question", "question"),
    ("Rubric", "rubric"),
    ("Solution", "solution"),
)


def _extract_if_present(pdf_bytes, sections, key):
    """Return the extracted PDF bytes for section `key`, or None if absent."""
    page_range = sections.get(key)
    if page_range is None:
        return None
    # The range dict uses 'start'/'end', which are not the parameter names of
    # extract_section_pdf, so the values are passed positionally.
    return extract_section_pdf(pdf_bytes, page_range['start'], page_range['end'])


def _render_section_column(title, kind, section_key, pdf_bytes):
    """Render heading, first-page preview and download button for one PDF."""
    st.markdown(f"#### {title}")
    if not pdf_bytes:
        st.warning("Not found")
        return
    with st.expander("👁️ Preview"):
        try:
            # Close the preview document once the first page is rasterised.
            with fitz.open(stream=pdf_bytes, filetype="pdf") as preview_doc:
                pix = preview_doc[0].get_pixmap()
                st.image(pix.tobytes())
        except Exception as e:
            # A failed preview must not block the download below.
            st.error(f"Could not generate preview: {e}")
    st.download_button(
        label="⬇️ Download PDF",
        data=pdf_bytes,
        file_name=f"{section_key.lower().replace(' ', '_')}_{kind}.pdf",
        mime="application/pdf"
    )


if st.button("🚀 Process PDFs", disabled=(not all([q_file, r_file, s_file]))):
    # The marker is literal text typed by the user: escape it so characters
    # such as '.', '(' or '|' cannot alter the pattern or its group numbering.
    # The pattern is: marker at line start, optional space, captured digits.
    marker_pattern = rf"^{re.escape(marker_text)}\s*(\d+)"
    with st.spinner("Processing documents... This might take a moment."):
        # Read file bytes
        q_bytes = q_file.getvalue()
        r_bytes = r_file.getvalue()
        s_bytes = s_file.getvalue()

        # Find sections in all three documents
        q_sections = find_sections(q_bytes, marker_pattern)
        r_sections = find_sections(r_bytes, marker_pattern)
        s_sections = find_sections(s_bytes, marker_pattern)

        # Unique question keys across all documents, in numeric order
        # (Q1, Q2, Q10 rather than Q1, Q10, Q2).
        all_keys = sorted(
            set(q_sections) | set(r_sections) | set(s_sections),
            key=lambda x: int(re.search(r'\d+', x).group())
        )

        # Extract the PDF section for each type if it exists.
        st.session_state[_RESULTS_STATE_KEY] = [
            {
                'key': key,
                'question': _extract_if_present(q_bytes, q_sections, key),
                'rubric': _extract_if_present(r_bytes, r_sections, key),
                'solution': _extract_if_present(s_bytes, s_sections, key),
            }
            for key in all_keys
        ]

# Display the most recent results (None until "Process" has run once).
results = st.session_state.get(_RESULTS_STATE_KEY)
if results is not None:
    if not results:
        st.error("Could not find any sections with the provided marker. Please check your PDFs or refine the marker text.")
    else:
        st.success(f"Found {len(results)} unique sections! Displaying results below.")
        for item in results:
            st.markdown("---")
            st.subheader(f"Section: {item['key']}")
            for column, (title, kind) in zip(st.columns(3), _SECTION_COLUMNS):
                with column:
                    _render_section_column(title, kind, item['key'], item[kind])