Spaces:

akashmishra358
/

pdfcut

Sleeping

App Files Files Community

pdfcut / src /streamlit_app.py

akashmishra358

Update src/streamlit_app.py

c027746 verified 4 months ago

raw

history blame contribute delete

7.8 kB

	import streamlit as st
	import fitz # PyMuPDF
	import pandas as pd
	import re
	import io

	# --- Core PDF Processing Functions ---

	def find_sections(pdf_bytes, marker_pattern):
	    """
	    Scan a PDF and return the page range of every numbered section.

	    Each page's text is searched with ``marker_pattern`` (case-insensitive,
	    multi-line). The FIRST capturing group of the pattern must capture the
	    section number's digits.

	    Args:
	        pdf_bytes: Raw bytes of the PDF to scan.
	        marker_pattern: Regular-expression string whose group 1 is the number.

	    Returns:
	        A dictionary like
	        {'Q1': {'start': 0, 'end': 1}, 'Q2': {'start': 2, 'end': 3}}
	        with 0-based, inclusive page indices, ordered by section number.
	        Empty when no marker is found.
	    """
	    # Example pattern: r"^Question\s*(\d+)" -- group 1 holds the digits.
	    # A pattern such as r"^(Question|Q)\s*(\d+)" would NOT work, because its
	    # group 1 is the marker word rather than the number.
	    marker_re = re.compile(marker_pattern, re.IGNORECASE | re.MULTILINE)

	    # Section number -> page of its first occurrence. Keying on the integer
	    # (not the raw digit string) makes "Question 01" and "Question 1" the same
	    # section, so the key produced for one PDF matches the key of another.
	    first_page_by_number = {}

	    # The context manager closes the document even if text extraction fails.
	    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
	        page_count = doc.page_count
	        for page_num, page in enumerate(doc):
	            text = page.get_text("text")
	            for match in marker_re.finditer(text):
	                # setdefault keeps only the first page a number appears on.
	                first_page_by_number.setdefault(int(match.group(1)), page_num)

	    if not first_page_by_number:
	        return {}

	    # Sort numerically so that Q1, Q2, Q10 come out in that order.
	    ordered = sorted(first_page_by_number.items())

	    sections = {}
	    for index, (number, start_page) in enumerate(ordered):
	        if index + 1 < len(ordered):
	            # A section ends on the page before the next section starts.
	            end_page = ordered[index + 1][1] - 1
	        else:
	            # The last section runs to the end of the document.
	            end_page = page_count - 1

	        # Two sections starting on the same page (or markers out of page
	        # order) would give end < start; fall back to a single page.
	        end_page = max(end_page, start_page)

	        sections[f"Q{number}"] = {'start': start_page, 'end': end_page}

	    return sections

	def extract_section_pdf(pdf_bytes, start_page=None, end_page=None, *, start=None, end=None):
	    """
	    Extract a range of pages from a PDF and return it as new PDF bytes.

	    Args:
	        pdf_bytes: Raw bytes of the source PDF.
	        start_page: 0-based index of the first page to copy.
	        end_page: 0-based index of the last page to copy (inclusive).
	        start: Keyword alias for ``start_page``.
	        end: Keyword alias for ``end_page``.

	    The aliases exist so that a page-range dictionary produced by
	    ``find_sections`` ({'start': ..., 'end': ...}) can be passed with ``**``.

	    Returns:
	        The bytes of a new PDF containing only the requested pages.

	    Raises:
	        TypeError: If the start or the end page is not supplied.
	    """
	    if start_page is None:
	        start_page = start
	    if end_page is None:
	        end_page = end
	    if start_page is None or end_page is None:
	        raise TypeError("extract_section_pdf() requires both a start and an end page")

	    # Both documents are closed on exit, including when insert_pdf raises.
	    with fitz.open(stream=pdf_bytes, filetype="pdf") as source_doc, fitz.open() as new_doc:
	        new_doc.insert_pdf(source_doc, from_page=start_page, to_page=end_page)
	        # Serialize before the documents are closed.
	        return new_doc.tobytes()

	# --- Streamlit UI ---

	# Session-state slot holding the last processing run. Streamlit reruns the
	# whole script on every widget interaction (including a download click), and
	# st.button is only True on the rerun right after its own click, so results
	# must be stored here to stay visible.
	_RESULTS_STATE_KEY = "pdfcut_results"

	# (column heading, result-dict key / file-name suffix), in display order.
	_DOCUMENT_KINDS = (
	    ("Question", "question"),
	    ("Rubric", "rubric"),
	    ("Solution", "solution"),
	)


	def _upload_signature(uploads, marker):
	    """Identify one combination of uploaded files and marker text."""
	    return tuple((upload.name, upload.size) for upload in uploads) + (marker,)


	def _split_documents(documents, marker_pattern):
	    """
	    Split every document into sections and group the pieces by section key.

	    ``documents`` maps a kind ('question', 'rubric', 'solution') to raw PDF
	    bytes. Returns a list of dicts with a 'key' entry plus one entry per kind
	    holding that section's PDF bytes, or None when the document lacks it.
	    """
	    sections_by_kind = {
	        kind: find_sections(data, marker_pattern)
	        for kind, data in documents.items()
	    }

	    # Unique keys across all documents, sorted numerically (Q1, Q2, Q10).
	    all_keys = sorted(
	        set().union(*sections_by_kind.values()),
	        key=lambda key: int(re.search(r'\d+', key).group()),
	    )

	    results = []
	    for key in all_keys:
	        entry = {'key': key}
	        for kind, data in documents.items():
	            page_range = sections_by_kind[kind].get(key)
	            if page_range is None:
	                entry[kind] = None
	            else:
	                # Pass the range positionally: the dict keys are 'start'/'end',
	                # which are not the function's parameter names.
	                entry[kind] = extract_section_pdf(
	                    data, page_range['start'], page_range['end']
	                )
	        results.append(entry)
	    return results


	def _render_section_column(column, heading, kind, section_key, pdf_bytes):
	    """Render one result column: first-page preview and download button."""
	    with column:
	        st.markdown(f"#### {heading}")
	        if not pdf_bytes:
	            st.warning("Not found")
	            return

	        with st.expander("👁️ Preview"):
	            # The preview is best-effort: a rendering failure must not stop
	            # the user from downloading the section.
	            try:
	                with fitz.open(stream=pdf_bytes, filetype="pdf") as preview_doc:
	                    pix = preview_doc[0].get_pixmap()
	                    st.image(pix.tobytes())
	            except Exception as e:
	                st.error(f"Could not generate preview: {e}")

	        st.download_button(
	            label="⬇️ Download PDF",
	            data=pdf_bytes,
	            file_name=f"{section_key.lower().replace(' ','_')}_{kind}.pdf",
	            mime="application/pdf",
	            # Explicit key keeps every button's identity unique and stable.
	            key=f"download_{section_key}_{kind}",
	        )


	st.set_page_config(layout="wide")
	st.title("📄 PDF Section Splitter & Mapper")

	st.info("Upload your question, rubric, and solution PDFs. The tool will find sections (Q1, Q2...) and map them together.")

	# File Uploaders
	col1, col2, col3 = st.columns(3)
	with col1:
	    q_file = st.file_uploader("1. Upload Questions PDF", type="pdf")
	with col2:
	    r_file = st.file_uploader("2. Upload Rubrics PDF", type="pdf")
	with col3:
	    s_file = st.file_uploader("3. Upload Solutions PDF", type="pdf")

	# User-defined marker
	marker_text = st.text_input(
	    "Enter the text marker for questions (e.g., 'Question', 'Q', 'Problem')",
	    value="Question"
	)

	uploads = (q_file, r_file, s_file)
	uploads_ready = all(uploads)
	marker = marker_text.strip()

	if st.button("🚀 Process PDFs", disabled=not uploads_ready):
	    if not marker:
	        # An empty marker would match every line that starts with a number.
	        st.error("Please enter a marker text before processing.")
	    else:
	        # Marker (taken literally), optional whitespace, then the captured
	        # digits. re.escape stops characters such as '.', '(' or '|' in the
	        # user's text from being interpreted as regex syntax.
	        marker_pattern = rf"^{re.escape(marker)}\s*(\d+)"

	        documents = {
	            'question': q_file.getvalue(),
	            'rubric': r_file.getvalue(),
	            'solution': s_file.getvalue(),
	        }

	        try:
	            with st.spinner("Processing documents... This might take a moment."):
	                results = _split_documents(documents, marker_pattern)
	        except Exception as exc:
	            # UI boundary: report unreadable/corrupt PDFs instead of a traceback.
	            st.session_state.pop(_RESULTS_STATE_KEY, None)
	            st.error(f"Could not process the PDFs: {exc}")
	        else:
	            st.session_state[_RESULTS_STATE_KEY] = {
	                'signature': _upload_signature(uploads, marker),
	                'results': results,
	            }

	# Show the stored run only while it still matches the current uploads and
	# marker, so stale sections are never shown for different inputs.
	cached = st.session_state.get(_RESULTS_STATE_KEY)
	if (
	    uploads_ready
	    and cached is not None
	    and cached['signature'] == _upload_signature(uploads, marker)
	):
	    results = cached['results']
	    if not results:
	        st.error("Could not find any sections with the provided marker. Please check your PDFs or refine the marker text.")
	    else:
	        st.success(f"Found {len(results)} unique sections! Displaying results below.")

	        # Display results
	        for item in results:
	            st.markdown("---")
	            st.subheader(f"Section: {item['key']}")

	            for column, (heading, kind) in zip(st.columns(3), _DOCUMENT_KINDS):
	                _render_section_column(column, heading, kind, item['key'], item[kind])