# Page-scrape header (not part of the program):
#   Spaces status: Runtime error
#   File size: 7,077 bytes, revision 568c150
# The original line-number gutter (1-172) that followed was page chrome and has been dropped.
import streamlit as st
import fitz # PyMuPDF
import pandas as pd
import re
import io
# --- Core PDF Processing Functions ---
def find_sections(pdf_bytes, marker_pattern):
    """
    Scan a PDF for section markers and map every section to its page range.

    Args:
        pdf_bytes: Raw bytes of the PDF to scan.
        marker_pattern: Regular expression (string) matched against each
            page's text with re.IGNORECASE | re.MULTILINE. It must contain
            at least one capture group. With two or more groups the question
            number is read from group 2 (e.g. "^(Question|Q) ... (digits)");
            with exactly one group it is read from group 1.

    Returns:
        A dict keyed 'Q<number>' in order of first appearance, e.g.
        {'Q1': {'start': 0, 'end': 1}, 'Q2': {'start': 2, 'end': 3}}.
        Page numbers are 0-based and inclusive. Returns {} when no marker
        is found.

    Raises:
        ValueError: if marker_pattern contains no capture group.
    """
    # Compile once instead of re-resolving the pattern for every page.
    pattern = re.compile(marker_pattern, re.IGNORECASE | re.MULTILINE)
    if pattern.groups == 0:
        raise ValueError(
            "marker_pattern must contain a capture group for the question number"
        )
    # Keep the historical "number is the second group" convention, but also
    # accept single-group patterns instead of failing with "no such group".
    number_group = 2 if pattern.groups >= 2 else 1

    # key -> page of first occurrence; dicts keep insertion order, which gives
    # O(1) duplicate detection while preserving document order.
    first_page_by_key = {}
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        last_page = doc.page_count - 1
        for page_num, page in enumerate(doc):
            text = page.get_text("text")
            for match in pattern.finditer(text):
                question_num = match.group(number_group)
                if question_num is None:
                    # The number group did not take part in this match.
                    continue
                # Only the first occurrence of a key defines its start page.
                first_page_by_key.setdefault(f"Q{question_num}", page_num)

    if not first_page_by_key:
        return {}

    # Determine page ranges: a section runs until the page before the next
    # section starts; the final section runs to the end of the document.
    found_items = list(first_page_by_key.items())
    sections = {}
    for i, (key, start_page) in enumerate(found_items):
        if i + 1 < len(found_items):
            next_start = found_items[i + 1][1]
            # Two markers on the same page would give end < start; clamp it.
            end_page = max(next_start - 1, start_page)
        else:
            end_page = last_page
        sections[key] = {'start': start_page, 'end': end_page}
    return sections
def extract_section_pdf(pdf_bytes, start_page, end_page):
    """
    Copy an inclusive page range out of a PDF and return it as new PDF bytes.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        start_page: 0-based index of the first page to copy.
        end_page: 0-based index of the last page to copy (inclusive).

    Returns:
        The bytes of a new PDF holding only the requested pages.
    """
    # Both documents are closed on exit, even if the copy fails, so repeated
    # calls do not accumulate open PyMuPDF documents.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as source_doc, \
            fitz.open() as new_doc:  # fitz.open() with no args = new empty PDF
        new_doc.insert_pdf(source_doc, from_page=start_page, to_page=end_page)
        # Serialize before the documents are closed.
        return new_doc.tobytes()
# --- Streamlit UI ---
# NOTE(review): several labels below start with glyphs that look like
# mis-encoded emoji. They are kept verbatim; confirm the intended characters.

# Session-state slot holding the most recent processing results, so they
# survive the script rerun that every widget click (e.g. a download) triggers.
_RESULTS_KEY = "section_results"


def _build_marker_pattern(marker):
    """Return the regex for '<marker> <digits>' at the start of a line.

    The marker is escaped so user input is matched literally. It is wrapped
    in a capture group so the digits are the second group.
    """
    return rf"^({re.escape(marker)})\s*(\d+)"


def _extract_if_present(pdf_bytes, sections, key):
    """Return the PDF bytes for section `key`, or None if it is not in `sections`."""
    section = sections.get(key)
    if section is None:
        return None
    # Pass the bounds positionally: the section dict uses 'start'/'end',
    # which are not the parameter names of extract_section_pdf.
    return extract_section_pdf(pdf_bytes, section['start'], section['end'])


def _render_section_column(title, pdf_bytes, section_key, suffix):
    """Render one column: heading, first-page preview and download button."""
    st.markdown(f"#### {title}")
    if not pdf_bytes:
        st.warning("Not found")
        return
    with st.expander("๐๏ธ Preview"):
        # Rasterize the first page, then close the document straight away.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as preview_doc:
            png_bytes = (
                preview_doc[0].get_pixmap().tobytes()
                if preview_doc.page_count
                else None
            )
        if png_bytes:
            st.image(png_bytes)
    st.download_button(
        label="โฌ๏ธ Download PDF",
        data=pdf_bytes,
        file_name=f"{section_key}_{suffix}.pdf",
        mime="application/pdf",
        # Explicit key: many buttons share the same label.
        key=f"download_{section_key}_{suffix}",
    )


st.set_page_config(layout="wide")
st.title("๐ PDF Section Splitter & Mapper")
st.info("Upload your question, rubric, and solution PDFs. The tool will find sections (Q1, Q2...) and map them together.")

# File uploaders
col1, col2, col3 = st.columns(3)
with col1:
    q_file = st.file_uploader("1. Upload Questions PDF", type="pdf")
with col2:
    r_file = st.file_uploader("2. Upload Rubrics PDF", type="pdf")
with col3:
    s_file = st.file_uploader("3. Upload Solutions PDF", type="pdf")

# User-defined marker
marker_text = st.text_input(
    "Enter the text marker for questions (e.g., 'Question', 'Q', 'Problem')",
    value="Question"
)

all_uploaded = all([q_file, r_file, s_file])
if not all_uploaded and _RESULTS_KEY in st.session_state:
    # Do not keep showing results that belong to files no longer uploaded.
    del st.session_state[_RESULTS_KEY]

if st.button("๐ Process PDFs", disabled=not all_uploaded):
    marker = marker_text.strip()
    if not marker:
        # An empty marker would match every line that starts with a number.
        if _RESULTS_KEY in st.session_state:
            del st.session_state[_RESULTS_KEY]
        st.error("Please enter the text marker for questions.")
    else:
        marker_pattern = _build_marker_pattern(marker)
        with st.spinner("Processing documents... This might take a moment."):
            # Read file bytes
            q_bytes = q_file.getvalue()
            r_bytes = r_file.getvalue()
            s_bytes = s_file.getvalue()

            # Find sections in all three documents
            q_sections = find_sections(q_bytes, marker_pattern)
            r_sections = find_sections(r_bytes, marker_pattern)
            s_sections = find_sections(s_bytes, marker_pattern)

            # Unique question keys, sorted numerically (Q1, Q2, Q10).
            all_keys = sorted(
                q_sections.keys() | r_sections.keys() | s_sections.keys(),
                key=lambda x: int(x[1:]),
            )

            # Extract the PDF section for each document type if it exists.
            results = [
                {
                    'key': key,
                    'question': _extract_if_present(q_bytes, q_sections, key),
                    'rubric': _extract_if_present(r_bytes, r_sections, key),
                    'solution': _extract_if_present(s_bytes, s_sections, key),
                }
                for key in all_keys
            ]

        st.session_state[_RESULTS_KEY] = results
        if not results:
            st.error("Could not find any sections with the provided marker. Please check your PDFs or refine the marker text.")
        else:
            st.success(f"Found {len(results)} unique sections! Displaying results below.")

# Display results outside the button branch: st.button is only True on the
# run right after the click, while download clicks trigger further reruns.
stored_results = st.session_state[_RESULTS_KEY] if _RESULTS_KEY in st.session_state else []
for item in stored_results:
    st.markdown("---")
    st.subheader(f"Section: {item['key']}")
    c1, c2, c3 = st.columns(3)
    with c1:
        _render_section_column("Question", item['question'], item['key'], "question")
    with c2:
        _render_section_column("Rubric", item['rubric'], item['key'], "rubric")
    with c3:
        _render_section_column("Solution", item['solution'], item['key'], "solution")