Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import fitz # PyMuPDF | |
| import pandas as pd | |
| import re | |
| import io | |
| # --- Core PDF Processing Functions --- | |
def find_sections(pdf_bytes, marker_pattern):
    """
    Scan a PDF and find the start and end pages of sections based on a pattern.

    Args:
        pdf_bytes: Raw bytes of the PDF to scan.
        marker_pattern: Regular expression (string) applied to each page's
            text with re.IGNORECASE | re.MULTILINE. Its FIRST capturing group
            must capture the section number as digits, e.g. the marker word
            followed by optional whitespace and a captured run of digits.

    Returns:
        A dictionary mapping "Q<number>" to 0-based, inclusive page indices,
        like {'Q1': {'start': 0, 'end': 1}, 'Q2': {'start': 2, 'end': 3}}.
        An empty dictionary is returned when the pattern never matches.

    Raises:
        ValueError: if the first capturing group does not hold an integer.
    """
    # Compile once instead of re-resolving the pattern for every page.
    marker_re = re.compile(marker_pattern, re.IGNORECASE | re.MULTILINE)

    # key -> first occurrence. A dict gives O(1) duplicate detection and keeps
    # encounter order, so the stable sort below behaves like a list would.
    first_seen = {}

    # The context manager closes the document even if text extraction or
    # int() conversion raises.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        page_count = doc.page_count
        for page_num, page in enumerate(doc):
            text = page.get_text("text")
            for match in marker_re.finditer(text):
                # The raw number string is kept in the key; the int is used
                # only for sorting.
                question_num_str = match.group(1)
                key = f"Q{question_num_str}"
                if key not in first_seen:
                    first_seen[key] = {
                        'key': key,
                        'page': page_num,
                        'num': int(question_num_str),
                    }

    if not first_seen:
        return {}

    # Sort numerically so that Q1, Q2, Q10 come out in that order.
    found_items = sorted(first_seen.values(), key=lambda item: item['num'])

    sections = {}
    for i, item in enumerate(found_items):
        start_page = item['page']
        if i + 1 < len(found_items):
            # A section runs up to the page before the next section starts.
            end_page = found_items[i + 1]['page'] - 1
        else:
            # The last section runs to the end of the document.
            end_page = page_count - 1
        # Two markers on the same page (or out-of-order markers) would put
        # the end before the start; clamp to a single-page section.
        end_page = max(end_page, start_page)
        sections[item['key']] = {'start': start_page, 'end': end_page}
    return sections
def extract_section_pdf(pdf_bytes, start_page, end_page):
    """
    Extract a range of pages from a PDF and return it as new PDF bytes.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        start_page: 0-based index of the first page to copy.
        end_page: 0-based index of the last page to copy (inclusive).

    Returns:
        The bytes of a new PDF containing only the requested pages.
    """
    # Both documents are context-managed so their native resources are
    # released as soon as the bytes have been produced, even on error.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as source_doc, \
            fitz.open() as new_doc:  # fitz.open() with no args: empty PDF
        new_doc.insert_pdf(source_doc, from_page=start_page, to_page=end_page)
        return new_doc.tobytes()
# --- Streamlit UI ---
# Page chrome: wide layout, a title, and a short usage hint.
st.set_page_config(layout="wide")
st.title("π PDF Section Splitter & Mapper")
st.info("Upload your question, rubric, and solution PDFs. The tool will find sections (Q1, Q2...) and map them together.")

# One uploader per document type, laid out side by side. Calling the widget
# on the column object places it in that column.
col1, col2, col3 = st.columns(3)
q_file = col1.file_uploader("1. Upload Questions PDF", type="pdf")
r_file = col2.file_uploader("2. Upload Rubrics PDF", type="pdf")
s_file = col3.file_uploader("3. Upload Solutions PDF", type="pdf")

# The word that precedes each section number in the uploaded PDFs.
marker_text = st.text_input(
    label="Enter the text marker for questions (e.g., 'Question', 'Q', 'Problem')",
    value="Question",
)
# Session-state slot holding the last processed results, so they survive the
# script rerun that Streamlit performs whenever a widget (for example a
# download button) is clicked.
_RESULTS_STATE_KEY = "section_results"

# (column heading, key in each result dict / file-name suffix)
_SECTION_KINDS = (
    ("Question", "question"),
    ("Rubric", "rubric"),
    ("Solution", "solution"),
)


def _extract_or_none(pdf_bytes, sections, key):
    """Return the PDF bytes for `key`'s page range, or None if `key` is absent."""
    page_range = sections.get(key)
    if page_range is None:
        return None
    # find_sections() stores the range under 'start'/'end', whereas
    # extract_section_pdf() names its parameters start_page/end_page, so the
    # values are passed positionally; **-unpacking the dict raises TypeError.
    return extract_section_pdf(pdf_bytes, page_range['start'], page_range['end'])


def _render_section_column(title, pdf_bytes, section_key, kind):
    """Render heading, first-page preview and download button for one document."""
    st.markdown(f"#### {title}")
    if not pdf_bytes:
        st.warning("Not found")
        return

    # NOTE(review): the glyphs at the start of the labels below look like
    # mis-encoded emoji; kept as-is, restore the intended characters if known.
    with st.expander("ποΈ Preview"):
        # Best-effort preview: any rendering failure is reported in place
        # and must not prevent the download button from appearing.
        try:
            with fitz.open(stream=pdf_bytes, filetype="pdf") as preview_doc:
                pix = preview_doc[0].get_pixmap()
                st.image(pix.tobytes())
        except Exception as e:
            st.error(f"Could not generate preview: {e}")
    st.download_button(
        label="β¬οΈ Download PDF",
        data=pdf_bytes,
        file_name=f"{section_key.lower().replace(' ','_')}_{kind}.pdf",
        mime="application/pdf"
    )


if st.button("π Process PDFs", disabled=(not all([q_file, r_file, s_file]))):
    # The marker is user-typed literal text, so it is escaped before being
    # embedded in the regex: marker, optional whitespace, captured digits.
    marker_pattern = rf"^{re.escape(marker_text.strip())}\s*(\d+)"
    with st.spinner("Processing documents... This might take a moment."):
        # Read file bytes
        q_bytes = q_file.getvalue()
        r_bytes = r_file.getvalue()
        s_bytes = s_file.getvalue()

        # Find sections in all three documents
        q_sections = find_sections(q_bytes, marker_pattern)
        r_sections = find_sections(r_bytes, marker_pattern)
        s_sections = find_sections(s_bytes, marker_pattern)

        # Unique question keys across all documents, sorted numerically
        # (Q1, Q2, Q10 rather than Q1, Q10, Q2).
        all_keys = sorted(
            set(q_sections) | set(r_sections) | set(s_sections),
            key=lambda k: int(re.search(r'\d+', k).group()),
        )

        # Extract the PDF section for each type if it exists.
        st.session_state[_RESULTS_STATE_KEY] = [
            {
                'key': key,
                'question': _extract_or_none(q_bytes, q_sections, key),
                'rubric': _extract_or_none(r_bytes, r_sections, key),
                'solution': _extract_or_none(s_bytes, s_sections, key),
            }
            for key in all_keys
        ]

# Rendering happens outside the button branch: st.button() is True only for
# the single run triggered by the click, so results drawn inside it would
# vanish on the rerun caused by pressing a download button.
if _RESULTS_STATE_KEY in st.session_state:
    results = st.session_state[_RESULTS_STATE_KEY]
    if not results:
        st.error("Could not find any sections with the provided marker. Please check your PDFs or refine the marker text.")
    else:
        st.success(f"Found {len(results)} unique sections! Displaying results below.")
        for item in results:
            st.markdown("---")
            st.subheader(f"Section: {item['key']}")
            for column, (title, kind) in zip(st.columns(3), _SECTION_KINDS):
                with column:
                    _render_section_column(title, item[kind], item['key'], kind)