File size: 7,077 Bytes
568c150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import streamlit as st
import fitz  # PyMuPDF
import pandas as pd
import re
import io

# --- Core PDF Processing Functions ---

def find_sections(pdf_bytes, marker_pattern):
    r"""
    Scan a PDF and work out the page range covered by each marked section.

    Every page's text is searched with ``marker_pattern`` (case-insensitive,
    multi-line). The first page on which a section number appears is that
    section's start. A section ends on the page before the next section
    starts (never earlier than its own start page), and the last section runs
    to the final page of the document.

    Args:
        pdf_bytes: Raw bytes of the PDF to scan.
        marker_pattern: Regular expression source that locates a section
            heading. The section number is read from capture group 2 when the
            pattern has two or more groups (e.g. "^(Question|Q)\s*(\d+)"),
            from group 1 when it has exactly one (e.g. "^Question\s*(\d+)"),
            and from the first run of digits in the matched text when it has
            no groups at all.

    Returns:
        A dict keyed by 'Q<number>' in order of first appearance, e.g.
        {'Q1': {'start': 0, 'end': 1}, 'Q2': {'start': 2, 'end': 3}}.
        Page numbers are zero-based and inclusive. The dict is empty when
        nothing matches.
    """
    pattern = re.compile(marker_pattern, re.IGNORECASE | re.MULTILINE)
    # Unconditionally reading group(2) raises IndexError for single-group
    # patterns, so pick the group according to what the pattern provides.
    number_group = min(pattern.groups, 2)

    found_items = []   # (key, page index) in order of first appearance
    seen_keys = set()  # O(1) duplicate check instead of rebuilding a list

    # The context manager closes the document even if text extraction fails.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        page_count = doc.page_count
        for page_num, page in enumerate(doc):
            text = page.get_text("text")
            for match in pattern.finditer(text):
                if number_group:
                    question_num = match.group(number_group)
                else:
                    digit_run = re.search(r"\d+", match.group(0))
                    question_num = digit_run.group(0) if digit_run else None
                if not question_num:
                    # The number group did not take part in this match.
                    continue

                # We use a key like 'Q1', 'Q2' for consistency
                key = f"Q{question_num}"
                if key not in seen_keys:
                    seen_keys.add(key)
                    found_items.append((key, page_num))

    # Pair every section with the start page of the one that follows it; the
    # page count acts as the "next start" for the final section.
    next_starts = [page for _, page in found_items[1:]] + [page_count]

    sections = {}
    for (key, start_page), next_start in zip(found_items, next_starts):
        # Two markers on the same page would put the end before the start.
        end_page = max(start_page, next_start - 1)
        sections[key] = {'start': start_page, 'end': end_page}

    return sections

def extract_section_pdf(pdf_bytes, start_page, end_page):
    """
    Copy a range of pages out of a PDF and return them as a new PDF.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        start_page: Zero-based index of the first page to copy.
        end_page: Zero-based index of the last page to copy (inclusive).

    Returns:
        The bytes of a new PDF containing only the requested pages.
    """
    # Both documents are closed on exit, including when insert_pdf raises, so
    # repeated calls do not accumulate open PyMuPDF documents.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as source_doc, \
            fitz.open() as new_doc:  # fitz.open() with no args: empty PDF
        new_doc.insert_pdf(source_doc, from_page=start_page, to_page=end_page)
        # Serialize before the documents are closed.
        return new_doc.tobytes()

# --- Streamlit UI ---

# Session-state slot holding the most recent processing results. Clicking any
# widget (including a download button) reruns the script, and st.button() is
# only True on the run right after its own click, so results kept in a local
# variable would disappear as soon as the user downloads one section.
_RESULTS_STATE_KEY = "pdf_section_results"

# (result-dict key, column heading) for the three document types, in the
# order their columns are displayed.
_SECTION_KINDS = (
    ("question", "Question"),
    ("rubric", "Rubric"),
    ("solution", "Solution"),
)


def _render_section_cell(column, heading, pdf_bytes, file_name, widget_key):
    """Render one preview/download cell, or a warning when the PDF is missing."""
    with column:
        st.markdown(f"#### {heading}")
        if not pdf_bytes:
            st.warning("Not found")
            return

        with st.expander("👁️ Preview"):
            # Render the first page to image bytes, then release the document.
            with fitz.open(stream=pdf_bytes, filetype="pdf") as preview_doc:
                preview_image = preview_doc[0].get_pixmap().tobytes()
            st.image(preview_image)

        st.download_button(
            label="⬇️ Download PDF",
            data=pdf_bytes,
            file_name=file_name,
            mime="application/pdf",
            key=widget_key,  # explicit key: all buttons share the same label
        )


st.set_page_config(layout="wide")
st.title("📄 PDF Section Splitter & Mapper")

st.info("Upload your question, rubric, and solution PDFs. The tool will find sections (Q1, Q2...) and map them together.")

# File Uploaders
col1, col2, col3 = st.columns(3)
with col1:
    q_file = st.file_uploader("1. Upload Questions PDF", type="pdf")
with col2:
    r_file = st.file_uploader("2. Upload Rubrics PDF", type="pdf")
with col3:
    s_file = st.file_uploader("3. Upload Solutions PDF", type="pdf")

# User-defined marker
marker_text = st.text_input(
    "Enter the text marker for questions (e.g., 'Question', 'Q', 'Problem')",
    value="Question"
)

if st.button("🚀 Process PDFs", disabled=(not all([q_file, r_file, s_file]))):
    marker = marker_text.strip()

    if not marker:
        # Without a marker the pattern would match any line starting with digits.
        st.error("Please enter a non-empty marker text.")
    else:
        # The marker is matched literally (re.escape), so characters such as
        # '.', '(' or '|' in user input cannot break or alter the regex.
        # Group 1 is the marker and group 2 the section number, which is the
        # layout find_sections() documents in its example pattern.
        marker_pattern = rf"^({re.escape(marker)})\s*(\d+)"

        with st.spinner("Processing documents... This might take a moment."):
            # Read file bytes
            documents = {
                'question': q_file.getvalue(),
                'rubric': r_file.getvalue(),
                'solution': s_file.getvalue(),
            }

            # Find sections in all three documents
            sections = {
                kind: find_sections(data, marker_pattern)
                for kind, data in documents.items()
            }

            # Unique keys across all documents, sorted numerically so that
            # Q10 comes after Q2 rather than after Q1.
            all_keys = sorted(
                set().union(*sections.values()),
                key=lambda x: int(x[1:]),
            )

            results = []
            for key in all_keys:
                entry = {'key': key}
                for kind, data in documents.items():
                    page_range = sections[kind].get(key)
                    # Pass the range explicitly: the section dicts use the
                    # keys 'start'/'end', which do not match the parameter
                    # names of extract_section_pdf(), so **-unpacking fails.
                    entry[kind] = (
                        extract_section_pdf(data, page_range['start'], page_range['end'])
                        if page_range else None
                    )
                results.append(entry)

        st.session_state[_RESULTS_STATE_KEY] = results

        if not results:
            st.error("Could not find any sections with the provided marker. Please check your PDFs or refine the marker text.")

# Display results. This runs on every rerun so the sections stay visible
# after a download button has been clicked.
stored_results = st.session_state.get(_RESULTS_STATE_KEY)
if stored_results:
    st.success(f"Found {len(stored_results)} unique sections! Displaying results below.")

    for item in stored_results:
        st.markdown("---")
        st.subheader(f"Section: {item['key']}")

        for column, (kind, heading) in zip(st.columns(3), _SECTION_KINDS):
            _render_section_cell(
                column,
                heading,
                item[kind],
                file_name=f"{item['key']}_{kind}.pdf",
                widget_key=f"download_{item['key']}_{kind}",
            )