akashmishra358 committed on
Commit
55cd5f6
·
verified ·
1 Parent(s): c027746

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +184 -0
app.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import fitz # PyMuPDF
3
+ import pandas as pd
4
+ import re
5
+ import io
6
+
7
+ # --- Core PDF Processing Functions ---
8
+
9
def find_sections(pdf_bytes, marker_pattern):
    """
    Scan a PDF and map each numbered section marker to a page range.

    Args:
        pdf_bytes: Raw bytes of the PDF document to scan.
        marker_pattern: Regular expression whose first capturing group is the
            section number (digits). It is applied to the text of every page
            with re.IGNORECASE | re.MULTILINE.

    Returns:
        A dictionary keyed by "Q<number>" holding 0-based, inclusive page
        indices, like {'Q1': {'start': 0, 'end': 1}, 'Q2': {'start': 2, 'end': 3}}.
        An empty dictionary is returned when no marker is found.
    """
    # Compile once instead of re-resolving the pattern for every page.
    pattern = re.compile(marker_pattern, re.IGNORECASE | re.MULTILINE)

    found_items = []
    seen_keys = set()  # O(1) duplicate check; only the first occurrence counts

    # The context manager releases the document even if extraction fails.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        page_count = doc.page_count
        for page_num, page in enumerate(doc):
            text = page.get_text("text")
            for match in pattern.finditer(text):
                question_num_str = match.group(1)
                key = f"Q{question_num_str}"
                if key in seen_keys:
                    continue
                seen_keys.add(key)
                # Keep the numeric value so Q1, Q2, Q10 sort correctly later.
                found_items.append(
                    {'key': key, 'page': page_num, 'num': int(question_num_str)}
                )

    if not found_items:
        return {}

    # Sort items numerically to handle Q1, Q2, Q10 correctly
    found_items.sort(key=lambda item: item['num'])

    sections = {}
    last_page = page_count - 1
    for i, item in enumerate(found_items):
        start_page = item['page']
        if i + 1 < len(found_items):
            # A section runs up to the page before the next section starts.
            end_page = found_items[i + 1]['page'] - 1
        else:
            # The final section runs to the end of the document.
            end_page = last_page

        # Two markers on the same page (or out-of-order markers) would give
        # end < start; clamp so every section covers at least its own page.
        sections[item['key']] = {
            'start': start_page,
            'end': max(start_page, end_page),
        }

    return sections
51
+
52
def extract_section_pdf(pdf_bytes, start_page, end_page):
    """
    Extract a range of pages from a PDF and return it as new PDF bytes.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        start_page: 0-based index of the first page to copy.
        end_page: 0-based index of the last page to copy (inclusive).

    Returns:
        The bytes of a new PDF containing only the requested pages.
    """
    # Both documents are closed on exit, so repeated calls (one per section
    # and per uploaded file) do not accumulate open document handles.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as source_doc, \
            fitz.open() as new_doc:  # fitz.open() with no args = empty PDF
        new_doc.insert_pdf(source_doc, from_page=start_page, to_page=end_page)
        return new_doc.tobytes()
60
+
61
# --- Streamlit UI ---

# Session-state slot for the latest processing results. Streamlit reruns the
# whole script on every widget interaction (including download buttons), and
# st.button is only True for the single run right after the click, so results
# must be stored here to stay visible after a download.
_RESULTS_STATE_KEY = "split_results"


def _render_section_column(title, pdf_bytes, section_key, suffix):
    """Render heading, first-page preview and download button for one PDF part."""
    st.markdown(f"#### {title}")
    if not pdf_bytes:
        st.warning("Not found")
        return

    with st.expander("👁️ Preview"):
        try:
            # Render the image bytes while the document is still open, then
            # let the context manager close it.
            with fitz.open(stream=pdf_bytes, filetype="pdf") as preview_doc:
                image_bytes = preview_doc[0].get_pixmap().tobytes()
            st.image(image_bytes)
        except Exception as e:
            # UI boundary: a failed preview must not block the download.
            st.error(f"Could not generate preview: {e}")

    st.download_button(
        label="⬇️ Download PDF",
        data=pdf_bytes,
        file_name=f"{section_key.lower().replace(' ','_')}_{suffix}.pdf",
        mime="application/pdf",
        # Explicit key keeps the many identically-labelled buttons distinct.
        key=f"download_{section_key}_{suffix}",
    )


st.set_page_config(layout="wide")
st.title("📄 PDF Section Splitter & Mapper")

st.info("Upload your question, rubric, and solution PDFs. The tool will find sections (Q1, Q2...) and map them together.")

# File Uploaders
col1, col2, col3 = st.columns(3)
with col1:
    q_file = st.file_uploader("1. Upload Questions PDF", type="pdf")
with col2:
    r_file = st.file_uploader("2. Upload Rubrics PDF", type="pdf")
with col3:
    s_file = st.file_uploader("3. Upload Solutions PDF", type="pdf")

# User-defined marker
marker_text = st.text_input(
    "Enter the text marker for questions (e.g., 'Question', 'Q', 'Problem')",
    value="Question"
)

if st.button("🚀 Process PDFs", disabled=(not all([q_file, r_file, s_file]))):

    # Build the regex: the literal marker at the start of a line, optional
    # whitespace, then the captured digits. The marker is escaped because it
    # is free-form user text; characters such as "(" or "." would otherwise
    # break the pattern or shift the capture group find_sections relies on.
    marker_pattern = rf"^{re.escape(marker_text)}\s*(\d+)"

    with st.spinner("Processing documents... This might take a moment."):
        # Read file bytes
        q_bytes = q_file.getvalue()
        r_bytes = r_file.getvalue()
        s_bytes = s_file.getvalue()

        # Find sections in all three documents
        q_sections = find_sections(q_bytes, marker_pattern)
        r_sections = find_sections(r_bytes, marker_pattern)
        s_sections = find_sections(s_bytes, marker_pattern)

        # Unique question keys across all documents, sorted numerically so
        # that Q10 comes after Q2.
        all_keys = sorted(
            set(q_sections) | set(r_sections) | set(s_sections),
            key=lambda k: int(re.search(r'\d+', k).group()),
        )

        # Extract the PDF section for each type if it exists
        processed = []
        for key in all_keys:
            processed.append({
                'key': key,
                'question': extract_section_pdf(q_bytes, **q_sections[key]) if key in q_sections else None,
                'rubric': extract_section_pdf(r_bytes, **r_sections[key]) if key in r_sections else None,
                'solution': extract_section_pdf(s_bytes, **s_sections[key]) if key in s_sections else None,
            })

    st.session_state[_RESULTS_STATE_KEY] = processed

# Render from session state (not from inside the button branch) so the
# results are still shown on the rerun that follows a download click.
if _RESULTS_STATE_KEY in st.session_state:
    results = st.session_state[_RESULTS_STATE_KEY]

    if not results:
        st.error("Could not find any sections with the provided marker. Please check your PDFs or refine the marker text.")
    else:
        st.success(f"Found {len(results)} unique sections! Displaying results below.")

        # Display results
        for item in results:
            st.markdown("---")
            st.subheader(f"Section: {item['key']}")

            c1, c2, c3 = st.columns(3)
            with c1:
                _render_section_column("Question", item['question'], item['key'], "question")
            with c2:
                _render_section_column("Rubric", item['rubric'], item['key'], "rubric")
            with c3:
                _render_section_column("Solution", item['solution'], item['key'], "solution")