akashmishra358 committed on
Commit
568c150
·
verified ·
1 Parent(s): a06a2eb

pdf cutter

Browse files
Files changed (1) hide show
  1. ai_studio_code.py +172 -0
ai_studio_code.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import fitz # PyMuPDF
3
+ import pandas as pd
4
+ import re
5
+ import io
6
+
7
+ # --- Core PDF Processing Functions ---
8
+
9
def find_sections(pdf_bytes, marker_pattern):
    """
    Scan a PDF for section markers and map each section to a page range.

    Every page's text is searched with ``marker_pattern`` (case-insensitive,
    multi-line). The section number is read from the pattern's LAST capturing
    group, so both ``r"^(Question|Q)\\s*(\\d+)"`` and ``r"^Question\\s*(\\d+)"``
    work. Only the first occurrence of each number is recorded.

    Args:
        pdf_bytes: Raw bytes of the PDF to scan.
        marker_pattern: Regular expression string with at least one capturing
            group; the last group must capture the section number.

    Returns:
        A dictionary keyed by ``'Q<number>'`` in order of first appearance,
        e.g. ``{'Q1': {'start': 0, 'end': 1}, 'Q2': {'start': 2, 'end': 3}}``.
        Page indices are zero-based and inclusive. A section ends on the page
        before the next section's first page (never before its own start);
        the last section runs to the end of the document. An empty dict is
        returned when no marker is found.

    Raises:
        ValueError: If ``marker_pattern`` contains no capturing group.
    """
    compiled = re.compile(marker_pattern, re.IGNORECASE | re.MULTILINE)
    number_group = compiled.groups
    if number_group == 0:
        raise ValueError(
            "marker_pattern must contain a capturing group for the section number"
        )

    # Maps section key -> first page it appears on. Dicts keep insertion
    # order, so this also records the order in which sections were found.
    first_page_by_key = {}

    # The context manager guarantees the document is closed even on error.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        page_count = doc.page_count
        for page_num, page in enumerate(doc):
            text = page.get_text("text")
            for match in compiled.finditer(text):
                question_num = match.group(number_group)
                if question_num is None:
                    # The number group did not take part in this match.
                    continue
                # We use a key like 'Q1', 'Q2' for consistency
                first_page_by_key.setdefault(f"Q{question_num}", page_num)

    if not first_page_by_key:
        return {}

    # Determine page ranges
    found_items = list(first_page_by_key.items())
    sections = {}
    for i, (key, start_page) in enumerate(found_items):
        if i + 1 < len(found_items):
            end_page = found_items[i + 1][1] - 1
        else:
            end_page = page_count - 1  # Last section runs to the end of the doc

        # Ensure end page is not before start page (e.g. when two sections
        # start on the same page).
        sections[key] = {'start': start_page, 'end': max(end_page, start_page)}

    return sections
48
+
49
def extract_section_pdf(pdf_bytes, start_page, end_page):
    """
    Extract a range of pages from a PDF and return it as new PDF bytes.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        start_page: Zero-based index of the first page to copy.
        end_page: Zero-based index of the last page to copy (inclusive).

    Returns:
        The bytes of a new PDF containing only the requested pages.
    """
    # Both documents are closed on exit, even if the copy fails, so repeated
    # calls (one per section and file) do not accumulate open documents.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as source_doc, \
            fitz.open() as new_doc:  # fitz.open() with no args: new empty PDF
        new_doc.insert_pdf(source_doc, from_page=start_page, to_page=end_page)
        return new_doc.tobytes()
57
+
58
# --- Streamlit UI ---

# Session-state slot holding the last processing results. Results must
# survive reruns: clicking a download button reruns the script, and on that
# rerun the "Process PDFs" button reports False again.
_RESULTS_STATE_KEY = "section_results"

# (heading shown in the UI, key used in result dicts and download file names)
_SECTION_PARTS = (
    ("Question", "question"),
    ("Rubric", "rubric"),
    ("Solution", "solution"),
)


def _extract_if_present(pdf_bytes, sections, key):
    """Return the PDF bytes for section `key`, or None if it was not found."""
    section = sections.get(key)
    if section is None:
        return None
    # Pass the page bounds by their real parameter names; the section dict
    # uses 'start'/'end', which do not match extract_section_pdf's signature.
    return extract_section_pdf(
        pdf_bytes,
        start_page=section['start'],
        end_page=section['end'],
    )


def _render_part(column, title, part, item):
    """Render preview and download controls for one part of one section."""
    pdf_data = item[part]
    with column:
        st.markdown(f"#### {title}")
        if not pdf_data:
            st.warning("Not found")
            return

        with st.expander("👁️ Preview"):
            # Render only the first page of the extracted section.
            with fitz.open(stream=pdf_data, filetype="pdf") as preview_doc:
                image_bytes = preview_doc[0].get_pixmap().tobytes()
            st.image(image_bytes)

        st.download_button(
            label="⬇️ Download PDF",
            data=pdf_data,
            file_name=f"{item['key']}_{part}.pdf",
            mime="application/pdf",
            # Explicit key: many buttons share the same label.
            key=f"download_{item['key']}_{part}",
        )


st.set_page_config(layout="wide")
st.title("📄 PDF Section Splitter & Mapper")

st.info("Upload your question, rubric, and solution PDFs. The tool will find sections (Q1, Q2...) and map them together.")

# File Uploaders
col1, col2, col3 = st.columns(3)
with col1:
    q_file = st.file_uploader("1. Upload Questions PDF", type="pdf")
with col2:
    r_file = st.file_uploader("2. Upload Rubrics PDF", type="pdf")
with col3:
    s_file = st.file_uploader("3. Upload Solutions PDF", type="pdf")

# User-defined marker
marker_text = st.text_input(
    "Enter the text marker for questions (e.g., 'Question', 'Q', 'Problem')",
    value="Question"
)

files_ready = all([q_file, r_file, s_file])
if not files_ready:
    # Drop results that belong to files that are no longer uploaded.
    st.session_state.pop(_RESULTS_STATE_KEY, None)

if st.button("🚀 Process PDFs", disabled=(not files_ready)):
    marker = marker_text.strip()
    if not marker:
        st.error("Please enter a non-empty text marker.")
        st.stop()

    # Build the regex pattern: group 1 is the (escaped) marker, group 2 the
    # digits. The marker is escaped so characters such as '.', '(' or '|'
    # in user input are matched literally instead of being read as regex.
    marker_pattern = rf"^({re.escape(marker)})\s*(\d+)"

    with st.spinner("Processing documents... This might take a moment."):
        # Read file bytes
        q_bytes = q_file.getvalue()
        r_bytes = r_file.getvalue()
        s_bytes = s_file.getvalue()

        # Find sections in all three documents
        q_sections = find_sections(q_bytes, marker_pattern)
        r_sections = find_sections(r_bytes, marker_pattern)
        s_sections = find_sections(s_bytes, marker_pattern)

        # Unique question keys sorted numerically (Q1, Q2, Q10), not lexically
        all_keys = sorted(
            set(q_sections) | set(r_sections) | set(s_sections),
            key=lambda x: int(x[1:]),
        )

        # Extract the PDF section for each type if it exists
        st.session_state[_RESULTS_STATE_KEY] = [
            {
                'key': key,
                'question': _extract_if_present(q_bytes, q_sections, key),
                'rubric': _extract_if_present(r_bytes, r_sections, key),
                'solution': _extract_if_present(s_bytes, s_sections, key),
            }
            for key in all_keys
        ]

# Display results (also on reruns triggered by the download buttons)
results = st.session_state.get(_RESULTS_STATE_KEY)
if results is not None:
    if not results:
        st.error("Could not find any sections with the provided marker. Please check your PDFs or refine the marker text.")
    else:
        st.success(f"Found {len(results)} unique sections! Displaying results below.")

        for item in results:
            st.markdown("---")
            st.subheader(f"Section: {item['key']}")

            columns = st.columns(3)
            for column, (title, part) in zip(columns, _SECTION_PARTS):
                _render_part(column, title, part, item)