File size: 19,562 Bytes
046e3b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbc3470
046e3b8
fbc3470
046e3b8
 
fbc3470
046e3b8
 
 
 
fbc3470
 
 
046e3b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9386150
 
046e3b8
 
 
 
 
 
 
 
 
 
9386150
 
046e3b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9386150
 
046e3b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
import streamlit as st
import pandas as pd
import fitz  # PyMuPDF
import os
import subprocess
import tempfile
import sys
import toml
import shutil
import zipfile
import io

# Ensure we can import from utils if needed
sys.path.append(os.path.dirname(__file__))
from utils import toc_processor
from pdfxmeta import pdfxmeta

# --- Page chrome: set_page_config must be the first Streamlit call on the page ---
st.set_page_config(page_title="PDF Bookmark Splitter", layout="wide")

st.title("PDF Bookmarker & Splitter")

# Intro blurb describing the overall workflow of the app.
st.markdown("""

**Upload a PDF**, analyze its fonts to find top-level headings, and generate Bookmarks for splitting by chapter.

""")

# Single-file uploader restricted to .pdf; returns None until a file is chosen.
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

# Short guidance shown under the uploader
st.caption("Recommended use: After uploading your PDF, search for the text of a known chapter heading. Once the correct entry is identified in the search results, select the corresponding entry from the drop down, and optionally repeat the step to ensure back matter is split off from the last chapter before running the pipeline.")

if uploaded_file is not None:
    # We need to save the uploaded file to disk for the CLI tools to read it
    # We'll use a permanent temp file for the session so we don't have to re-upload constantly
    # But for cleanliness, we might want to put this in a temp dir too?
    # For now, keeping the input file logic as is (tempfile), but we'll put OUTPUTS in a pure temp dir
    # NOTE(review): delete=False means this temp file is never unlinked — one file
    # leaks per upload/rerun; consider tracking the path in session state and
    # removing it when a new file is loaded.
    
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
        tmp_pdf.write(uploaded_file.getvalue())
        input_pdf_path = tmp_pdf.name

    # --- State Management & Reset ---
    # Check if a new file is uploaded
    # name+size is a heuristic identity only: two different files sharing name and
    # byte count would be treated as the same upload.
    file_id = f"{uploaded_file.name}_{uploaded_file.size}" # Robust proxy for ID
    if 'current_file_id' not in st.session_state:
        st.session_state['current_file_id'] = None
        
    if st.session_state['current_file_id'] != file_id:
        # NEW FILE DETECTED: Reset Pipeline State
        # Drop all derived artifacts so stale results from the previous file
        # are not shown or offered for download.
        keys_to_reset = ['final_pdf_bytes', 'final_zip_bytes', 'final_zip_name', 'search_matches', 'font_name', 'font_size']
        for k in keys_to_reset:
            if k in st.session_state:
                del st.session_state[k]
        st.session_state['current_file_id'] = file_id
        # st.toast(f"New file loaded: {uploaded_file.name}. State cleared.")

    st.success(f"Loaded: {uploaded_file.name}")

    # --- Data Source Selection ---
    st.header("1. Source Selection")
    source_mode = st.radio("Where should the bookmarks come from?", 
             ["Scan & Generate (Create New)", "Use Existing Bookmarks (Modify)"],
             help="Choose 'Scan & Generate' to build new bookmarks from fonts. Choose 'Use Existing' to tidy up bookmarks already in the file.")

    # --- Analysis Section (Only for Generate) ---
    if source_mode == "Scan & Generate (Create New)":
        st.header("2. Analyze Fonts")
        
        # Seed defaults for the recipe inputs; the text/number widgets below bind
        # to these session-state keys.
        if 'font_name' not in st.session_state:
            st.session_state['font_name'] = ''
        if 'font_size' not in st.session_state:
            st.session_state['font_size'] = 18.0
            
        tab1, tab2 = st.tabs(["Scan for Large Fonts", "Search by Text"])
        
        with tab1:
            if st.button("Find Header Candidates"):
                with st.spinner("Scanning PDF for large fonts..."):
                    doc = fitz.open(input_pdf_path)
                    candidates = []
                    # Walk only the first 50 pages — enough to discover the
                    # heading fonts without scanning a huge document.
                    for page in doc[:50]:
                        text_page = page.get_text("dict")
                        for block in text_page["blocks"]:
                            for line in block.get("lines", []):
                                for span in line["spans"]:
                                    text = span["text"].strip()
                                    # Skip very short fragments (page numbers, bullets).
                                    if len(text) > 3:
                                        candidates.append({
                                            "Text": text[:50],
                                            "Font": span["font"],
                                            "Size": round(span["size"], 2),
                                            # page.number is 0-based; display 1-based
                                            "Page": page.number + 1
                                        })
                    doc.close()
                    if candidates:
                        # Aggregate spans into (Font, Size) buckets and keep the
                        # 20 combinations with the largest size / highest count.
                        df = pd.DataFrame(candidates)
                        summary = df.groupby(['Font', 'Size']).size().reset_index(name='Count')
                        summary = summary.sort_values(by=['Size', 'Count'], ascending=[False, False]).head(20)
                        st.session_state['scan_results'] = summary
                    else:
                        st.warning("No text found.")
            
            # Results persist in session state so they survive reruns.
            if 'scan_results' in st.session_state:
                st.write("### Top Large Fonts Found")
                st.dataframe(st.session_state['scan_results'], use_container_width=True)
                
                def update_from_scan():
                    """Selectbox callback: copy the chosen "Font (Size pt)" option
                    into the font_name/font_size recipe fields in session state."""
                    selection = st.session_state.scan_selector
                    if not selection:
                        return
                    # Option format is "<font> (<size>pt)".
                    st.session_state['font_name'] = selection.split(" (")[0]
                    st.session_state['font_size'] = float(selection.split("(")[1].replace("pt)", ""))

                options = st.session_state['scan_results'].apply(lambda x: f"{x['Font']} ({x['Size']}pt)", axis=1)
                st.selectbox("Select extraction font:", options, key='scan_selector', on_change=update_from_scan, index=None, placeholder="Choose a font...")

        with tab2:
            search_query = st.text_input("Enter text to find (e.g., 'Chapter 1')", "")
            
            c1, c2 = st.columns([1, 3])
            with c1:
                do_search = st.button("Search Text")
            with c2:
                is_case_sensitive = st.checkbox("Case Sensitive", value=False)

            if do_search:
                with st.spinner(f"Searching for '{search_query}'..."):
                    # Use the robust pdfxmeta library
                    try:
                        doc = fitz.open(input_pdf_path)
                        # pdfxmeta expects a regex pattern, so we escape the query to be safe
                        import re
                        safe_pattern = re.escape(search_query)
                        
                        # extract_meta returns a list of dicts (spans)
                        results = pdfxmeta.extract_meta(doc, safe_pattern, ign_case=(not is_case_sensitive))
                        doc.close()
                        
                        matches = []
                        for res in results:
                            matches.append({
                                "Text": res.get("text", "").strip(),
                                "Font": res.get("font", ""),
                                "Size": round(res.get("size", 0), 2),
                                "Page": res.get("page_index", 0)
                            })
                            # Limit for display safety
                            # NOTE(review): the check runs after the append, so up
                            # to 51 rows are collected before the break fires.
                            if len(matches) > 50: break
                            
                        if matches:
                            st.session_state['search_matches'] = pd.DataFrame(matches)
                        else:
                            st.warning("No matches found.")
                            
                    except Exception as e:
                        st.error(f"Search failed: {e}")
            
            # Results persist in session state so they survive reruns.
            if 'search_matches' in st.session_state:
                st.write(f"### Found Matches")
                st.dataframe(st.session_state['search_matches'], use_container_width=True)
                
                def update_from_search():
                    """Selectbox callback: copy the font name/size out of the chosen
                    search match into the recipe fields in session state."""
                    choice = st.session_state.search_selector
                    if not choice:
                        return
                    # Option format: "<font> (<size>pt) - Pg <page>".
                    name, _, remainder = choice.partition(" (")
                    st.session_state['font_name'] = name
                    st.session_state['font_size'] = float(remainder.split("pt)")[0])

                options = st.session_state['search_matches'].apply(lambda x: f"{x['Font']} ({x['Size']}pt) - Pg {x['Page']}", axis=1)
                st.selectbox("Select font from match:", options, key='search_selector', on_change=update_from_search, index=None, placeholder="Choose a match...")

        # --- Configuration (Only for Generate) ---
        # These widgets share the 'font_name'/'font_size' keys with the selectbox
        # callbacks above, so either flow populates them.
        st.header("3. Configure Recipe")
        col1, col2 = st.columns(2)
        with col1:
            font_name_input = st.text_input("Font Name", key='font_name')
        with col2:
            font_size_input = st.number_input("Font Size", key='font_size')
        
        greedy = st.checkbox("Greedy Match (Merge multiline specs)", value=True)
        
        # --- Back Matter Configuration ---
        with st.expander("Back Matter Configuration (Optional)", expanded=False):
            st.markdown("Identify where the **Back Matter** (Index, Glossary, etc.) starts to split it into a separate `999_Back_matter.pdf`.")
            
            # Independent Search for Back Matter
            bm_query = st.text_input("Find Back Matter start (e.g., 'Index')", key="bm_search_query")
            
            c_bm1, c_bm2 = st.columns([1, 3])
            with c_bm1:
                 do_bm_search = st.button("Search Back Matter")
            with c_bm2:
                 bm_case_sensitive = st.checkbox("Case Sensitive", key="bm_sens", value=False)
                 
            if do_bm_search:
                with st.spinner("Searching..."):
                    try:
                        doc = fitz.open(input_pdf_path)
                        # Escape the query — extract_meta treats it as a regex.
                        import re
                        safe_pattern = re.escape(bm_query)
                        results = pdfxmeta.extract_meta(doc, safe_pattern, ign_case=(not bm_case_sensitive))
                        doc.close()
                        
                        bm_matches = []
                        for res in results:
                            bm_matches.append({
                                "Text": res.get("text", "").strip(), 
                                "Page": res.get("page_index", 0) # Display raw (already 1-based from pdfxmeta)
                            })
                            # NOTE(review): checked after append — up to 51 rows.
                            if len(bm_matches) > 50: break
                        
                        if bm_matches:
                            st.session_state['bm_matches'] = pd.DataFrame(bm_matches)
                        else:
                            st.warning("No matches found.")
                    except Exception as e:
                        st.error(f"Search failed: {e}")

            if 'bm_matches' in st.session_state:
                st.dataframe(st.session_state['bm_matches'], use_container_width=True)
                
                def update_bm_page():
                    """Selectbox callback: record the chosen back-matter start page."""
                    choice = st.session_state.bm_selector
                    if not choice:
                        return
                    # Value format: "Page X - Text..."
                    prefix = choice.split(" -")[0]
                    st.session_state['back_matter_page'] = int(prefix.replace("Page ", ""))
                
                bm_options = st.session_state['bm_matches'].apply(lambda x: f"Page {x['Page']} - {x['Text'][:30]}...", axis=1)
                st.selectbox("Select Start Page:", bm_options, key='bm_selector', on_change=update_bm_page, index=None, placeholder="Select start page...")

            # Manual Override
            # Update session state when this input changes
            # (0 means "no back matter"; see the pipeline step that maps 0 -> None).
            def update_manual_bm():
                st.session_state['back_matter_page'] = st.session_state.back_matter_page_manual
                
            st.number_input("Or manually set Start Page:", min_value=0, value=st.session_state.get('back_matter_page', 0), key='back_matter_page_manual', on_change=update_manual_bm)

    else:
        # Existing Mode
        # No font analysis needed: the file's own bookmarks will be extracted below.
        st.info("Using existing bookmarks. They will be cleaned, numbered, and used for splitting/downloading.")
    # --- Generation ---
    # Full pipeline: build/extract a .toc, clean it, write it into the PDF, then
    # split the PDF into per-chapter files zipped for download. All results are
    # stored in session state so the download area below survives reruns.
    st.header("4. Process & Generate")
    
    if st.button("Run Pipeline"):
        # Validate inputs if generating
        if source_mode == "Scan & Generate (Create New)" and not st.session_state.get('font_name'):
            st.error("Please specify a font name for extraction.")
        else:
            with st.status("Running pipeline tasks...", expanded=True) as status:
                # Use a temporary directory for all intermediate files
                with tempfile.TemporaryDirectory() as temp_dir:
                    status.write(f"Created temp workspace: {temp_dir}")
                    
                    # Paths
                    recipe_path = os.path.join(temp_dir, "recipe.toml")
                    raw_toc_path = os.path.join(temp_dir, "raw.toc") # pdftocgen output
                    clean_toc_path = os.path.join(temp_dir, "clean.toc") # modify_toc output
                    output_pdf_path = os.path.join(temp_dir, "final.pdf")
                    
                    raw_toc_content = ""

                    if source_mode == "Scan & Generate (Create New)":
                        # 1. Create Recipe
                        # pdftocgen recipe: one level-1 heading rule matching the
                        # selected font name/size (with a small size tolerance).
                        recipe_data = {
                            "heading": [{
                                "level": 1,
                                "greedy": greedy,
                                "font": {
                                    "name": st.session_state['font_name'],
                                    "size": st.session_state['font_size'],
                                    "size_tolerance": 0.1
                                }
                            }]
                        }
                        with open(recipe_path, "w") as f:
                            toml.dump(recipe_data, f)
                        status.write("βœ… Recipe created")
                        
                        # 2. Run pdftocgen -> raw.toc
                        status.write("Running pdftocgen (Scanning)...")
                        cmd1 = [sys.executable, "-m", "pdftocgen", "-r", recipe_path, input_pdf_path]
                        process = subprocess.run(cmd1, capture_output=True, text=True, encoding='utf-8')
                        if process.returncode != 0:
                            st.error(f"pdftocgen failed: {process.stderr}")
                            st.stop()
                        raw_toc_content = process.stdout
                        status.write("βœ… Headers extracted")

                    else:
                        # Existing Bookmarks
                        status.write("Extracting existing bookmarks...")
                        # Run pdftocio in extract mode
                        cmd1 = [sys.executable, "-m", "pdftocio", input_pdf_path]
                        process = subprocess.run(cmd1, capture_output=True, text=True, encoding='utf-8')
                        if process.returncode != 0:
                            st.error(f"pdftocio failed: {process.stderr}")
                            st.stop()
                        raw_toc_content = process.stdout
                        if not raw_toc_content.strip():
                            st.warning("No existing bookmarks found!")
                            st.stop()
                        status.write("βœ… Existing bookmarks imported")
                    
                    # 3. Clean Content (Using centralized utility)
                    status.write("Cleaning and merging bookmarks...")
                    cleaned_toc_content = toc_processor.process_toc(raw_toc_content)
                    
                    with open(clean_toc_path, "w", encoding='utf-8') as f:
                        f.write(cleaned_toc_content)
                    status.write("βœ… Bookmarks formatted (Double-splits fixed)")
                    
                    # 4. Write PDF
                    status.write("Writing to PDF...")
                    cmd3 = [sys.executable, "-m", "pdftocio", "-t", clean_toc_path, "-o", output_pdf_path, input_pdf_path]
                    # FIX: pass encoding='utf-8' like the subprocess calls above;
                    # without it, decoding uses the locale default and non-ASCII
                    # tool output can raise UnicodeDecodeError (e.g. Windows cp1252).
                    process = subprocess.run(cmd3, capture_output=True, text=True, encoding='utf-8')
                    if process.returncode != 0:
                        st.error(f"pdftocio failed: {process.stderr}")
                        st.stop()
                    status.write("βœ… PDF saved")
                    
                    # 5. Read Result for Download
                    # Buffer the bytes now — temp_dir is removed when the 'with' exits.
                    with open(output_pdf_path, "rb") as f:
                        st.session_state['final_pdf_bytes'] = f.read()
                    
                    # 6. Split & Zip (The Feature)
                    # Use a temp file for the zip to avoid memory issues
                    with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as tmp_zip:
                        tmp_zip_path = tmp_zip.name
                    
                    try:
                        # Pass back_matter_page if it exists and is valid
                        # (0 is the sentinel for "no back matter" -> None).
                        bm_page = st.session_state.get('back_matter_page', 0)
                        if bm_page == 0: bm_page = None
                        
                        toc_processor.generate_chapter_splits(output_pdf_path, tmp_zip_path, back_matter_start_page=bm_page)
                        
                        with open(tmp_zip_path, "rb") as f:
                            st.session_state['final_zip_bytes'] = f.read()
                            
                        base_name = os.path.splitext(uploaded_file.name)[0]
                        st.session_state['final_zip_name'] = f"{base_name}_chapters.zip"
                        
                    except Exception as e:
                        st.error(f"Error generating zip: {e}")
                    finally:
                        # The zip temp file was created with delete=False; clean it up.
                        if os.path.exists(tmp_zip_path):
                            os.unlink(tmp_zip_path)
    # --- Persistent Download Area ---
    # --- Persistent Download Area ---
    # Rendered from session state, outside the Run button branch, so the download
    # buttons survive Streamlit reruns (clicking a download triggers a rerun).
    if 'final_pdf_bytes' in st.session_state:
        st.success("Pipeline completed successfully!")
        st.write("### Downloads")
        
        c_dl1, c_dl2 = st.columns(2)
        with c_dl1:
            st.download_button(
                label="Download Bookmarked PDF",
                data=st.session_state['final_pdf_bytes'],
                file_name="bookmarked_doc.pdf",
                mime="application/pdf",
                key="dl_pdf_btn"
            )
        
        with c_dl2:
            # The zip is only present if chapter splitting succeeded.
            if 'final_zip_bytes' in st.session_state:
                st.download_button(
                    label=f"Download ZIP ({st.session_state['final_zip_name']})",
                    data=st.session_state['final_zip_bytes'],
                    file_name=st.session_state['final_zip_name'],
                    mime="application/zip",
                    key="dl_zip_btn"
                )

    st.markdown("---")
    # Footer / attribution; raw HTML requires unsafe_allow_html=True.
    st.markdown("""

    <div style="text-align: center; color: #666; font-size: 0.8em;">

        Based on <a href="https://github.com/Krasjet/pdf.tocgen" target="_blank">pdf.tocgen</a> by krasjet. <br>

        Enhanced with UI, Chapter Splitting, and Metadata Search. Licensed under AGPL-3.0.

    </div>

    """, unsafe_allow_html=True)