| |
"""Gradio web UI for pdfclean — drag-and-drop PDF watermark/footer removal."""
| import os |
| import tempfile |
| import zipfile |
| import gradio as gr |
| from pdfclean import clean_pdf_headless, extract_title |
|
|
| |
| |
|
|
# Colour palette for the UI, as hex strings.

# Dark "charcoal" background/surface tones.
CHARCOAL = "#141414"
CHARCOAL_LIGHT = "#141414"  # currently the same value as CHARCOAL
CHARCOAL_MID = "#1E1E1E"
CHARCOAL_SOFT = "#2A2A2A"

# Light "bone" foreground tones, from brightest to dimmest.
BONE = "#E4E4E4"
BONE_DIM = "#8A8A8A"
BONE_FAINT = "#5A5A5A"

# Accent colour and its hover variant.
ACCENT = "#E4E4E4"
ACCENT_HOVER = "#FFFFFF"
|
|
# Custom Gradio theme. Every token is set to the same value for the light
# and the dark (`*_dark`) variant, so the UI looks identical in both modes.
theme = gr.themes.Base(
    # A single accent colour is used for every step of the primary hue.
    primary_hue=gr.themes.Color(
        c50=ACCENT, c100=ACCENT, c200=ACCENT, c300=ACCENT,
        c400=ACCENT, c500=ACCENT, c600=ACCENT, c700=ACCENT,
        c800=ACCENT, c900=ACCENT, c950=ACCENT,
    ),
    # Neutral hue runs from the light "bone" tones down to the charcoals.
    neutral_hue=gr.themes.Color(
        c50=BONE, c100="#e8e4dc", c200="#d0ccc4", c300=BONE_DIM,
        c400=BONE_FAINT, c500="#5c5954", c600=CHARCOAL_SOFT,
        c700=CHARCOAL_MID, c800=CHARCOAL_LIGHT, c900=CHARCOAL,
        c950="#111111",
    ),
    font=[gr.themes.GoogleFont("Instrument Sans"), "Helvetica", "Arial", "sans-serif"],
    font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "monospace"],
).set(
    # Body
    body_background_fill=CHARCOAL,
    body_background_fill_dark=CHARCOAL,
    body_text_color=BONE,
    body_text_color_dark=BONE,
    body_text_color_subdued=BONE_DIM,
    body_text_color_subdued_dark=BONE_DIM,

    # Blocks
    block_background_fill=CHARCOAL_LIGHT,
    block_background_fill_dark=CHARCOAL_LIGHT,
    block_border_color=CHARCOAL_SOFT,
    block_border_color_dark=CHARCOAL_SOFT,
    block_label_text_color=BONE_DIM,
    block_label_text_color_dark=BONE_DIM,
    block_label_background_fill=CHARCOAL_MID,
    block_label_background_fill_dark=CHARCOAL_MID,
    block_title_text_color=BONE,
    block_title_text_color_dark=BONE,

    # Inputs
    input_background_fill=CHARCOAL_MID,
    input_background_fill_dark=CHARCOAL_MID,
    input_border_color=CHARCOAL_SOFT,
    input_border_color_dark=CHARCOAL_SOFT,
    input_border_color_focus=ACCENT,
    input_border_color_focus_dark=ACCENT,
    input_placeholder_color=BONE_FAINT,
    input_placeholder_color_dark=BONE_FAINT,

    # Buttons
    button_primary_background_fill=ACCENT,
    button_primary_background_fill_dark=ACCENT,
    button_primary_background_fill_hover=ACCENT_HOVER,
    button_primary_background_fill_hover_dark=ACCENT_HOVER,
    button_primary_text_color=CHARCOAL,
    button_primary_text_color_dark=CHARCOAL,
    button_secondary_background_fill=CHARCOAL_MID,
    button_secondary_background_fill_dark=CHARCOAL_MID,
    button_secondary_text_color=BONE,
    button_secondary_text_color_dark=BONE,
    button_secondary_border_color=CHARCOAL_SOFT,
    button_secondary_border_color_dark=CHARCOAL_SOFT,

    # Checkboxes
    checkbox_background_color=CHARCOAL_MID,
    checkbox_background_color_dark=CHARCOAL_MID,
    checkbox_background_color_selected=ACCENT,
    checkbox_background_color_selected_dark=ACCENT,
    checkbox_border_color=CHARCOAL_SOFT,
    checkbox_border_color_dark=CHARCOAL_SOFT,
    checkbox_border_color_selected=ACCENT,
    checkbox_border_color_selected_dark=ACCENT,
    checkbox_label_text_color=BONE,
    checkbox_label_text_color_dark=BONE,

    # Panels
    panel_background_fill=CHARCOAL_LIGHT,
    panel_background_fill_dark=CHARCOAL_LIGHT,
    panel_border_color=CHARCOAL_SOFT,
    panel_border_color_dark=CHARCOAL_SOFT,

    # Borders
    border_color_primary=CHARCOAL_SOFT,
    border_color_primary_dark=CHARCOAL_SOFT,
    border_color_accent=ACCENT,
    border_color_accent_dark=ACCENT,

    # Loader
    loader_color=ACCENT,
    loader_color_dark=ACCENT,

    # Shadows: disabled for the primary button in every state.
    button_primary_shadow="none",
    button_primary_shadow_dark="none",
    button_primary_shadow_hover="none",
    button_primary_shadow_hover_dark="none",
    button_primary_shadow_active="none",
    button_primary_shadow_active_dark="none",
)
|
|
# Extra CSS passed to gr.Blocks: caps the page width, recolours rendered
# Markdown (`.prose`) and hides the page footer. The hex values are written
# out literally and repeat the palette colours.
CUSTOM_CSS = """
.gradio-container { max-width: 960px !important; margin: 0 auto !important; }
.prose h1 { color: #E4E4E4 !important; font-weight: 600 !important; letter-spacing: -0.02em; }
.prose h3 { color: #8A8A8A !important; font-weight: 500 !important; }
.prose p, .prose li { color: #8A8A8A !important; }
.prose strong { color: #E4E4E4 !important; }
.prose code { color: #E4E4E4 !important; background: #1E1E1E !important; }
footer { display: none !important; }
"""
|
|
|
|
| |
|
|
def make_zip(file_paths):
    """Bundle multiple files into a single ZIP for easy download.

    Args:
        file_paths: Iterable of paths to existing files. Each file is stored
            at the archive root under its base name.

    Returns:
        Path of the newly created ``.zip`` file. The file is not deleted
        automatically; the caller owns it.
    """
    # Reserve a unique path, then close the handle right away: the original
    # handle is not needed for writing, and keeping it open leaks a file
    # descriptor and prevents reopening the file by name on Windows.
    with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as tmp:
        zip_path = tmp.name

    used_names = set()
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for path in file_paths:
            arcname = os.path.basename(path)
            # Files from different directories can share a base name; give
            # later ones a numeric suffix so no archive member is duplicated.
            if arcname in used_names:
                stem, ext = os.path.splitext(arcname)
                counter = 2
                while f"{stem}-{counter}{ext}" in used_names:
                    counter += 1
                arcname = f"{stem}-{counter}{ext}"
            used_names.add(arcname)
            zf.write(path, arcname)
    return zip_path
|
|
|
|
| |
|
|
def process_pdfs(pdf_files, export_markdown):
    """Clean multiple PDFs and return the results for the UI.

    Args:
        pdf_files: Uploaded files. Each item is either a path string or an
            object that exposes the path as ``.name``.
        export_markdown: Passed to ``clean_pdf_headless`` as ``export_md``.

    Returns:
        A ``(output_files, status_text)`` tuple. ``output_files`` lists the
        files that were actually produced, plus a ZIP of all of them when
        there is more than one. ``status_text`` holds one paragraph per
        input PDF.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if not pdf_files:
        raise gr.Error("Please upload at least one PDF file.")

    output_files = []
    all_status = []

    for pdf_file in pdf_files:
        input_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        basename = os.path.basename(input_path)

        # Name the output after the extracted title when there is one,
        # otherwise fall back to "<input stem>-clean.pdf".
        title_slug = extract_title(input_path)
        if title_slug:
            out_name = f"{title_slug}.pdf"
        else:
            out_name = os.path.splitext(basename)[0] + "-clean.pdf"
        # A fresh directory per input keeps equal output names from
        # overwriting each other.
        output_path = os.path.join(tempfile.mkdtemp(), out_name)

        success, messages, md_path = clean_pdf_headless(
            input_path, output_path, export_md=export_markdown
        )

        # Only offer files that really exist: handing a missing path to the
        # download component would fail the whole request.
        for produced in (output_path, md_path):
            if produced and os.path.exists(produced):
                output_files.append(produced)

        # Escapes keep the symbols intact regardless of file encoding:
        # U+2713 check mark, U+26A0 warning sign, U+2192 rightwards arrow.
        icon = "\u2713" if success else "\u26a0"
        all_status.append(
            f"{icon} {basename} \u2192 {out_name}\n " + "\n ".join(messages or [])
        )

    # With several results, add one archive containing all of them.
    if len(output_files) > 1:
        output_files.append(make_zip(output_files))

    status_text = "\n\n".join(all_status)
    return output_files, status_text
|
|
|
|
| |
|
|
# Page layout: intro text, an upload/options column next to a results
# column, and the click handler that wires them to process_pdfs.
with gr.Blocks(title="pdfclean", theme=theme, css=CUSTOM_CSS) as app:

    gr.Markdown(
        "# pdfclean\n"
        "### Remove watermarks and footers from academic PDFs\n\n"
        "Upload one or more PDFs \u2014 watermarks like *\"Do Not Copy or Post\"* and "
        "repeating footer lines (authorization notices, phone numbers, copyright) "
        "are detected automatically and removed. "
        "You can also export to clean **Markdown**.\n\n"
        "**How it works:** Watermarks are detected by their rotation and size. "
        "Footers are detected by repetition — text at the same position on most pages. "
        "Body text, tables, footnotes, and page numbers are preserved."
    )

    with gr.Row():
        # Left column: inputs.
        with gr.Column(scale=1):
            pdf_input = gr.File(
                label="Upload PDFs",
                file_types=[".pdf"],
                file_count="multiple",
            )
            md_toggle = gr.Checkbox(label="Export markdown", value=True)
            btn = gr.Button("Clean PDFs", variant="primary", size="lg")

        # Right column: results.
        with gr.Column(scale=1):
            output_files = gr.File(label="Download results", file_count="multiple")
            status = gr.Textbox(label="Status", interactive=False, lines=6)

    btn.click(
        fn=process_pdfs,
        inputs=[pdf_input, md_toggle],
        outputs=[output_files, status],
    )

    # Offer a sample only when example.pdf exists in the working directory.
    if os.path.exists("example.pdf"):
        gr.Examples(
            examples=[["example.pdf"]],
            inputs=[pdf_input],
            label="Try an example",
        )
|
|
# NOTE(review): the fallback credentials below are committed to source and
# should be treated as public. Set PDFCLEAN_AUTH_USER and
# PDFCLEAN_AUTH_PASSWORD in the environment to override them.
_auth_user = os.environ.get("PDFCLEAN_AUTH_USER", "user")
_auth_password = os.environ.get("PDFCLEAN_AUTH_PASSWORD", "welovecollective")

# Start the server behind a login page; the login hint shows the username.
app.launch(
    auth=(_auth_user, _auth_password),
    auth_message=f"Username: {_auth_user}",
    ssr_mode=False,
)
|
|