#!/usr/bin/env python3
"""Gradio web UI for pdfclean — drag-and-drop PDF watermark/footer removal."""

import os
import tempfile
import zipfile

import gradio as gr

from pdfclean import clean_pdf_headless, extract_title

# ─── Theme ───────────────────────────────────────────────────────────────────
# Dark charcoal + bone-white. Calm, soothing, confident.

CHARCOAL = "#141414"  # background — uniform dark
CHARCOAL_LIGHT = "#141414"  # cards, panels — same as bg (no two-tone)
CHARCOAL_MID = "#1E1E1E"  # inputs, elevated surfaces
CHARCOAL_SOFT = "#2A2A2A"  # borders, dividers
BONE = "#E4E4E4"  # primary text
BONE_DIM = "#8A8A8A"  # secondary text, labels
BONE_FAINT = "#5A5A5A"  # placeholders, muted
ACCENT = "#E4E4E4"  # white accent (buttons)
ACCENT_HOVER = "#FFFFFF"

theme = gr.themes.Base(
    # Every shade of the primary hue is the same accent colour on purpose:
    # the UI uses a single flat accent rather than a gradient of tints.
    primary_hue=gr.themes.Color(
        c50=ACCENT,
        c100=ACCENT,
        c200=ACCENT,
        c300=ACCENT,
        c400=ACCENT,
        c500=ACCENT,
        c600=ACCENT,
        c700=ACCENT,
        c800=ACCENT,
        c900=ACCENT,
        c950=ACCENT,
    ),
    neutral_hue=gr.themes.Color(
        c50=BONE,
        c100="#e8e4dc",
        c200="#d0ccc4",
        c300=BONE_DIM,
        c400=BONE_FAINT,
        c500="#5c5954",
        c600=CHARCOAL_SOFT,
        c700=CHARCOAL_MID,
        c800=CHARCOAL_LIGHT,
        c900=CHARCOAL,
        c950="#111111",
    ),
    font=[gr.themes.GoogleFont("Instrument Sans"), "Helvetica", "Arial", "sans-serif"],
    font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "monospace"],
).set(
    # Light and dark variants are set to the same value throughout so the UI
    # looks identical regardless of the visitor's colour-scheme preference.
    # Body
    body_background_fill=CHARCOAL,
    body_background_fill_dark=CHARCOAL,
    body_text_color=BONE,
    body_text_color_dark=BONE,
    body_text_color_subdued=BONE_DIM,
    body_text_color_subdued_dark=BONE_DIM,
    # Blocks (panels, cards)
    block_background_fill=CHARCOAL_LIGHT,
    block_background_fill_dark=CHARCOAL_LIGHT,
    block_border_color=CHARCOAL_SOFT,
    block_border_color_dark=CHARCOAL_SOFT,
    block_label_text_color=BONE_DIM,
    block_label_text_color_dark=BONE_DIM,
    block_label_background_fill=CHARCOAL_MID,
    block_label_background_fill_dark=CHARCOAL_MID,
    block_title_text_color=BONE,
    block_title_text_color_dark=BONE,
    # Inputs
    input_background_fill=CHARCOAL_MID,
    input_background_fill_dark=CHARCOAL_MID,
    input_border_color=CHARCOAL_SOFT,
    input_border_color_dark=CHARCOAL_SOFT,
    input_border_color_focus=ACCENT,
    input_border_color_focus_dark=ACCENT,
    input_placeholder_color=BONE_FAINT,
    input_placeholder_color_dark=BONE_FAINT,
    # Buttons
    button_primary_background_fill=ACCENT,
    button_primary_background_fill_dark=ACCENT,
    button_primary_background_fill_hover=ACCENT_HOVER,
    button_primary_background_fill_hover_dark=ACCENT_HOVER,
    button_primary_text_color=CHARCOAL,
    button_primary_text_color_dark=CHARCOAL,
    button_secondary_background_fill=CHARCOAL_MID,
    button_secondary_background_fill_dark=CHARCOAL_MID,
    button_secondary_text_color=BONE,
    button_secondary_text_color_dark=BONE,
    button_secondary_border_color=CHARCOAL_SOFT,
    button_secondary_border_color_dark=CHARCOAL_SOFT,
    # Checkbox
    checkbox_background_color=CHARCOAL_MID,
    checkbox_background_color_dark=CHARCOAL_MID,
    checkbox_background_color_selected=ACCENT,
    checkbox_background_color_selected_dark=ACCENT,
    checkbox_border_color=CHARCOAL_SOFT,
    checkbox_border_color_dark=CHARCOAL_SOFT,
    checkbox_border_color_selected=ACCENT,
    checkbox_border_color_selected_dark=ACCENT,
    checkbox_label_text_color=BONE,
    checkbox_label_text_color_dark=BONE,
    # Panels
    panel_background_fill=CHARCOAL_LIGHT,
    panel_background_fill_dark=CHARCOAL_LIGHT,
    panel_border_color=CHARCOAL_SOFT,
    panel_border_color_dark=CHARCOAL_SOFT,
    # Borders
    border_color_primary=CHARCOAL_SOFT,
    border_color_primary_dark=CHARCOAL_SOFT,
    border_color_accent=ACCENT,
    border_color_accent_dark=ACCENT,
    # Loader
    loader_color=ACCENT,
    loader_color_dark=ACCENT,
    # Shadow
    button_primary_shadow="none",
    button_primary_shadow_dark="none",
    button_primary_shadow_hover="none",
    button_primary_shadow_hover_dark="none",
    button_primary_shadow_active="none",
    button_primary_shadow_active_dark="none",
)

# Extra CSS layered on top of the theme: constrains the page width, recolours
# rendered Markdown (.prose) to the palette above, and hides the page footer.
CUSTOM_CSS = """
.gradio-container {
    max-width: 960px !important;
    margin: 0 auto !important;
}
.prose h1 {
    color: #E4E4E4 !important;
    font-weight: 600 !important;
    letter-spacing: -0.02em;
}
.prose h3 {
    color: #8A8A8A !important;
    font-weight: 500 !important;
}
.prose p, .prose li {
    color: #8A8A8A !important;
}
.prose strong {
    color: #E4E4E4 !important;
}
.prose code {
    color: #E4E4E4 !important;
    background: #1E1E1E !important;
}
footer {
    display: none !important;
}
"""


# ─── Helpers ─────────────────────────────────────────────────────────────────


def _unique_arcname(name, used):
    """Return *name*, or a ``stem-N.ext`` variant, that is not yet in *used*.

    The chosen name is added to *used* before returning.
    """
    stem, ext = os.path.splitext(name)
    candidate = name
    counter = 2
    while candidate in used:
        candidate = f"{stem}-{counter}{ext}"
        counter += 1
    used.add(candidate)
    return candidate


def make_zip(file_paths):
    """Bundle multiple files into a single ZIP for easy download.

    Args:
        file_paths: Iterable of paths to existing files.

    Returns:
        Path of a newly created temporary ``.zip`` file. The file is not
        deleted automatically.
    """
    # mkstemp hands back an open descriptor; close it straight away so the
    # archive is written through exactly one handle and nothing is leaked.
    fd, zip_path = tempfile.mkstemp(suffix=".zip")
    os.close(fd)

    # Members are stored by basename only, so inputs coming from different
    # directories can collide; disambiguate instead of writing duplicates.
    used_names = set()
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for path in file_paths:
            zf.write(path, _unique_arcname(os.path.basename(path), used_names))
    return zip_path


# ─── Processing ──────────────────────────────────────────────────────────────


def process_pdfs(pdf_files, export_markdown):
    """Clean multiple PDFs and return results.

    Args:
        pdf_files: Uploaded files, each either a path string or an object
            exposing the path as ``.name``.
        export_markdown: Forwarded to ``clean_pdf_headless`` as ``export_md``.

    Returns:
        A ``(output_files, status_text)`` tuple: the list of produced file
        paths (plus one ZIP bundling them when there is more than one) and a
        human-readable status report with one paragraph per input.

    Raises:
        gr.Error: If no files were uploaded.
    """
    if not pdf_files:
        raise gr.Error("Please upload at least one PDF file.")

    output_files = []
    all_status = []

    for pdf_file in pdf_files:
        input_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        basename = os.path.basename(input_path)

        # Generate output path. Each input gets its own temp directory so two
        # inputs resolving to the same output name cannot overwrite each other.
        title_slug = extract_title(input_path)
        tmp_dir = tempfile.mkdtemp()
        if title_slug:
            out_name = f"{title_slug}.pdf"
        else:
            out_name = os.path.splitext(basename)[0] + "-clean.pdf"
        output_path = os.path.join(tmp_dir, out_name)

        # Clean
        success, messages, md_path = clean_pdf_headless(
            input_path, output_path, export_md=export_markdown
        )

        # Only hand back files that are actually on disk, so a failed clean
        # does not leave a dangling path in the download list or the ZIP.
        if os.path.exists(output_path):
            output_files.append(output_path)
        if md_path and os.path.exists(md_path):
            output_files.append(md_path)

        # Status: header line followed by the per-file messages. Joining them
        # together avoids a dangling separator when there are no messages.
        icon = "✓" if success else "⚠"
        all_status.append("\n ".join([f"{icon} {basename} → {out_name}", *messages]))

    # ZIP if multiple outputs
    if len(output_files) > 1:
        output_files.append(make_zip(output_files))

    status_text = "\n\n".join(all_status)
    return output_files, status_text


# ─── UI ──────────────────────────────────────────────────────────────────────

with gr.Blocks(title="pdfclean", theme=theme, css=CUSTOM_CSS) as app:
    gr.Markdown(
        "# pdfclean\n"
        "### Remove watermarks and footers from academic PDFs\n\n"
        "Upload one or more PDFs — watermarks like *\"Do Not Copy or Post\"* and "
        "repeating footer lines (authorization notices, phone numbers, copyright) "
        "are detected automatically and removed. "
        "You can also export to clean **Markdown**.\n\n"
        "**How it works:** Watermarks are detected by their rotation and size. "
        "Footers are detected by repetition — text at the same position on most pages. "
        "Body text, tables, footnotes, and page numbers are preserved."
    )

    with gr.Row():
        # Left column: inputs and the action button.
        with gr.Column(scale=1):
            pdf_input = gr.File(
                label="Upload PDFs",
                file_types=[".pdf"],
                file_count="multiple",
            )
            md_toggle = gr.Checkbox(label="Export markdown", value=True)
            btn = gr.Button("Clean PDFs", variant="primary", size="lg")

        # Right column: downloadable results and the status report.
        with gr.Column(scale=1):
            output_files = gr.File(label="Download results", file_count="multiple")
            status = gr.Textbox(label="Status", interactive=False, lines=6)

    btn.click(
        fn=process_pdfs,
        inputs=[pdf_input, md_toggle],
        outputs=[output_files, status],
    )

    # Example file (if present)
    if os.path.exists("example.pdf"):
        gr.Examples(
            examples=[["example.pdf"]],
            inputs=[pdf_input],
            label="Try an example",
        )

# NOTE(review): the login used to be hard-coded at the launch call. It can now
# be overridden with PDFCLEAN_AUTH_USER / PDFCLEAN_AUTH_PASSWORD; the previous
# values remain as defaults for backward compatibility. The default password
# is committed to source control and should be rotated and removed.
AUTH_USER = os.environ.get("PDFCLEAN_AUTH_USER", "user")
AUTH_PASSWORD = os.environ.get("PDFCLEAN_AUTH_PASSWORD", "welovecollective")

app.launch(
    auth=(AUTH_USER, AUTH_PASSWORD),
    auth_message=f"Username: {AUTH_USER}",
    ssr_mode=False,
)