pdfclean / app.py
hjbarraza's picture
Hardcode auth password
4b7d2b7 verified
#!/usr/bin/env python3
"""Gradio web UI for pdfclean β€” drag-and-drop PDF watermark/footer removal."""
import os
import tempfile
import zipfile
import gradio as gr
from pdfclean import clean_pdf_headless, extract_title
# ─── Theme ───────────────────────────────────────────────────────────────────
# Dark charcoal + bone-white. Calm, soothing, confident.
# Surfaces, darkest first. Cards and panels intentionally reuse the page
# background so the layout reads as a single tone.
CHARCOAL = "#141414"  # page background
CHARCOAL_LIGHT = CHARCOAL  # cards and panels (same as the background)
CHARCOAL_MID = "#1E1E1E"  # inputs and other raised surfaces
CHARCOAL_SOFT = "#2A2A2A"  # borders and dividers

# Text, brightest first.
BONE = "#E4E4E4"  # primary text
BONE_DIM = "#8A8A8A"  # secondary text and labels
BONE_FAINT = "#5A5A5A"  # placeholders and muted text

# Accent used for buttons and focus rings; shares the primary text colour.
ACCENT = BONE
ACCENT_HOVER = "#FFFFFF"
# Shade slots accepted by gr.themes.Color; the primary hue uses one flat
# accent colour for every slot.
_HUE_STEPS = (
    "c50", "c100", "c200", "c300", "c400", "c500",
    "c600", "c700", "c800", "c900", "c950",
)

# Theme variables. The UI looks the same in light and dark mode, so each
# entry below is also applied to its "<name>_dark" counterpart when the
# theme is built.
_THEME_VARS = {
    # Body
    "body_background_fill": CHARCOAL,
    "body_text_color": BONE,
    "body_text_color_subdued": BONE_DIM,
    # Blocks (panels, cards)
    "block_background_fill": CHARCOAL_LIGHT,
    "block_border_color": CHARCOAL_SOFT,
    "block_label_text_color": BONE_DIM,
    "block_label_background_fill": CHARCOAL_MID,
    "block_title_text_color": BONE,
    # Inputs
    "input_background_fill": CHARCOAL_MID,
    "input_border_color": CHARCOAL_SOFT,
    "input_border_color_focus": ACCENT,
    "input_placeholder_color": BONE_FAINT,
    # Buttons
    "button_primary_background_fill": ACCENT,
    "button_primary_background_fill_hover": ACCENT_HOVER,
    "button_primary_text_color": CHARCOAL,
    "button_secondary_background_fill": CHARCOAL_MID,
    "button_secondary_text_color": BONE,
    "button_secondary_border_color": CHARCOAL_SOFT,
    # Checkbox
    "checkbox_background_color": CHARCOAL_MID,
    "checkbox_background_color_selected": ACCENT,
    "checkbox_border_color": CHARCOAL_SOFT,
    "checkbox_border_color_selected": ACCENT,
    "checkbox_label_text_color": BONE,
    # Panels
    "panel_background_fill": CHARCOAL_LIGHT,
    "panel_border_color": CHARCOAL_SOFT,
    # Borders
    "border_color_primary": CHARCOAL_SOFT,
    "border_color_accent": ACCENT,
    # Loader
    "loader_color": ACCENT,
    # Shadows (flat buttons)
    "button_primary_shadow": "none",
    "button_primary_shadow_hover": "none",
    "button_primary_shadow_active": "none",
}

theme = gr.themes.Base(
    primary_hue=gr.themes.Color(**dict.fromkeys(_HUE_STEPS, ACCENT)),
    neutral_hue=gr.themes.Color(
        c50=BONE,
        c100="#e8e4dc",
        c200="#d0ccc4",
        c300=BONE_DIM,
        c400=BONE_FAINT,
        c500="#5c5954",
        c600=CHARCOAL_SOFT,
        c700=CHARCOAL_MID,
        c800=CHARCOAL_LIGHT,
        c900=CHARCOAL,
        c950="#111111",
    ),
    font=[gr.themes.GoogleFont("Instrument Sans"), "Helvetica", "Arial", "sans-serif"],
    font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "monospace"],
).set(
    **_THEME_VARS,
    **{f"{name}_dark": value for name, value in _THEME_VARS.items()},
)
CUSTOM_CSS = """
.gradio-container { max-width: 960px !important; margin: 0 auto !important; }
.prose h1 { color: #E4E4E4 !important; font-weight: 600 !important; letter-spacing: -0.02em; }
.prose h3 { color: #8A8A8A !important; font-weight: 500 !important; }
.prose p, .prose li { color: #8A8A8A !important; }
.prose strong { color: #E4E4E4 !important; }
.prose code { color: #E4E4E4 !important; background: #1E1E1E !important; }
footer { display: none !important; }
"""
# ─── Helpers ─────────────────────────────────────────────────────────────────
def make_zip(file_paths):
    """Bundle files into a single ZIP archive and return the archive's path.

    Every file is stored flat under its basename. When several inputs share
    a basename, the later ones get a numeric suffix ("name-2.ext",
    "name-3.ext", ...) so no archive entry shadows another.

    Args:
        file_paths: Iterable of paths to existing files.

    Returns:
        Path of the new ``.zip`` file in the system temp directory. The file
        is not deleted automatically.
    """
    # mkstemp hands back an open descriptor; close it right away so nothing
    # leaks. ZipFile reopens the path itself.
    fd, zip_path = tempfile.mkstemp(suffix=".zip")
    os.close(fd)

    used_names = set()
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for path in file_paths:
            arcname = os.path.basename(path)
            stem, ext = os.path.splitext(arcname)
            counter = 1
            while arcname in used_names:
                counter += 1
                arcname = f"{stem}-{counter}{ext}"
            used_names.add(arcname)
            zf.write(path, arcname)
    return zip_path
# ─── Processing ──────────────────────────────────────────────────────────────
def process_pdfs(pdf_files, export_markdown):
    """Clean every uploaded PDF and collect the downloadable results.

    Args:
        pdf_files: Uploaded files; each item is either a path string or an
            object exposing the path as ``.name``.
        export_markdown: Forwarded to ``clean_pdf_headless`` as ``export_md``.

    Returns:
        A ``(output_files, status_text)`` tuple. ``output_files`` lists the
        files that were written (cleaned PDFs and any Markdown exports), plus
        a ZIP of all of them when there is more than one. ``status_text`` has
        one paragraph per input file.

    Raises:
        gr.Error: If no files were uploaded.
    """
    if not pdf_files:
        raise gr.Error("Please upload at least one PDF file.")

    output_files = []
    all_status = []
    for pdf_file in pdf_files:
        input_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        basename = os.path.basename(input_path)

        # Name the output after the document title when one is found,
        # otherwise after the uploaded file.
        title_slug = extract_title(input_path)
        if title_slug:
            out_name = f"{title_slug}.pdf"
        else:
            out_name = os.path.splitext(basename)[0] + "-clean.pdf"
        # A fresh directory per input keeps equal output names from colliding.
        output_path = os.path.join(tempfile.mkdtemp(), out_name)

        success, messages, md_path = clean_pdf_headless(
            input_path, output_path, export_md=export_markdown
        )

        # Offer only files that were actually written: a failed clean may
        # leave no output, and a missing path would break both the download
        # list and the ZIP step.
        if os.path.exists(output_path):
            output_files.append(output_path)
        if md_path and os.path.exists(md_path):
            output_files.append(md_path)

        icon = "✓" if success else "⚠"
        all_status.append(f"{icon} {basename} → {out_name}\n " + "\n ".join(messages))

    # Add a ZIP of everything when there is more than one file to download.
    if len(output_files) > 1:
        output_files.append(make_zip(output_files))

    status_text = "\n\n".join(all_status)
    return output_files, status_text
# ─── UI ──────────────────────────────────────────────────────────────────────
with gr.Blocks(title="pdfclean", theme=theme, css=CUSTOM_CSS) as app:
    # Intro text shown above the controls.
    gr.Markdown(
        "# pdfclean\n"
        "### Remove watermarks and footers from academic PDFs\n\n"
        "Upload one or more PDFs — watermarks like *\"Do Not Copy or Post\"* and "
        "repeating footer lines (authorization notices, phone numbers, copyright) "
        "are detected automatically and removed. "
        "You can also export to clean **Markdown**.\n\n"
        "**How it works:** Watermarks are detected by their rotation and size. "
        "Footers are detected by repetition — text at the same position on most pages. "
        "Body text, tables, footnotes, and page numbers are preserved."
    )

    # Two columns: upload and options on the left, results on the right.
    with gr.Row():
        with gr.Column(scale=1):
            pdf_input = gr.File(
                label="Upload PDFs",
                file_types=[".pdf"],
                file_count="multiple",
            )
            md_toggle = gr.Checkbox(label="Export markdown", value=True)
            btn = gr.Button("Clean PDFs", variant="primary", size="lg")
        with gr.Column(scale=1):
            output_files = gr.File(label="Download results", file_count="multiple")
            status = gr.Textbox(label="Status", interactive=False, lines=6)

    # Run the cleaner on click and fill both result widgets.
    btn.click(
        fn=process_pdfs,
        inputs=[pdf_input, md_toggle],
        outputs=[output_files, status],
    )

    # Example file (only offered when it exists in the working directory)
    if os.path.exists("example.pdf"):
        gr.Examples(
            examples=[["example.pdf"]],
            inputs=[pdf_input],
            label="Try an example",
        )
# Login credentials are read from the environment so they can be supplied as
# deployment secrets instead of living in the repository.
# SECURITY(review): the fallback values below are the credentials that were
# previously hard-coded here; they are kept only so an unconfigured deployment
# keeps working. Set PDFCLEAN_AUTH_USER / PDFCLEAN_AUTH_PASSWORD, rotate the
# password, and then remove the fallbacks.
_auth_user = os.environ.get("PDFCLEAN_AUTH_USER", "user")
_auth_password = os.environ.get("PDFCLEAN_AUTH_PASSWORD", "welovecollective")

app.launch(
    auth=(_auth_user, _auth_password),
    # Keep the login hint in step with the configured username.
    auth_message=f"Username: {_auth_user}",
    ssr_mode=False,
)