| |
| """ |
pdfclean.py — Remove watermarks and footers from academic PDFs (any publisher).

Edits PDF content streams directly to remove rotated watermarks, and redacts
bottom-of-page footer text, without affecting body content.
Uses repetition-based detection: text that repeats at the same Y-position on at
least half of the pages (and on at least two pages) is treated as a footer
(works with any publisher, no hardcoded patterns).

Usage: python pdfclean.py document.pdf                 # saves <title-slug>.pdf, or document-clean.pdf if no title is found
       python pdfclean.py document.pdf -o output.pdf   # custom output path
       python pdfclean.py document.pdf --dry-run       # scan only, don't modify
| """ |
| import argparse |
| import re |
| import sys |
| import os |
| import time |
| import shutil |
| from collections import defaultdict |
|
|
| try: |
| import fitz |
| except ImportError: |
| print("Missing dependency: pip install pymupdf") |
| sys.exit(1) |
|
|
|
|
| |
| |
| |
# Ordered names of the pipeline stages shown by the dashboard.
STEPS = ['Decrypt', 'Scan', 'Clean', 'Verify', 'Export']
# ANSI escape sequences used to style terminal output.
DIM = '\033[2m'
BOLD = '\033[1m'
GREEN = '\033[32m'
YELLOW = '\033[33m'
CYAN = '\033[36m'
RESET = '\033[0m'  # turns off any styling applied by the codes above
|
|
class Dashboard:
    """Minimal TUI progress dashboard with step tracking and time estimates.

    Live frames are redrawn on stderr by ``_render``; ``finish`` prints the
    final summary on stdout.
    """

    def __init__(self, filename, total_pages, steps=None):
        """Initialise dashboard state.

        Args:
            filename: Path of the file being processed; only its basename is shown.
            total_pages: Page count shown in the header.
            steps: Ordered step names; falls back to the module-level ``STEPS``.
        """
        self.filename = os.path.basename(filename)
        self.total_pages = total_pages
        self.steps = steps or STEPS
        self.current_step = 0
        self.step_label = ''
        self.start_time = time.time()
        self.step_start = time.time()
        self.step_times = {}     # step index -> seconds spent in that step
        self.sub_progress = 0.0  # completed fraction of the current step
        self.detail = ''
        self.findings = []
        self.warnings = []
        # Never draw wider than 80 columns, even on wide terminals.
        self.cols = min(shutil.get_terminal_size().columns, 80)

    def set_step(self, idx, label=''):
        """Move to a new pipeline step.

        Records the time spent in the step being left, resets the per-step
        progress and detail text, and redraws.

        Args:
            idx: Index into ``self.steps`` of the step being entered.
            label: Optional text shown instead of the step name while active.
        """
        if self.current_step < len(self.steps):
            self.step_times[self.current_step] = time.time() - self.step_start
        self.current_step = idx
        self.step_label = label
        self.step_start = time.time()
        self.sub_progress = 0.0
        self.detail = ''
        self._render()

    def progress(self, current, total, detail=''):
        """Update sub-progress within the current step and redraw.

        A non-positive ``total`` is treated as "complete" (progress 1.0).
        """
        self.sub_progress = current / total if total > 0 else 1.0
        self.detail = detail
        self._render()

    def add_finding(self, text):
        """Append a finding line and redraw."""
        self.findings.append(text)
        self._render()

    def add_warning(self, text):
        """Append a warning (shown only in the final summary) and redraw."""
        self.warnings.append(text)
        self._render()

    def _print_header(self, stream):
        """Print the three-line title banner to *stream*."""
        rule = f"{BOLD}{'─' * self.cols}{RESET}"
        print(rule, file=stream)
        print(f"{BOLD} pdfclean{RESET} {DIM}{self.filename} · {self.total_pages} pages{RESET}", file=stream)
        print(rule, file=stream)

    def finish(self, output_path, md_path=None):
        """Render the final summary on stdout.

        Args:
            output_path: Path reported on the ``pdf`` line.
            md_path: Optional path reported on the ``md`` line.
        """
        self.step_times[self.current_step] = time.time() - self.step_start
        elapsed = time.time() - self.start_time

        # stderr is line-buffered and this escape sequence has no newline, so
        # without an explicit flush it can stay in the buffer and be emitted
        # after the summary below has been printed, erasing it.
        sys.stderr.write('\033[2J\033[H')
        sys.stderr.flush()

        w = self.cols
        self._print_header(sys.stdout)

        for i, step in enumerate(self.steps):
            t = self.step_times.get(i)
            if t is not None and t > 0.001:
                mark = f'{GREEN}✓{RESET}'
                time_str = f'{DIM}{t:.1f}s{RESET}'
            elif t is not None:
                # A recorded but near-zero duration is displayed as skipped.
                mark = f'{DIM}–{RESET}'
                time_str = f'{DIM}skip{RESET}'
            else:
                mark = f'{DIM}–{RESET}'
                time_str = ''
            print(f" {mark} {step:<12} {time_str}")

        print()

        if self.findings:
            for finding in self.findings:
                print(f" {finding}")
            print()

        if self.warnings:
            for warn in self.warnings:
                print(f" {YELLOW}⚠ {warn}{RESET}")
            print()
        elif self.step_times.get(2, 0) > 0.001:
            # Index 2 is 'Clean' in the default STEPS order: only claim
            # success when that step actually ran.
            print(f" {GREEN}✓ all clean, all content intact{RESET}")
            print()

        print(f" {BOLD}pdf{RESET} {output_path}")
        if md_path:
            print(f" {BOLD}md {RESET} {md_path}")

        print(f"\n{DIM} done in {elapsed:.1f}s{RESET}")
        print(f"{BOLD}{'─' * w}{RESET}")

    def _render(self):
        """Redraw the dashboard to stderr."""
        elapsed = time.time() - self.start_time
        step_elapsed = time.time() - self.step_start

        # ETA = extrapolated remainder of the current step plus the average
        # duration of the steps that really ran, times the steps still to come.
        completed_times = [v for v in self.step_times.values() if v > 0.001]
        if completed_times and self.current_step < len(self.steps):
            avg = sum(completed_times) / len(completed_times)
            remaining_steps = len(self.steps) - self.current_step - 1
            # max(..., 0.01) avoids dividing by zero before any progress is reported.
            step_remaining = step_elapsed * (1 - self.sub_progress) / max(self.sub_progress, 0.01)
            eta = step_remaining + (avg * remaining_steps)
        else:
            eta = 0

        sys.stderr.write('\033[2J\033[H')
        w = self.cols

        self._print_header(sys.stderr)

        for i, step in enumerate(self.steps):
            if i < self.current_step:
                t = self.step_times.get(i, 0)
                if t > 0.001:
                    mark = f'{GREEN}✓{RESET}'
                    time_str = f'{DIM}{t:.1f}s{RESET}'
                else:
                    mark = f'{DIM}–{RESET}'
                    time_str = f'{DIM}skip{RESET}'
            elif i == self.current_step:
                mark = f'{CYAN}›{RESET}'
                time_str = f'{DIM}{step_elapsed:.1f}s{RESET}'
            else:
                mark = f'{DIM}·{RESET}'
                time_str = ''
            label = self.step_label if i == self.current_step and self.step_label else step
            print(f" {mark} {label:<12} {time_str}", file=sys.stderr)

        print(file=sys.stderr)

        if self.sub_progress > 0:
            bar_w = w - 20
            filled = int(self.sub_progress * bar_w)
            pct = self.sub_progress * 100
            bar = f"{'█' * filled}{'░' * (bar_w - filled)}"
            print(f" {bar} {pct:5.1f}%", file=sys.stderr)
        else:
            print(f" {DIM}working...{RESET}", file=sys.stderr)

        if self.detail:
            # Truncate so the detail line fits the dashboard width.
            detail_max = w - 4
            d = self.detail[:detail_max]
            print(f" {DIM}{d}{RESET}", file=sys.stderr)

        print(file=sys.stderr)

        for finding in self.findings:
            print(f" {finding}", file=sys.stderr)

        eta_str = f'~{eta:.0f}s left' if eta > 1 else ''
        print(f"\n{DIM} elapsed {elapsed:.1f}s {eta_str}{RESET}", file=sys.stderr)
        print(f"{BOLD}{'─' * w}{RESET}", file=sys.stderr)
        sys.stderr.flush()
|
|
class NullDashboard(Dashboard):
    """Dashboard variant for headless/web usage that never draws anything.

    State (steps, findings, warnings) is still tracked by the inherited
    methods; only the two output routines are replaced with no-ops. The
    constructor is inherited unchanged.
    """

    def _render(self):
        """Do nothing instead of redrawing the live frame."""
        return None

    def finish(self, output_path, md_path=None):
        """Do nothing instead of printing the final summary."""
        return None
|
|
|
|
| |
| |
| |
def print_to_pdf(input_path, dash):
    """Copy every page of a PDF into a new document saved beside the input.

    Used for encrypted/restricted inputs; the copy is intended to be an
    unrestricted document that can be edited afterwards.

    Args:
        input_path: Path of the source PDF.
        dash: Dashboard-like object; its ``progress`` method is called once
            per page.

    Returns:
        Path of the written copy, ``<base>-printed<ext>``.
    """
    base, ext = os.path.splitext(input_path)
    printed_path = f"{base}-printed{ext}"

    src = fitz.open(input_path)
    try:
        dst = fitz.open()
        try:
            total = len(src)
            dst.insert_pdf(src, from_page=0, to_page=total - 1)

            # The pages were copied by the single call above; this loop only
            # advances the dashboard.
            for i in range(total):
                dash.progress(i + 1, total, f'page {i+1}/{total}')

            dst.save(printed_path, garbage=4, deflate=True)
        finally:
            # Close both handles even when copying or saving fails.
            dst.close()
    finally:
        src.close()
    return printed_path
|
|
|
|
| |
| |
| |
def detect_watermarks(doc):
    """Collect the text of large rotated spans (watermarks) across all pages.

    A span qualifies when its bounding box is more than three times taller
    than its font size (the heuristic used here for rotated text), it has
    non-blank text, and either its font size exceeds 30 or its box is taller
    than 30% of the page.

    Returns:
        Dict with ``watermark_texts`` (set of stripped span texts) and
        ``watermark_pages`` (number of pages containing at least one).
    """
    texts = set()
    page_hits = 0

    for page_no in range(len(doc)):
        page = doc[page_no]
        page_height = page.rect.height
        page_flagged = False

        for block in page.get_text('dict')['blocks']:
            if block['type'] != 0:
                # Only text blocks carry lines/spans.
                continue
            for line in block['lines']:
                for span in line['spans']:
                    content = span['text'].strip()
                    box = span['bbox']
                    extent = box[3] - box[1]

                    if not extent > span['size'] * 3:
                        continue
                    if not (span['size'] > 30 or extent > page_height * 0.3):
                        continue
                    if not content:
                        continue
                    texts.add(content)
                    page_flagged = True

        if page_flagged:
            page_hits += 1

    return {'watermark_texts': texts, 'watermark_pages': page_hits}
|
|
|
|
def detect_footers(doc):
    """Detect footer zones by repetition analysis — publisher-agnostic.

    Spans whose top edge lies in the bottom 12% of the page are grouped into
    5-unit-wide Y buckets. A bucket becomes a footer zone when it has text on
    at least half of the pages (and on at least two pages). Documents with
    fewer than three pages are never analysed.

    Returns:
        Dict with:
          ``footer_zones``: bucket Y -> {'samples', 'avg_size', 'page_count'}
          ``footer_pages``: number of distinct pages touched by any zone
          ``footer_texts``: union of the zones' sample texts
          ``bottom_pct``: fraction of the page height that was scanned
    """
    total = len(doc)
    bottom_pct = 0.12
    if total < 3:
        # Too few pages to tell repetition from coincidence. The result has
        # the same keys as the normal return so callers can rely on them.
        return {
            'footer_zones': {},
            'footer_pages': 0,
            'footer_texts': set(),
            'bottom_pct': bottom_pct,
        }

    y_bands = defaultdict(list)

    for i in range(total):
        page = doc[i]
        page_h = page.rect.height
        threshold_y = page_h * (1 - bottom_pct)
        blocks = page.get_text('dict')['blocks']

        for b in blocks:
            if b['type'] != 0:
                continue
            for line in b['lines']:
                for s in line['spans']:
                    text = s['text'].strip()
                    if not text or len(text) < 3:
                        continue
                    bbox = s['bbox']
                    if bbox[1] < threshold_y:
                        continue
                    # Skip short purely numeric strings (e.g. "12", "3-4"),
                    # presumably page numbers, so they do not form a zone.
                    if len(text) < 5 and text.replace('-', '').replace('.', '').isdigit():
                        continue
                    y_bucket = round(bbox[1] / 5) * 5
                    y_bands[y_bucket].append({
                        'page': i, 'text': text,
                        'size': s['size'], 'bbox': bbox,
                    })

    min_pages = max(2, total * 0.5)
    footer_zones = {}
    footer_texts = set()
    footer_pages = set()

    for y_bucket, spans in y_bands.items():
        pages_with_text = {s['page'] for s in spans}
        if len(pages_with_text) < min_pages:
            continue
        sample_texts = {s['text'] for s in spans[:5]}
        footer_zones[y_bucket] = {
            'samples': sample_texts,
            'avg_size': sum(s['size'] for s in spans) / len(spans),
            'page_count': len(pages_with_text),
        }
        footer_texts.update(sample_texts)
        footer_pages.update(pages_with_text)

    return {
        'footer_zones': footer_zones,
        'footer_pages': len(footer_pages),
        'footer_texts': footer_texts,
        'bottom_pct': bottom_pct,
    }
|
|
|
|
| |
| |
| |
# Matches a `cm` transformation matrix whose third operand is negative,
# immediately followed by the start of a text object (BT ... Td/Tm).
# Groups 1 and 2 capture the first two matrix operands.
_ROTATED_TEXT_RE = re.compile(
    r'([\d.]+)\s+([\d.]+)\s+(-[\d.]+)\s+([\d.]+)\s+'
    r'[\d.\-]+\s+[\d.\-]+\s+cm\s*\n?'
    r'BT\s*\n?'
    r'[\d.\-]+\s+[\d.\-]+\s+(?:Td|Tm)'
)


def build_watermark_pattern(doc):
    """Build regex to match rotated watermark blocks in content streams.

    Scans the content streams of the first five pages for a transformation
    matrix followed by a text object. The first match whose leading operands
    indicate rotation (first operand below 0.99 or second above 0.01) is
    turned into a pattern spanning from ``q`` plus those two operands to the
    next ``TJ ... ET ... Q``.

    Returns:
        The regex source string, or None when no rotated text block is found.
    """
    for i in range(min(5, len(doc))):
        page = doc[i]
        for cx in page.get_contents():
            stream = doc.xref_stream(cx)
            if not stream:
                continue
            # latin-1 maps every byte to one character, so nothing is lost.
            text = stream.decode('latin-1', errors='replace')

            for m in _ROTATED_TEXT_RE.finditer(text):
                try:
                    a, b = float(m.group(1)), float(m.group(2))
                except ValueError:
                    # [\d.]+ also admits non-numbers such as "." or "1.2.3";
                    # skip those instead of aborting the whole scan.
                    continue
                if a < 0.99 or b > 0.01:
                    return (
                        r'q\s*\n?'
                        + re.escape(m.group(1)) + r'\s+' + re.escape(m.group(2))
                        + r'[\s\S]*?TJ\s*\n?\s*ET\s*\n?\s*Q'
                    )
    return None
|
|
|
|
def clean_pdf(input_path, output_path, dry_run=False, no_markdown=False):
    """Main cleaning pipeline: detect, build patterns, remove, verify, export.

    Args:
        input_path: PDF to clean.
        output_path: Where the cleaned PDF is saved.
        dry_run: When true, only scan and report; nothing is written.
        no_markdown: When true, the 'Export' step is dropped.

    Returns:
        True when nothing needed cleaning, when running as a dry run, or when
        clean_text_pdf reported success; otherwise False.
    """
    # Remember the user's file: input_path may be re-pointed at a temporary
    # copy below, and the summary must never name that temporary file.
    source_path = input_path
    doc = fitz.open(input_path)
    total = len(doc)

    steps = list(STEPS)
    if no_markdown:
        steps = [s for s in steps if s != 'Export']

    dash = Dashboard(input_path, total, steps)

    # Decrypt: encrypted or non-modifiable documents are first copied into a
    # fresh document, and all further work happens on that copy.
    was_printed = False
    step_idx = 0
    needs_copy = doc.is_encrypted or doc.permissions & fitz.PDF_PERM_MODIFY == 0
    dash.set_step(step_idx, 'Decrypt')
    if needs_copy:
        dash.add_finding(f'{DIM}encrypted → printing unencrypted copy{RESET}')
        printed_path = print_to_pdf(input_path, dash)
        doc.close()
        doc = fitz.open(printed_path)
        input_path = printed_path
        total = len(doc)
        was_printed = True

    # Scan
    step_idx = steps.index('Scan')
    dash.set_step(step_idx, 'Scan')
    dash.progress(0.3, 1, 'detecting watermarks...')
    wm_findings = detect_watermarks(doc)
    dash.progress(0.7, 1, 'detecting footers...')
    ft_findings = detect_footers(doc)
    dash.progress(1, 1, 'done')
    findings = {**wm_findings, **ft_findings}

    if findings['watermark_texts']:
        dash.add_finding(f'{CYAN}watermarks{RESET} {findings["watermark_pages"]}/{total} pages')

    if findings['footer_zones']:
        n_zones = len(findings['footer_zones'])
        dash.add_finding(f'{CYAN}footers{RESET} {n_zones} zone{"s" if n_zones > 1 else ""}, {findings["footer_pages"]}/{total} pages')

    if not findings['watermark_texts'] and not findings['footer_zones']:
        dash.add_finding(f'{DIM}nothing to clean{RESET}')
        # Release the handle before deleting the temporary copy it points at.
        doc.close()
        if was_printed:
            os.remove(input_path)
        # No output file was written, so report the untouched source.
        dash.finish(source_path)
        return True

    if dry_run:
        dash.add_finding(f'{YELLOW}--dry-run: no changes made{RESET}')
        doc.close()
        if was_printed:
            os.remove(input_path)
        # Report the user's file, not the temporary copy just removed.
        dash.finish(source_path)
        return True

    # Clean + verify (clean_text_pdf closes `doc` itself).
    step_idx = steps.index('Clean')
    success, wm_total, ft_total = clean_text_pdf(doc, findings, input_path, output_path, total, dash, step_idx)

    if was_printed and os.path.exists(input_path):
        os.remove(input_path)

    dash.add_finding(f'{GREEN}removed{RESET} {wm_total} watermarks, {ft_total} footers')

    # Export markdown only for a verified-clean result.
    md_path = None
    if success and not no_markdown:
        step_idx = steps.index('Export')
        md_path = export_markdown(output_path, dash, step_idx)

    dash.finish(output_path, md_path)
    return success
|
|
|
|
def clean_pdf_headless(input_path, output_path, export_md=True):
    """Headless cleaning pipeline for web/API usage.

    Runs the same stages as the interactive pipeline but with a silent
    dashboard. When nothing needs cleaning, the input is copied to
    ``output_path`` unchanged.

    Args:
        input_path: PDF to clean.
        output_path: Where the cleaned (or copied) PDF is written.
        export_md: When true, also attempt a markdown export of the result.

    Returns:
        Tuple ``(success, warnings, md_path)``. ``warnings`` is a list of
        human-readable messages, starting with a removal summary; ``md_path``
        is None when no markdown file was produced.
    """
    doc = fitz.open(input_path)
    total = len(doc)
    steps = list(STEPS) if export_md else [s for s in STEPS if s != 'Export']
    dash = NullDashboard(input_path, total, steps)

    was_printed = False
    step_idx = 0
    needs_copy = doc.is_encrypted or doc.permissions & fitz.PDF_PERM_MODIFY == 0
    dash.set_step(step_idx, 'Decrypt')
    if needs_copy:
        # Work on a freshly written copy of encrypted/restricted documents.
        printed_path = print_to_pdf(input_path, dash)
        doc.close()
        doc = fitz.open(printed_path)
        input_path = printed_path
        total = len(doc)
        was_printed = True

    step_idx = steps.index('Scan')
    dash.set_step(step_idx, 'Scan')
    wm_findings = detect_watermarks(doc)
    ft_findings = detect_footers(doc)
    findings = {**wm_findings, **ft_findings}

    if not findings['watermark_texts'] and not findings['footer_zones']:
        doc.close()
        shutil.copy2(input_path, output_path)
        if was_printed:
            os.remove(input_path)
        return True, ['Nothing to clean — PDF is already clean'], None

    # clean_text_pdf closes `doc` itself.
    step_idx = steps.index('Clean')
    success, wm_total, ft_total = clean_text_pdf(doc, findings, input_path, output_path, total, dash, step_idx)

    if was_printed and os.path.exists(input_path):
        os.remove(input_path)

    warnings = [f'Removed {wm_total} watermarks, {ft_total} footers', *dash.warnings]
    # Warnings raised on the dashboard from here on (e.g. by the export step)
    # are not in the snapshot above, so remember where it ended.
    already_reported = len(dash.warnings)

    # Export runs even when verification reported problems; those problems
    # are surfaced through `warnings` and `success`.
    md_path = None
    if export_md:
        step_idx = steps.index('Export')
        try:
            md_path = export_markdown(output_path, dash, step_idx)
        except Exception as e:
            warnings.append(f'Markdown export failed: {e}')
        warnings.extend(dash.warnings[already_reported:])

    return success, warnings, md_path
|
|
|
|
def _strip_watermarks(doc, page, wm_regex):
    """Delete every *wm_regex* match from the page's content streams; return the match count."""
    removed = 0
    for cx in page.get_contents():
        stream = doc.xref_stream(cx)
        if not stream:
            continue
        # latin-1 maps each byte to exactly one character, so the
        # decode/encode round trip leaves untouched bytes unchanged.
        text, count = wm_regex.subn('', stream.decode('latin-1', errors='replace'))
        if count > 0:
            removed += count
            doc.update_stream(cx, text.encode('latin-1'))
    return removed


def _redact_footers(page, footer_y_buckets, threshold_y):
    """Redact spans below *threshold_y* whose Y bucket is a footer zone; return the span count."""
    marked = 0
    for b in page.get_text('dict')['blocks']:
        if b['type'] != 0:
            continue
        for line in b['lines']:
            for s in line['spans']:
                text = s['text'].strip()
                if not text or len(text) < 3:
                    continue
                bbox = s['bbox']
                if bbox[1] < threshold_y:
                    continue
                # Same 5-unit bucketing as used when the zones were detected.
                if round(bbox[1] / 5) * 5 in footer_y_buckets:
                    page.add_redact_annot(fitz.Rect(bbox), fill=(1, 1, 1))
                    marked += 1
    page.apply_redactions()
    return marked


def _verify_output(output_path, findings, total, dash):
    """Re-read the saved PDF; return (pages with watermark text, pages with footer text, empty pages)."""
    # Sort before sampling: a set's iteration order varies between runs, which
    # would make the verification result non-deterministic.
    footer_samples = sorted(findings['footer_texts'])[:3]
    remaining_wm = 0
    remaining_ft = 0
    empty_pages = 0

    doc = fitz.open(output_path)
    try:
        n_pages = len(doc)
        for i in range(n_pages):
            text = doc[i].get_text()
            if any(wt in text for wt in findings['watermark_texts']):
                remaining_wm += 1
            if any(ft in text for ft in footer_samples):
                remaining_ft += 1
            # Near-empty pages are not counted among the last three pages.
            if len(text.strip()) < 20 and i < total - 3:
                empty_pages += 1
            dash.progress(i + 1, n_pages, f'page {i+1}/{n_pages}')
    finally:
        doc.close()
    return remaining_wm, remaining_ft, empty_pages


def clean_text_pdf(doc, findings, input_path, output_path, total, dash, step_idx):
    """Clean PDF: watermarks via content stream regex, footers via redaction.

    Args:
        doc: Open document for ``input_path``; it is closed by this function.
        findings: Merged results of detect_watermarks and detect_footers.
        input_path: Path the document is reopened from for editing.
        output_path: Where the cleaned document is saved.
        total: Number of pages to process.
        dash: Dashboard receiving progress updates and warnings.
        step_idx: Index of the 'Clean' step in the dashboard's steps.

    Returns:
        Tuple ``(success, wm_total, ft_total)``: whether verification found no
        leftover watermark/footer text and no unexpectedly empty pages, the
        number of watermark blocks removed, and the number of footer spans
        redacted.
    """
    dash.set_step(step_idx, 'Clean')
    wm_pattern = build_watermark_pattern(doc)
    # Compile once instead of once per content stream.
    wm_regex = re.compile(wm_pattern) if wm_pattern else None
    footer_y_buckets = set(findings.get('footer_zones', {}).keys())
    bottom_pct = findings.get('bottom_pct', 0.12)

    # The caller's handle is closed and the file reopened before editing.
    doc.close()
    doc = fitz.open(input_path)

    wm_total = 0
    ft_total = 0
    try:
        for i in range(total):
            page = doc[i]
            threshold_y = page.rect.height * (1 - bottom_pct)

            if wm_regex is not None:
                wm_total += _strip_watermarks(doc, page, wm_regex)

            if footer_y_buckets:
                ft_total += _redact_footers(page, footer_y_buckets, threshold_y)

            dash.progress(i + 1, total, f'page {i+1}/{total}')

        doc.save(output_path, garbage=4, deflate=True)
    finally:
        # Release the handle even if editing or saving fails.
        doc.close()

    verify_idx = dash.steps.index('Verify')
    dash.set_step(verify_idx, 'Verify')
    remaining_wm, remaining_ft, empty_pages = _verify_output(output_path, findings, total, dash)

    if remaining_wm:
        dash.add_warning(f'{remaining_wm} pages still have watermarks')
    if remaining_ft:
        dash.add_warning(f'{remaining_ft} pages still have footers')
    if empty_pages:
        dash.add_warning(f'{empty_pages} pages appear empty')

    return remaining_wm == 0 and remaining_ft == 0 and empty_pages == 0, wm_total, ft_total
|
|
|
|
| |
| |
| |
def extract_title(pdf_path):
    """Extract document title from PDF metadata or largest text on first page.

    The metadata title is used when it is longer than five characters, is not
    the placeholder 'unnamed document', and produces a non-empty slug.
    Otherwise the span with the largest font size on the first page is used,
    ignoring very short spans and spans that look like rotated watermarks.

    Returns:
        A filename slug, or None when no usable title is found.
    """
    doc = fitz.open(pdf_path)
    try:
        # The metadata dict, or its 'title' entry, may be None.
        meta_title = ((doc.metadata or {}).get('title') or '').strip()
        if meta_title and len(meta_title) > 5 and meta_title.lower() != 'unnamed document':
            slug = slugify(meta_title)
            if slug:
                return slug
            # A title made only of punctuation slugifies to nothing; fall
            # through to the page text instead of giving up.

        if len(doc) == 0:
            return None

        page = doc[0]
        page_h = page.rect.height
        largest_size = 0
        largest_text = ''
        for b in page.get_text('dict')['blocks']:
            if b['type'] != 0:
                continue
            for line in b['lines']:
                for s in line['spans']:
                    text = s['text'].strip()
                    if not text or len(text) <= 5:
                        continue
                    # Same rotated-watermark heuristic as detect_watermarks:
                    # a watermark must never be taken for the title.
                    bbox = s['bbox']
                    height_span = bbox[3] - bbox[1]
                    rotated = height_span > s['size'] * 3
                    if rotated and (s['size'] > 30 or height_span > page_h * 0.3):
                        continue
                    if s['size'] > largest_size:
                        largest_size = s['size']
                        largest_text = text
    finally:
        # Close the handle on every path, including errors.
        doc.close()

    return slugify(largest_text) if largest_text else None
|
|
|
|
def slugify(text):
    """Convert text to a filesystem-safe filename slug.

    Characters other than word characters, whitespace and hyphens are
    dropped; runs of whitespace/underscores become a single hyphen; the
    result is lower-cased and limited to 80 characters, with no hyphen at
    either end.

    Returns:
        The slug, or None when nothing usable remains.
    """
    slug = re.sub(r'[^\w\s-]', '', text)
    slug = re.sub(r'[\s_]+', '-', slug).strip('-').lower()
    # Cutting at 80 characters can land right after a separator; strip again
    # so the slug never ends in a hyphen.
    slug = slug[:80].rstrip('-')
    return slug or None
|
|
|
|
| |
| |
| |
def export_markdown(pdf_path, dash, step_idx):
    """Convert the cleaned PDF to a markdown file written next to it.

    Uses docling with every optional pipeline feature switched off, then
    tidies the generated markdown with a few regex passes.

    Args:
        pdf_path: PDF to convert; the output uses the same path with '.md'.
        dash: Dashboard receiving step/progress updates and warnings.
        step_idx: Index of the 'Export' step in the dashboard's steps.

    Returns:
        Path of the written markdown file, or None when docling is missing.
    """
    try:
        from docling.document_converter import DocumentConverter, PdfFormatOption
        from docling.datamodel.pipeline_options import PdfPipelineOptions
        from docling.datamodel.base_models import InputFormat
    except ImportError:
        dash.add_warning('docling not installed, skipping markdown')
        return None

    dash.set_step(step_idx, 'Export')
    stem, _ext = os.path.splitext(pdf_path)
    md_path = stem + '.md'

    dash.progress(0.2, 1, 'loading docling...')

    # Disable every optional feature on the pipeline options.
    options = PdfPipelineOptions()
    for feature in (
        'generate_page_images',
        'generate_picture_images',
        'do_ocr',
        'do_table_structure',
        'do_picture_classification',
        'do_picture_description',
        'do_code_enrichment',
        'do_formula_enrichment',
    ):
        setattr(options, feature, False)

    pdf_option = PdfFormatOption(pipeline_options=options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_option})

    dash.progress(0.4, 1, 'converting pdf → markdown...')
    conversion = converter.convert(pdf_path)
    markdown = conversion.document.export_to_markdown()

    dash.progress(0.8, 1, 'cleaning markdown...')

    # Drop image placeholder comments.
    markdown = re.sub(r'<!-- image -->\n?', '', markdown)

    # Discard everything that precedes the first markdown heading.
    heading = re.search(r'^#{1,6}\s+', markdown, re.MULTILINE)
    if heading is not None:
        markdown = markdown[heading.start():]

    # Re-join paragraphs split mid-sentence: a blank line between text ending
    # in a letter/comma/semicolon/dash and text starting with a lowercase letter.
    markdown = re.sub(
        r'([a-zA-Z,;\-\u2013\u2014])\n\n([a-z])',
        r'\1 \2',
        markdown,
    )

    # Remove paragraphs that consist only of a 1-4 digit number.
    markdown = re.sub(r'\n\n\d{1,4}\n\n', '\n\n', markdown)

    dash.progress(1, 1, f'{len(markdown):,} chars')

    with open(md_path, 'w', encoding='utf-8') as out:
        out.write(markdown)

    return md_path
|
|
|
|
| |
| |
| |
if __name__ == '__main__':
    # Command-line entry point: clean each input PDF in turn and exit with
    # status 1 if any file was missing or failed verification.
    parser = argparse.ArgumentParser(description='Remove watermarks and footers from academic PDFs')
    parser.add_argument('input', nargs='+', help='Input PDF file(s)')
    parser.add_argument('-o', '--output', help='Output file (only with single input)')
    parser.add_argument('--dry-run', action='store_true', help='Scan and report only, no modifications')
    parser.add_argument('--no-markdown', action='store_true', help='Skip markdown export')
    args = parser.parse_args()

    if args.output and len(args.input) > 1:
        print("Error: -o/--output only works with a single input file")
        sys.exit(1)

    all_success = True
    for input_file in args.input:
        if not os.path.exists(input_file):
            print(f"File not found: {input_file}")
            all_success = False
            continue

        if args.output:
            output = args.output
        else:
            # Default name: <title-slug>.pdf next to the input, falling back
            # to <input>-clean<ext> when no title can be extracted.
            base, ext = os.path.splitext(input_file)
            fallback = f"{base}-clean{ext}"
            title_slug = extract_title(input_file)
            if title_slug:
                out_dir = os.path.dirname(input_file) or '.'
                output = os.path.join(out_dir, f"{title_slug}.pdf")
                # A file already named after its own title would make the
                # derived output path equal the input; never target the
                # source file implicitly.
                same_file = (
                    os.path.normcase(os.path.abspath(output))
                    == os.path.normcase(os.path.abspath(input_file))
                )
                if same_file:
                    output = fallback
            else:
                output = fallback

        success = clean_pdf(input_file, output, args.dry_run, args.no_markdown)
        if not success:
            all_success = False

    sys.exit(0 if all_success else 1)
|
|