| |
| """ |
| Convert PDF files to Markdown format |
| Supports multiple PDF extraction methods |
| """ |
|
|
import os
import re
import subprocess
import sys
import tempfile
from pathlib import Path
|
|
def convert_pdf_with_pdftotext(pdf_path, output_path):
    """Convert a PDF to plain text with the external ``pdftotext`` command.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the text file that ``pdftotext`` should write.

    Returns:
        True if the command ran and exited with status 0; False if the
        executable could not be launched or it exited with an error.
    """
    command = [
        'pdftotext',
        '-layout',
        # The caller reads the result back as UTF-8, so request that encoding
        # explicitly instead of relying on the tool's build-specific default.
        '-enc', 'UTF-8',
        pdf_path,
        output_path,
    ]
    try:
        # The tool's console output is never used; discard it rather than
        # capturing and decoding it, which could itself raise on odd bytes.
        subprocess.run(
            command,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing executable (FileNotFoundError) as well as
        # other launch failures such as a non-executable binary on PATH.
        return False
    return True
|
|
def convert_pdf_with_pymupdf(pdf_path, output_path):
    """Convert a PDF to plain text using PyMuPDF (``fitz``).

    The text of each page is extracted and the pages are written to
    ``output_path`` separated by a blank line.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the UTF-8 text file to write.

    Returns:
        True on success. False if PyMuPDF is not importable (silently) or if
        any other error occurs (after printing the error).
    """
    try:
        import fitz

        doc = fitz.open(pdf_path)
        try:
            text_content = [page.get_text() for page in doc]
        finally:
            # Release the document even when extraction fails part-way.
            doc.close()

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(text_content))

        return True
    except ImportError:
        return False
    except Exception as e:
        print(f"Error with PyMuPDF: {e}")
        return False
|
|
def convert_pdf_with_pypdf2(pdf_path, output_path):
    """Convert a PDF to plain text using PyPDF2.

    The text of each page is extracted and the pages are written to
    ``output_path`` separated by a blank line.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the UTF-8 text file to write.

    Returns:
        True on success. False if PyPDF2 is not importable (silently) or if
        any other error occurs (after printing the error).
    """
    try:
        import PyPDF2

        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Defensive: substitute '' for a falsy result so one page without
            # extractable text cannot make the join below fail.
            text_content = [
                page.extract_text() or '' for page in pdf_reader.pages
            ]

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(text_content))

        return True
    except ImportError:
        return False
    except Exception as e:
        print(f"Error with PyPDF2: {e}")
        return False
|
|
def convert_pdf_with_pdfplumber(pdf_path, output_path):
    """Extract the text of a PDF with pdfplumber and save it as UTF-8 text.

    Pages for which pdfplumber yields no text are skipped; the remaining
    pages are written to ``output_path`` separated by a blank line.

    Returns True on success, False when pdfplumber cannot be imported
    (silently) or when any other error occurs (after printing it).
    """
    try:
        import pdfplumber

        with pdfplumber.open(pdf_path) as document:
            extracted = (page.extract_text() for page in document.pages)
            non_empty_pages = [chunk for chunk in extracted if chunk]

        with open(output_path, 'w', encoding='utf-8') as out_file:
            out_file.write('\n\n'.join(non_empty_pages))
    except ImportError:
        return False
    except Exception as e:
        print(f"Error with pdfplumber: {e}")
        return False
    else:
        return True
|
|
# Three or more line breaks (optionally with whitespace between them).
_EXCESS_BLANK_LINES = re.compile(r'\n\s*\n\s*\n+')

# A run of letters, optionally continued across apostrophes ("JOHN'S").
_HEADING_WORD = re.compile(r"[^\W\d_]+(?:['\u2019][^\W\d_]+)*")


def _heading_title_case(heading):
    """Title-case *heading* without capitalising letters after apostrophes."""
    return _HEADING_WORD.sub(lambda m: m.group(0).capitalize(), heading)


def text_to_markdown(text_content):
    """Convert plain text to basic Markdown.

    The text is stripped, runs of blank lines are collapsed to a single blank
    line, every line is stripped of surrounding whitespace, and short
    all-uppercase lines are promoted to ``##`` headings.

    A line becomes a heading when it is shorter than 100 characters, is all
    uppercase, has fewer than 10 words, and is followed by a line that is
    either blank or starts with a lowercase character. The final line of the
    text is never promoted because it has no following line.

    Args:
        text_content: The plain text to convert.

    Returns:
        The Markdown text, with lines joined by ``'\\n'``.
    """
    text = _EXCESS_BLANK_LINES.sub('\n\n', text_content.strip())

    lines = text.split('\n')
    formatted_lines = []

    for i, line in enumerate(lines):
        stripped = line.strip()
        if not stripped:
            formatted_lines.append('')
            continue

        is_heading_candidate = (
            len(stripped) < 100
            and stripped.isupper()
            and len(stripped.split()) < 10
        )
        if is_heading_candidate and i + 1 < len(lines):
            following = lines[i + 1].strip()
            if not following or following[0].islower():
                # str.title() would turn "JOHN'S" into "John'S"; use the
                # apostrophe-aware helper instead.
                formatted_lines.append(f"## {_heading_title_case(stripped)}")
                continue

        formatted_lines.append(stripped)

    return '\n'.join(formatted_lines)
|
|
def convert_pdf_to_markdown(pdf_path, output_path):
    """Convert one PDF to Markdown using the first extraction method that works.

    The extraction backends are tried in order (pdftotext, PyMuPDF,
    pdfplumber, PyPDF2). The first one that reports success produces an
    intermediate text file, which is passed through ``text_to_markdown`` and
    written to ``output_path``.

    The intermediate file lives in a private temporary directory, so it can
    never overwrite a ``.txt`` file next to the output and is always removed,
    including when conversion fails.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_path: Path of the Markdown file to write.

    Returns:
        True if the Markdown file was written; False if the PDF does not
        exist, no backend succeeded, or post-processing failed.
    """
    pdf_path = Path(pdf_path)
    output_path = Path(output_path)

    if not pdf_path.exists():
        print(f"❌ PDF file not found: {pdf_path}")
        return False

    # Ordered by preference; each callable returns True on success.
    methods = [
        ("pdftotext", convert_pdf_with_pdftotext),
        ("PyMuPDF", convert_pdf_with_pymupdf),
        ("pdfplumber", convert_pdf_with_pdfplumber),
        ("PyPDF2", convert_pdf_with_pypdf2),
    ]

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_text_path = Path(temp_dir) / f"{output_path.stem}.txt"

        for method_name, method_func in methods:
            print(f" Trying {method_name}...", end=' ')
            if method_func(str(pdf_path), str(temp_text_path)):
                print("✓ Success")
                break
            print("✗ Not available")
        else:
            # The loop finished without a break: every backend failed.
            print(f"❌ No PDF conversion method available for {pdf_path.name}")
            print(" Please install one of: pdftotext, PyMuPDF, pdfplumber, or PyPDF2")
            return False

        try:
            text_content = temp_text_path.read_text(encoding='utf-8')
            markdown_content = text_to_markdown(text_content)
            output_path.write_text(markdown_content, encoding='utf-8')
        except Exception as e:
            print(f"❌ Error processing text: {e}")
            return False

    return True
|
|
def process_directory(directory, output_base='manuscripts'):
    """Convert every PDF found under *directory* to Markdown.

    The directory is searched recursively for ``*.pdf`` files. Each PDF is
    converted to ``<output_base>/<relative sub-directory>/<stem>.md``, so the
    input directory layout is mirrored under ``output_base``. Progress and a
    final success count are printed.

    Args:
        directory: Root directory to search for PDF files.
        output_base: Root directory for the generated Markdown files.

    Returns:
        None.
    """
    directory = Path(directory)
    # Sorted for a deterministic processing order; directories that merely
    # happen to be named "*.pdf" are skipped.
    pdf_files = sorted(p for p in directory.rglob('*.pdf') if p.is_file())

    if not pdf_files:
        print(f"No PDF files found in {directory}")
        return

    print(f"📄 Found {len(pdf_files)} PDF file(s) to convert:\n")

    success_count = 0
    for pdf_file in pdf_files:
        # Mirror the PDF's location relative to the input root.
        relative_path = pdf_file.relative_to(directory)
        output_dir = Path(output_base) / relative_path.parent
        output_file = output_dir / f"{pdf_file.stem}.md"

        output_dir.mkdir(parents=True, exist_ok=True)

        print(f"Converting: {pdf_file.name}")
        if convert_pdf_to_markdown(pdf_file, output_file):
            print(f" ✓ Created: {output_file}\n")
            success_count += 1
        else:
            print(f" ✗ Failed: {pdf_file.name}\n")

    print(f"✅ Successfully converted {success_count}/{len(pdf_files)} files")
|
|
def main():
    """Command-line entry point: parse arguments and convert a directory.

    Accepts an optional positional input directory and an ``-o/--output``
    directory (both default to ``manuscripts``) and passes them to
    ``process_directory``.
    """
    from argparse import ArgumentParser

    cli = ArgumentParser(description='Convert PDF files to Markdown')
    cli.add_argument(
        'input',
        nargs='?',
        default='manuscripts',
        help='Input directory (default: manuscripts)',
    )
    cli.add_argument(
        '-o',
        '--output',
        default='manuscripts',
        help='Output directory (default: manuscripts)',
    )

    options = cli.parse_args()
    process_directory(options.input, options.output)
|
|
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|
|
|
|