HR-Assistant / src /doc_parser /pdf_to_markdown.py
owenkaplinsky
Clean initial commit for HuggingFace
363cda9
"""
PDF to Markdown converter using GPT-4 Vision.
---------------------------------------------------------------------------
------------------------------ How to Use It ------------------------------
---------------------------------------------------------------------------
Process a single file:
>>> python pdf_to_markdown.py data_cv/max_mustermann_cv.pdf
Process a folder:
>>> python pdf_to_markdown.py data_cv/
Customize model or rendering:
>>> python pdf_to_markdown.py data_cv/ --model gpt-4.1 --target-width 1800 --batch-size 3
Disable column splitting:
>>> python pdf_to_markdown.py my_resume.pdf --no-halves
Set a custom output folder:
>>> python pdf_to_markdown.py data_cv/ --output processed/
🔧 Summary of Configurable Options
| Option | Description | Default |
| --------------------- | ------------------------------- | ------------------ |
| `path` | PDF file or folder path | required |
| `--output` | Output directory | `results/` |
| `--model` | OpenAI model | `gpt-4.1-mini` |
| `--target-width` | Render width per page | `2000` |
| `--batch-size` | Pages per API request | `2` |
| `--max-output-tokens` | Max tokens returned | `8192` |
| `--no-halves` | Disable left/right column crops | Enabled by default |
"""
import argparse
import os
from datetime import datetime
from pathlib import Path
from typing import Dict, List
from dotenv import load_dotenv
from openai import OpenAI
from PIL import Image
from .utils import (
render_pdf_to_images,
pil_to_png_data_uri,
split_halves,
parse_sections_from_json_text,
normalize_sections,
merge_duplicate_titles,
build_contact_section_from_filename,
process_section,
apply_postprocessing,
)
def pdf_to_markdown(
input_path: Path,
output_path: Path,
model: str = "gpt-4.1-mini",
target_width: int = 2000,
batch_size: int = 2,
max_output_tokens: int = 8192,
add_halves: bool = True,
) -> None:
"""
Process a single PDF or all PDFs in a directory and export Markdown sections.
1. Render PDF pages to images.
2. Send images in batches to GPT-4 Vision for section parsing.
3. Normalize and post-process the returned sections.
4. Save the final sections as a Markdown text file.
5. Repeat for all PDFs in the input path.
6. Output files are saved in the specified output directory.
Args:
input_path: Path to a single PDF file or a directory of PDFs.
output_path: Directory to save the output Markdown files.
model: OpenAI model to use for processing.
target_width: Target width for rendering PDF pages.
batch_size: Number of pages to send per API request.
max_output_tokens: Maximum tokens in model output.
add_halves: Whether to add left/right column crops.
"""
load_dotenv()
def log_step(message: str) -> None:
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f"[{timestamp}] {message}")
log_step("Vision-based PDF → Markdown extraction started...")
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
raise RuntimeError("OPENAI_API_KEY is not set. Add it to your environment or .env file.")
# --- Determine which PDFs to process ---
if input_path.is_file() and input_path.suffix.lower() == ".pdf":
pdf_files = [input_path]
elif input_path.is_dir():
pdf_files = sorted(input_path.glob("*.pdf"))
else:
raise ValueError(f"Invalid input path: {input_path}")
if not pdf_files:
log_step(f"No PDF files found at {input_path}")
return
output_path.mkdir(parents=True, exist_ok=True)
log_step(f"Found {len(pdf_files)} PDF file(s) in {input_path}.")
log_step(f"Using model={model}, batch_size={batch_size}, target_width={target_width}px.")
client = OpenAI()
# -------------------------- Inner helper --------------------------
def call_batch(imgs: List[Image.Image]) -> List[Dict[str, str]]:
"""Process a batch of page images → STRICT JSON sections."""
image_contents = []
for img in imgs:
data_uri = pil_to_png_data_uri(img)
image_contents.append({"type": "input_image", "image_url": data_uri})
if add_halves:
for half in split_halves(img):
image_contents.append(
{"type": "input_image", "image_url": pil_to_png_data_uri(half)}
)
system = "You are a precise document structure parser. Output ONLY valid JSON."
user = (
"From these page images, return a STRICT JSON array where each item has 'title' and 'body'. "
"Group human-meaningful sections, merge multi-line headings (two-column layouts), preserve reading order. "
"Do NOT summarize or omit content. Include headers/footers if they contain contact data. "
"Preserve bullet/numbered lists and render tables as Markdown where possible. "
"Use proper UTF-8 German diacritics (ä, ö, ü, ß). "
"Include small sidebar/column blocks and deduplicate content across full pages and crops."
)
response = client.responses.create(
model=model,
temperature=0,
max_output_tokens=max_output_tokens,
input=[
{"role": "system", "content": [{"type": "input_text", "text": system}]},
{"role": "user", "content": [{"type": "input_text", "text": user}] + image_contents},
],
)
text = getattr(response, "output_text", "") or ""
return parse_sections_from_json_text(text)
# -------------------------- Main processing --------------------------
total_files = len(pdf_files)
for index, pdf_file in enumerate(pdf_files, start=1):
log_step(f"[{index}/{total_files}] Processing {pdf_file.name}...")
pages = render_pdf_to_images(pdf_file, target_width=target_width)
if not pages:
raise RuntimeError(f"Failed to render any PDF pages for {pdf_file}.")
log_step(f"Rendered {len(pages)} page(s).")
all_sections: List[Dict[str, str]] = []
for start in range(0, len(pages), batch_size):
end = min(len(pages), start + batch_size)
batch_num = (start // batch_size) + 1
log_step(f"Batch {batch_num}: pages {start + 1}{end}.")
secs = call_batch(pages[start:end])
if secs:
all_sections.extend(secs)
log_step(f"Batch {batch_num} returned {len(secs)} section(s).")
else:
log_step(f"Batch {batch_num} returned no sections.")
if not all_sections:
raise RuntimeError(f"No sections parsed from vision model output for {pdf_file}.")
log_step(f"Received {len(all_sections)} raw section(s).")
normalized = normalize_sections(all_sections)
merged = merge_duplicate_titles(normalized)
final_sections = apply_postprocessing(merged)
contact_section = process_section(build_contact_section_from_filename(pdf_file))
final_sections.insert(0, contact_section)
out_txt = output_path / f"{pdf_file.stem}.txt"
log_step(f"Writing output to {out_txt}...")
lines: List[str] = []
for sec in final_sections:
title = (sec.get("title") or "").strip()
body = (sec.get("body") or "").strip()
if title:
lines.append(f"## {title}")
if body:
lines.append(body)
lines.append("")
while lines and lines[-1] == "":
lines.pop()
out_txt.write_text("\n".join(lines), encoding="utf-8")
log_step(f"✅ Completed processing for {pdf_file.name}.")
log_step("🎉 All PDF files processed successfully.")
print(f"\nResults saved in: {output_path.resolve()}")
# ----------------------------- CLI entrypoint -----------------------------
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Convert PDFs to structured Markdown using GPT-4 Vision."
)
parser.add_argument(
"path",
help="Path to a single PDF file or a directory containing PDF files.",
)
parser.add_argument(
"-o", "--output",
default="results",
help="Output directory for the Markdown files (default: results/)",
)
parser.add_argument(
"--model",
default=os.getenv("OPENAI_MODEL", "gpt-4.1-mini"),
help="OpenAI model to use (default: gpt-4.1-mini)",
)
parser.add_argument(
"--target-width",
type=int,
default=int(os.getenv("VISION_TARGET_WIDTH", "2000")),
help="Target width for rendering PDF pages (default: 2000 px)",
)
parser.add_argument(
"--batch-size",
type=int,
default=int(os.getenv("VISION_BATCH_PAGES", "2")),
help="Number of pages to send to the model per request (default: 2)",
)
parser.add_argument(
"--max-output-tokens",
type=int,
default=int(os.getenv("MAX_OUTPUT_TOKENS", "8192")),
help="Maximum tokens in model output (default: 8192)",
)
parser.add_argument(
"--no-halves",
action="store_true",
help="Disable left/right column splitting (default: enabled)",
)
args = parser.parse_args()
pdf_to_markdown(
input_path=Path(args.path),
output_path=Path(args.output),
model=args.model,
target_width=args.target_width,
batch_size=args.batch_size,
max_output_tokens=args.max_output_tokens,
add_halves=not args.no_halves,
)