#!/usr/bin/env python3
# Render the differences between two directory trees as a simple PDF.
#
# The script walks an "old" and a "new" directory, summarises which files
# were added, removed or changed, and appends a colourised unified diff for
# every changed text file.  The PDF is written by a minimal built-in writer
# (`SimplePdf`), so only the standard library is required.
from __future__ import annotations

import argparse
import difflib
import hashlib
import os
import textwrap
from dataclasses import dataclass
from pathlib import Path

# File suffixes (lower-case) that are diffed line by line; every other
# changed file is only reported with its size and hash.
_TEXT_SUFFIXES = frozenset(
    {".tex", ".bib", ".txt", ".md", ".rst", ".csv", ".json", ".yaml", ".yml"}
)

# Single-pass translation table for PDF literal strings.  One pass means the
# backslashes introduced by an escape are never escaped a second time.
_PDF_ESCAPES = str.maketrans(
    {
        "\\": "\\\\",
        "(": "\\(",
        ")": "\\)",
        "\r": "\\r",
        "\t": "\\t",
        "\n": "\\n",
        "\b": "\\b",
        "\f": "\\f",
    }
)

# Python codec matching the /WinAnsiEncoding declared on the font object.
_PDF_TEXT_ENCODING = "cp1252"

# Colours used in the report, as (r, g, b) components in the 0..1 range.
_BLACK = (0.0, 0.0, 0.0)
_TITLE = (0.1, 0.1, 0.1)
_HEADING = (0.1, 0.1, 0.6)
_MUTED = (0.2, 0.2, 0.2)
_ADDED = (0.0, 0.45, 0.0)
_REMOVED = (0.75, 0.0, 0.0)


def sha256_short(data: bytes, n: int = 12) -> str:
    """Return the first *n* hex digits of the SHA-256 digest of *data*."""
    return hashlib.sha256(data).hexdigest()[:n]


def is_probably_text(path: Path) -> bool:
    """Return True when *path* has one of the known text-file suffixes.

    Only the suffix is inspected (case-insensitively); the file content is
    never read.
    """
    return path.suffix.lower() in _TEXT_SUFFIXES


def escape_pdf_string(text: str) -> str:
    """Escape *text* for use inside a PDF literal string ``( ... )``.

    Backslash and both parentheses are backslash-escaped, and the control
    characters CR, TAB, LF, BS and FF are written as their PDF escape
    sequences so the literal never contains a raw line break.
    """
    return text.translate(_PDF_ESCAPES)


@dataclass(frozen=True)
class PdfLine:
    """One line of output text together with its fill colour."""

    text: str
    rgb: tuple[float, float, float]


class SimplePdf:
    """Minimal writer producing a text-only PDF set in Courier.

    All geometry values are in PDF points.  ``margin_top`` is the baseline
    of the first line measured from the bottom of the page, ``leading`` is
    the distance between baselines, and ``max_chars`` is the column at which
    long lines are wrapped.
    """

    def __init__(
        self,
        page_width: int = 612,  # US Letter
        page_height: int = 792,
        margin_left: int = 40,
        margin_top: int = 760,
        margin_bottom: int = 40,
        font_size: int = 8,
        leading: int = 10,
        max_chars: int = 100,
    ) -> None:
        self.page_width = page_width
        self.page_height = page_height
        self.margin_left = margin_left
        self.margin_top = margin_top
        self.margin_bottom = margin_bottom
        self.font_size = font_size
        self.leading = leading
        self.max_chars = max_chars

    def _lines_per_page(self) -> int:
        """Return how many lines fit between the top and bottom margins."""
        usable = self.margin_top - self.margin_bottom
        return max(1, usable // self.leading)

    def _chunk_lines(self, lines: list[PdfLine]) -> list[list[PdfLine]]:
        """Wrap *lines* to ``max_chars`` and split them into pages.

        Every wrapped fragment keeps the colour of the line it came from.
        Returns one list of lines per page; the result is empty when
        *lines* is empty.
        """
        per_page = self._lines_per_page()  # invariant, so computed once
        chunks: list[list[PdfLine]] = []
        current: list[PdfLine] = []
        for line in lines:
            for part in self._wrap_preserve(line.text, self.max_chars):
                current.append(PdfLine(part, line.rgb))
                if len(current) >= per_page:
                    chunks.append(current)
                    current = []
        if current:
            chunks.append(current)
        return chunks

    @staticmethod
    def _wrap_preserve(text: str, width: int) -> list[str]:
        """Wrap *text* to *width* columns, repeating its leading indent.

        Leading spaces are preserved on every fragment for diff
        readability.  The indent is capped at ``width - 1`` so at least one
        column is always left for content.  A non-positive *width* is
        treated as 1.
        """
        width = max(1, width)
        if len(text) <= width:
            return [text]

        rest = text.lstrip(" ")
        wrapped: list[str] = []
        prefix = ""
        if rest:
            prefix = " " * min(len(text) - len(rest), width - 1)
            wrapped = textwrap.wrap(
                rest,
                width=width - len(prefix),
                break_long_words=True,
                break_on_hyphens=False,
                drop_whitespace=False,
            )
        if not wrapped:
            # Nothing for textwrap to work with (e.g. a line consisting only
            # of spaces): hard-split so no fragment exceeds the width.
            return [text[i : i + width] for i in range(0, len(text), width)]
        return [prefix + piece for piece in wrapped]

    def render(self, lines: list[PdfLine], out_path: Path) -> None:
        """Lay out *lines* and write the resulting PDF to *out_path*.

        Text is encoded as cp1252 to match the font's WinAnsiEncoding;
        characters outside that code page are replaced with ``?``.  An
        empty *lines* list still produces a document with one blank page.
        """
        # A page tree without any page is rejected by some viewers.
        pages = self._chunk_lines(lines) or [[]]

        objects: list[bytes] = []

        def add_object(payload: bytes) -> int:
            """Append an object body and return its 1-based object number."""
            objects.append(payload)
            return len(objects)

        # Font (Courier).  The explicit encoding makes bytes >= 0x80 in the
        # content streams map to the same glyphs the cp1252 codec assumes.
        font_obj = add_object(
            b"<< /Type /Font /Subtype /Type1 /BaseFont /Courier "
            b"/Encoding /WinAnsiEncoding >>"
        )

        content_objs: list[int] = []
        for page_lines in pages:
            stream = self._content_stream(page_lines).encode(
                _PDF_TEXT_ENCODING, errors="replace"
            )
            content_objs.append(
                add_object(
                    b"<< /Length "
                    + str(len(stream)).encode("ascii")
                    + b" >>\nstream\n"
                    + stream
                    + b"\nendstream"
                )
            )

        # The Pages object lists its kids, but every Page needs the Pages
        # object number for /Parent, so reserve the slot first and fill it
        # in once the page objects exist.
        pages_obj_num = add_object(b"")

        page_objs: list[int] = []
        for content_obj in content_objs:
            page_objs.append(
                add_object(
                    (
                        f"<< /Type /Page /Parent {pages_obj_num} 0 R "
                        f"/MediaBox [0 0 {self.page_width} {self.page_height}] "
                        f"/Resources << /Font << /F1 {font_obj} 0 R >> >> "
                        f"/Contents {content_obj} 0 R >>"
                    ).encode("ascii")
                )
            )

        kids = " ".join(f"{n} 0 R" for n in page_objs)
        objects[pages_obj_num - 1] = (
            f"<< /Type /Pages /Kids [{kids}] /Count {len(page_objs)} >>".encode(
                "ascii"
            )
        )

        catalog_obj = add_object(
            f"<< /Type /Catalog /Pages {pages_obj_num} 0 R >>".encode("ascii")
        )

        self._write_pdf(objects, catalog_obj, out_path)

    def _content_stream(self, page_lines: list[PdfLine]) -> str:
        """Return the text-drawing operators for one page as a string.

        The fill colour operator is only emitted when the colour differs
        from that of the previous line.
        """
        parts = [
            "BT",
            f"/F1 {self.font_size} Tf",
            f"{self.margin_left} {self.margin_top} Td",
        ]
        prev_rgb: tuple[float, float, float] | None = None
        for index, line in enumerate(page_lines):
            if index:
                # Move down one line; the first line sits at the start point.
                parts.append(f"0 -{self.leading} Td")
            if prev_rgb != line.rgb:
                r, g, b = line.rgb
                parts.append(f"{r:.3f} {g:.3f} {b:.3f} rg")
                prev_rgb = line.rgb
            parts.append(f"({escape_pdf_string(line.text)}) Tj")
        parts.append("ET")
        return "\n".join(parts)

    @staticmethod
    def _write_pdf(objects: list[bytes], root_obj_num: int, out_path: Path) -> None:
        """Serialise *objects* with header, xref table and trailer.

        ``objects[i]`` becomes indirect object ``i + 1``; *root_obj_num* is
        the object number of the document catalog.  Missing parent
        directories of *out_path* are created.
        """
        # PDF header: include binary chars line per spec recommendation.
        out = bytearray()
        out += b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n"

        offsets: list[int] = []
        for number, payload in enumerate(objects, start=1):
            offsets.append(len(out))  # byte offset of "N 0 obj"
            out += f"{number} 0 obj\n".encode("ascii")
            out += payload + b"\nendobj\n"

        xref_start = len(out)
        out += f"xref\n0 {len(objects) + 1}\n".encode("ascii")
        out += b"0000000000 65535 f \n"  # entry for the reserved object 0
        for pos in offsets:
            out += f"{pos:010d} 00000 n \n".encode("ascii")

        out += (
            b"trailer\n"
            + f"<< /Size {len(objects) + 1} /Root {root_obj_num} 0 R >>\n".encode(
                "ascii"
            )
            + b"startxref\n"
            + str(xref_start).encode("ascii")
            + b"\n%%EOF\n"
        )

        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_bytes(out)


def _decode_lines(data: bytes) -> list[str]:
    """Decode *data* as UTF-8 (replacing bad bytes) and split it into lines."""
    return data.decode("utf-8", errors="replace").splitlines()


def _diff_line_color(line: str, is_header: bool = False) -> tuple[float, float, float]:
    """Return the colour for one line of unified-diff output.

    *is_header* marks the ``---``/``+++`` file header lines.  They are
    identified by position by the caller rather than by their text, because
    a removed line whose content starts with ``--`` (or an added line
    starting with ``++``) looks exactly like a header.
    """
    if is_header:
        return _MUTED
    if line.startswith("@@"):
        return _HEADING
    if line.startswith("+"):
        return _ADDED
    if line.startswith("-"):
        return _REMOVED
    return _BLACK


def diff_to_pdf_lines(old_root: Path, new_root: Path) -> list[PdfLine]:
    """Build the report lines describing how *new_root* differs from *old_root*.

    The report starts with a summary of changed text files, changed binary
    files (with size and short SHA-256), added files and removed files,
    followed by a unified diff with three lines of context for every
    changed text file.  Files count as changed when their bytes differ.
    """
    old_files = {p.relative_to(old_root) for p in old_root.rglob("*") if p.is_file()}
    new_files = {p.relative_to(new_root) for p in new_root.rglob("*") if p.is_file()}

    added = sorted(new_files - old_files)
    removed = sorted(old_files - new_files)

    # Each common file is read exactly once; what the report needs later
    # (decoded lines, or size/hash text) is derived while the bytes are
    # in hand.
    changed_text: list[tuple[Path, list[str], list[str]]] = []
    changed_bin: list[tuple[Path, str]] = []
    for rel in sorted(old_files & new_files):
        old_bytes = (old_root / rel).read_bytes()
        new_bytes = (new_root / rel).read_bytes()
        if old_bytes == new_bytes:
            continue
        if is_probably_text(rel):
            changed_text.append(
                (rel, _decode_lines(old_bytes), _decode_lines(new_bytes))
            )
        else:
            changed_bin.append(
                (
                    rel,
                    f"(old {len(old_bytes)}B {sha256_short(old_bytes)}, "
                    f"new {len(new_bytes)}B {sha256_short(new_bytes)})",
                )
            )

    lines: list[PdfLine] = []

    def add_line(text: str, rgb: tuple[float, float, float] = _BLACK) -> None:
        """Append one report line in the given colour."""
        lines.append(PdfLine(text, rgb))

    add_line(f"Diff: {old_root} -> {new_root}", _TITLE)
    add_line("")

    add_line("Summary", _HEADING)
    add_line(f"- Changed text files: {len(changed_text)}")
    for rel, _old, _new in changed_text:
        add_line(f" - {rel}")
    add_line(f"- Changed binary files: {len(changed_bin)}")
    for rel, details in changed_bin:
        add_line(f" - {rel} {details}")
    add_line(f"- Added files: {len(added)}")
    for rel in added:
        data = (new_root / rel).read_bytes()
        add_line(f" - {rel} ({len(data)}B {sha256_short(data)})")
    add_line(f"- Removed files: {len(removed)}")
    for rel in removed:
        add_line(f" - {rel}")

    add_line("")
    add_line("Unified diffs (text)", _HEADING)
    add_line("")

    for rel, old_lines, new_lines in changed_text:
        add_line(f"File: {rel}", _MUTED)
        diff_lines = difflib.unified_diff(
            old_lines,
            new_lines,
            fromfile=str(old_root / rel),
            tofile=str(new_root / rel),
            lineterm="",
            n=3,
        )
        emitted = False
        for index, diff_line in enumerate(diff_lines):
            # unified_diff yields the two file header lines first.
            add_line(diff_line, _diff_line_color(diff_line, is_header=index < 2))
            emitted = True
        if not emitted:
            # The bytes differ but the decoded lines do not.
            add_line(
                "(no textual line differences; files differ only in line "
                "endings, trailing newline, or undecodable bytes)",
                _MUTED,
            )
        add_line("")

    return lines


def main() -> int:
    """Parse the command line, write the diff PDF and print its path.

    Exits with an error message when either argument is not a directory.
    """
    parser = argparse.ArgumentParser(
        description="Generate a PDF containing unified diffs between two paper versions."
    )
    parser.add_argument("old_dir", type=Path)
    parser.add_argument("new_dir", type=Path)
    parser.add_argument(
        "-o",
        "--out",
        type=Path,
        default=Path("paper-sources/paper-301-vs-757-diff.pdf"),
        help="Output PDF path (default: paper-sources/paper-301-vs-757-diff.pdf)",
    )
    args = parser.parse_args()

    old_dir = args.old_dir.resolve()
    new_dir = args.new_dir.resolve()
    if not old_dir.is_dir():
        raise SystemExit(f"Not a directory: {old_dir}")
    if not new_dir.is_dir():
        raise SystemExit(f"Not a directory: {new_dir}")

    pdf = SimplePdf()
    lines = diff_to_pdf_lines(old_dir, new_dir)
    pdf.render(lines, args.out)
    print(args.out)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())