lwm-spectro / scripts /paper_diff_to_pdf.py
Namhyun Kim
Sync local development code into HF repo
eaaeb1b
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import difflib
import hashlib
import os
import textwrap
from dataclasses import dataclass
from pathlib import Path
def sha256_short(data: bytes, n: int = 12) -> str:
    """Return the first ``n`` hexadecimal characters of the SHA-256 of ``data``.

    Args:
        data: Raw bytes to hash.
        n: Number of leading hex characters to keep (default 12).

    Returns:
        The truncated lowercase hex digest.
    """
    hasher = hashlib.sha256()
    hasher.update(data)
    hex_digest = hasher.hexdigest()
    return hex_digest[:n]
def is_probably_text(path: Path) -> bool:
    """Tell whether ``path`` has one of the known text-file extensions.

    The decision is based only on the (case-insensitive) suffix; the file
    contents are never inspected.
    """
    text_suffixes = (
        ".tex",
        ".bib",
        ".txt",
        ".md",
        ".rst",
        ".csv",
        ".json",
        ".yaml",
        ".yml",
    )
    suffix = path.suffix.lower()
    return suffix in text_suffixes
def escape_pdf_string(text: str) -> str:
    """Escape ``text`` so it can sit between ``(`` and ``)`` in a PDF literal string.

    Backslashes and parentheses are backslash-escaped; carriage returns and
    tabs are rewritten as the two-character sequences ``\\r`` and ``\\t``.
    All other characters are passed through unchanged.
    """
    # Each character is mapped independently, so an inserted backslash is
    # never escaped a second time.
    escapes = str.maketrans(
        {
            "\\": "\\\\",
            "(": "\\(",
            ")": "\\)",
            "\r": "\\r",
            "\t": "\\t",
        }
    )
    return text.translate(escapes)
@dataclass(frozen=True)
class PdfLine:
    """One line of report text together with the colour it is drawn in."""

    # Text of the line, before wrapping and PDF string escaping.
    text: str
    # (red, green, blue) components written with the PDF ``rg`` operator.
    rgb: tuple[float, float, float]
class SimplePdf:
    """Minimal PDF writer that lays out coloured lines of Courier text.

    Lines are wrapped to ``max_chars`` characters, split into pages and
    written as a PDF 1.4 file using only the built-in Courier font. Text is
    encoded as Windows-1252 (declared to the viewer as ``/WinAnsiEncoding``);
    characters outside that code page are written as ``?``.
    """

    def __init__(
        self,
        page_width: int = 612,  # US Letter
        page_height: int = 792,
        margin_left: int = 40,
        margin_top: int = 760,
        margin_bottom: int = 40,
        font_size: int = 8,
        leading: int = 10,
        max_chars: int = 100,
    ) -> None:
        """Store the page geometry.

        Args:
            page_width: Media box width in PDF points.
            page_height: Media box height in PDF points.
            margin_left: x coordinate at which every line starts.
            margin_top: y coordinate of the first line on each page.
            margin_bottom: y coordinate bounding the text area from below.
            font_size: Font size in points.
            leading: Vertical distance between consecutive lines.
            max_chars: Maximum number of characters per line before wrapping.
        """
        self.page_width = page_width
        self.page_height = page_height
        self.margin_left = margin_left
        self.margin_top = margin_top
        self.margin_bottom = margin_bottom
        self.font_size = font_size
        self.leading = leading
        self.max_chars = max_chars

    def _lines_per_page(self) -> int:
        """Return how many lines fit between the top and bottom margins (at least 1)."""
        usable = self.margin_top - self.margin_bottom
        return max(1, usable // self.leading)

    def _chunk_lines(self, lines: list[PdfLine]) -> list[list[PdfLine]]:
        """Wrap every line to ``max_chars`` and group the results into pages.

        Wrapped fragments keep the colour of the line they came from.
        Returns an empty list when ``lines`` is empty.
        """
        per_page = self._lines_per_page()  # invariant, so computed once
        chunks: list[list[PdfLine]] = []
        current: list[PdfLine] = []
        for line in lines:
            for part in self._wrap_preserve(line.text, self.max_chars):
                current.append(PdfLine(part, line.rgb))
                if len(current) >= per_page:
                    chunks.append(current)
                    current = []
        if current:
            chunks.append(current)
        return chunks

    @staticmethod
    def _wrap_preserve(text: str, width: int) -> list[str]:
        """Wrap ``text`` to ``width`` columns, repeating its leading indentation.

        Text that already fits is returned unchanged as a single-element
        list. Otherwise the leading run of spaces (capped at ``width - 1``)
        is stripped, the remainder is wrapped with ``textwrap.wrap``, and the
        indentation is prepended to every resulting piece.
        """
        if len(text) <= width:
            return [text]
        # Preserve leading whitespace for diff readability.
        leading_ws_len = len(text) - len(text.lstrip(" "))
        prefix = " " * min(leading_ws_len, width - 1)
        rest = text.lstrip(" ")
        wrapped = textwrap.wrap(
            rest,
            width=width - len(prefix),
            break_long_words=True,
            break_on_hyphens=False,
            drop_whitespace=False,
        )
        if not wrapped:
            # Nothing left after stripping (an over-long run of spaces):
            # fall back to a plain split at ``width``.
            return [text[:width], prefix + text[width:]]
        return [prefix + piece for piece in wrapped]

    def render(self, lines: list[PdfLine], out_path: Path) -> None:
        """Lay out ``lines`` and write the resulting PDF to ``out_path``.

        Args:
            lines: Lines to draw, in order.
            out_path: Destination file; missing parent directories are created.

        An empty ``lines`` list still produces a document with one blank page,
        because a page tree with zero pages is rejected by many viewers.
        """
        pages = self._chunk_lines(lines) or [[]]
        objects: list[bytes] = []

        def add_object(payload: bytes) -> int:
            """Append an object body and return its 1-based object number."""
            objects.append(payload)
            return len(objects)

        # Font (Courier). WinAnsiEncoding makes the viewer interpret string
        # bytes as Windows-1252, matching how the content streams are encoded.
        font_obj = add_object(
            b"<< /Type /Font /Subtype /Type1 /BaseFont /Courier"
            b" /Encoding /WinAnsiEncoding >>"
        )
        content_objs: list[int] = []
        page_objs: list[int] = []
        for page_lines in pages:
            # A single-byte encoding is required here: raw UTF-8 bytes would
            # be shown as one glyph per byte. Unencodable characters become "?".
            stream = self._content_stream(page_lines).encode("cp1252", errors="replace")
            content_obj = add_object(
                b"<< /Length "
                + str(len(stream)).encode("ascii")
                + b" >>\nstream\n"
                + stream
                + b"\nendstream"
            )
            content_objs.append(content_obj)

        # The /Pages object is filled in after the page objects exist, but the
        # pages need its object number for /Parent, so reserve a slot now.
        pages_obj_num = add_object(b"")
        for content_obj in content_objs:
            page_obj = add_object(
                (
                    f"<< /Type /Page /Parent {pages_obj_num} 0 R "
                    f"/MediaBox [0 0 {self.page_width} {self.page_height}] "
                    f"/Resources << /Font << /F1 {font_obj} 0 R >> >> "
                    f"/Contents {content_obj} 0 R >>"
                ).encode("ascii")
            )
            page_objs.append(page_obj)

        kids = " ".join(f"{n} 0 R" for n in page_objs)
        objects[pages_obj_num - 1] = (
            f"<< /Type /Pages /Kids [{kids}] /Count {len(page_objs)} >>".encode("ascii")
        )
        catalog_obj = add_object(
            f"<< /Type /Catalog /Pages {pages_obj_num} 0 R >>".encode("ascii")
        )
        self._write_pdf(objects, catalog_obj, out_path)

    def _content_stream(self, page_lines: list[PdfLine]) -> str:
        """Return the text-drawing operators for one page as a string.

        The stream selects font ``/F1``, moves to the top-left text origin,
        then shows each line, moving down by ``leading`` between lines and
        emitting an ``rg`` colour change only when the colour differs from
        the previous line's.
        """
        x = self.margin_left
        y = self.margin_top
        parts = [
            "BT",
            f"/F1 {self.font_size} Tf",
            f"{x} {y} Td",
        ]
        prev_rgb: tuple[float, float, float] | None = None
        first = True
        for line in page_lines:
            if first:
                first = False
            else:
                parts.append(f"0 -{self.leading} Td")
            if prev_rgb != line.rgb:
                r, g, b = line.rgb
                parts.append(f"{r:.3f} {g:.3f} {b:.3f} rg")
                prev_rgb = line.rgb
            parts.append(f"({escape_pdf_string(line.text)}) Tj")
        parts.append("ET")
        return "\n".join(parts)

    @staticmethod
    def _write_pdf(objects: list[bytes], root_obj_num: int, out_path: Path) -> None:
        """Serialise ``objects`` as a PDF file with an xref table and trailer.

        Args:
            objects: Object bodies; element ``i`` becomes object ``i + 1``.
            root_obj_num: Object number of the document catalog.
            out_path: Destination file; missing parent directories are created.
        """
        # PDF header: include binary chars line per spec recommendation.
        out = bytearray()
        out += b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n"
        xref_positions = [0]  # object 0
        for i, payload in enumerate(objects, start=1):
            # Byte offset of each object, needed for the xref table below.
            xref_positions.append(len(out))
            out += f"{i} 0 obj\n".encode("ascii")
            out += payload + b"\nendobj\n"
        xref_start = len(out)
        out += f"xref\n0 {len(objects) + 1}\n".encode("ascii")
        out += b"0000000000 65535 f \n"
        for pos in xref_positions[1:]:
            out += f"{pos:010d} 00000 n \n".encode("ascii")
        out += (
            b"trailer\n"
            + f"<< /Size {len(objects) + 1} /Root {root_obj_num} 0 R >>\n".encode(
                "ascii"
            )
            + b"startxref\n"
            + str(xref_start).encode("ascii")
            + b"\n%%EOF\n"
        )
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_bytes(out)
def diff_to_pdf_lines(old_root: Path, new_root: Path) -> list[PdfLine]:
    """Compare two directory trees and return the report as coloured lines.

    The report starts with a summary (changed text files, changed binary
    files with size and short SHA-256, added files, removed files) followed
    by a unified diff for every changed text file. Files are matched by
    their path relative to each root and compared byte for byte; whether a
    changed file counts as text is decided by ``is_probably_text``.

    Args:
        old_root: Directory holding the old version.
        new_root: Directory holding the new version.

    Returns:
        The report lines in display order.
    """
    old_files = {p.relative_to(old_root) for p in old_root.rglob("*") if p.is_file()}
    new_files = {p.relative_to(new_root) for p in new_root.rglob("*") if p.is_file()}
    added = sorted(new_files - old_files)
    removed = sorted(old_files - new_files)
    common = sorted(old_files & new_files)

    changed_text: list[Path] = []
    # (relative path, size/digest description). The description is built
    # while both byte strings are in memory so the files are read only once.
    changed_bin: list[tuple[Path, str]] = []
    for rel in common:
        b1 = (old_root / rel).read_bytes()
        b2 = (new_root / rel).read_bytes()
        if b1 == b2:
            continue
        if is_probably_text(rel):
            changed_text.append(rel)
        else:
            changed_bin.append(
                (
                    rel,
                    f"old {len(b1)}B {sha256_short(b1)}, new {len(b2)}B {sha256_short(b2)}",
                )
            )

    lines: list[PdfLine] = []

    def add_line(text: str, rgb: tuple[float, float, float] = (0, 0, 0)) -> None:
        """Append one report line in the given colour (black by default)."""
        lines.append(PdfLine(text, rgb))

    add_line(f"Diff: {old_root} -> {new_root}", (0.1, 0.1, 0.1))
    add_line("", (0, 0, 0))
    add_line("Summary", (0.1, 0.1, 0.6))
    add_line(f"- Changed text files: {len(changed_text)}", (0, 0, 0))
    for rel in changed_text:
        add_line(f" - {rel}", (0, 0, 0))
    add_line(f"- Changed binary files: {len(changed_bin)}", (0, 0, 0))
    for rel, detail in changed_bin:
        add_line(f" - {rel} ({detail})", (0, 0, 0))
    add_line(f"- Added files: {len(added)}", (0, 0, 0))
    for rel in added:
        b = (new_root / rel).read_bytes()
        add_line(f" - {rel} ({len(b)}B {sha256_short(b)})", (0, 0, 0))
    add_line(f"- Removed files: {len(removed)}", (0, 0, 0))
    for rel in removed:
        add_line(f" - {rel}", (0, 0, 0))
    add_line("", (0, 0, 0))
    add_line("Unified diffs (text)", (0.1, 0.1, 0.6))
    add_line("", (0, 0, 0))

    def color_for_diff_line(index: int, s: str) -> tuple[float, float, float]:
        """Pick the colour for the ``index``-th line of one unified diff.

        ``difflib.unified_diff`` emits the ``---``/``+++`` file headers as
        its first two lines, so they are recognised by position. Matching
        them by prefix would misclassify content lines that themselves begin
        with ``--`` or ``++`` (for example a removed YAML ``---`` separator,
        which appears in the diff as ``----``).
        """
        if index < 2:
            return (0.2, 0.2, 0.2)
        if s.startswith("@@"):
            return (0.1, 0.1, 0.6)
        if s.startswith("+"):
            return (0.0, 0.45, 0.0)
        if s.startswith("-"):
            return (0.75, 0.0, 0.0)
        return (0, 0, 0)

    for rel in changed_text:
        p1 = old_root / rel
        p2 = new_root / rel
        # Undecodable bytes are replaced rather than aborting the report.
        old_lines = p1.read_text(encoding="utf-8", errors="replace").splitlines()
        new_lines = p2.read_text(encoding="utf-8", errors="replace").splitlines()
        add_line(f"File: {rel}", (0.2, 0.2, 0.2))
        diff_lines = difflib.unified_diff(
            old_lines,
            new_lines,
            fromfile=str(p1),
            tofile=str(p2),
            lineterm="",
            n=3,
        )
        for index, s in enumerate(diff_lines):
            add_line(s, color_for_diff_line(index, s))
        add_line("", (0, 0, 0))
    return lines
def main() -> int:
    """Command-line entry point: diff two directories and write the PDF report.

    Parses ``old_dir``, ``new_dir`` and the optional ``-o/--out`` path from
    the command line, exits with an error message if either input is not a
    directory, renders the report, and prints the output path.

    Returns:
        ``0`` on success.
    """
    default_out = Path("paper-sources/paper-301-vs-757-diff.pdf")
    parser = argparse.ArgumentParser(
        description="Generate a PDF containing unified diffs between two paper versions."
    )
    parser.add_argument("old_dir", type=Path)
    parser.add_argument("new_dir", type=Path)
    parser.add_argument(
        "-o",
        "--out",
        type=Path,
        default=default_out,
        help="Output PDF path (default: paper-sources/paper-301-vs-757-diff.pdf)",
    )
    args = parser.parse_args()

    old_dir, new_dir = args.old_dir.resolve(), args.new_dir.resolve()
    # Validate the old directory first, then the new one.
    for directory in (old_dir, new_dir):
        if not directory.is_dir():
            raise SystemExit(f"Not a directory: {directory}")

    report_lines = diff_to_pdf_lines(old_dir, new_dir)
    SimplePdf().render(report_lines, args.out)
    print(args.out)
    return 0
# Run the CLI only when executed as a script; main()'s return value is
# passed to SystemExit and so becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())