|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
import argparse |
|
|
import difflib |
|
|
import hashlib |
|
|
import os |
|
|
import textwrap |
|
|
from dataclasses import dataclass |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
def sha256_short(data: bytes, n: int = 12) -> str:
    """Return the first *n* hex characters of the SHA-256 digest of *data*."""
    digest = hashlib.sha256(data).hexdigest()
    return digest[:n]
|
|
|
|
|
|
|
|
def is_probably_text(path: Path) -> bool:
    """Heuristically decide whether *path* holds diffable text.

    Judges purely by the (case-insensitive) file extension; the file is
    never opened.
    """
    text_suffixes = {
        ".tex",
        ".bib",
        ".txt",
        ".md",
        ".rst",
        ".csv",
        ".json",
        ".yaml",
        ".yml",
    }
    return path.suffix.lower() in text_suffixes
|
|
|
|
|
|
|
|
def escape_pdf_string(text: str) -> str:
    """Escape *text* for use inside a PDF literal string `(...)`.

    Backslash must be doubled first, then unbalanced-paren and control
    characters are rewritten to their PDF escape sequences. Fix: LF was
    previously left unescaped (CR and TAB were handled) — an embedded
    newline would leak raw into the content stream; it is now escaped
    as ``\\n`` for consistency.
    """
    return (
        text.replace("\\", "\\\\")  # must run before the other escapes
        .replace("(", "\\(")
        .replace(")", "\\)")
        .replace("\r", "\\r")
        .replace("\n", "\\n")
        .replace("\t", "\\t")
    )
|
|
|
|
|
|
|
|
@dataclass(frozen=True)
class PdfLine:
    """One line of output text plus the fill color to draw it with.

    Frozen (immutable and hashable) so instances can be shared freely
    between pages without risk of mutation.
    """

    # Literal text for one rendered line; escaped for PDF syntax later
    # by escape_pdf_string() when the content stream is built.
    text: str
    # Fill color as (r, g, b) components, each in the 0.0-1.0 range.
    rgb: tuple[float, float, float]
|
|
|
|
|
|
|
|
class SimplePdf:
    """Minimal dependency-free PDF writer for colored monospaced text.

    Hand-assembles a PDF 1.4 file using the built-in Courier font, one
    text column per page. Only the features this script needs are
    implemented (no compression, no fonts beyond Courier).

    Fix over the previous version: rendering an empty line list used to
    produce a page tree with ``/Count 0`` and no kids, which most PDF
    viewers reject; ``render`` now always emits at least one blank page.
    """

    def __init__(
        self,
        page_width: int = 612,
        page_height: int = 792,
        margin_left: int = 40,
        margin_top: int = 760,
        margin_bottom: int = 40,
        font_size: int = 8,
        leading: int = 10,
        max_chars: int = 100,
    ) -> None:
        """Store page geometry (units are PDF points; defaults are US Letter).

        Args:
            page_width: Media box width in points.
            page_height: Media box height in points.
            margin_left: X position of the text column.
            margin_top: Baseline of the first line on each page.
            margin_bottom: Lowest baseline allowed on a page.
            font_size: Courier size in points.
            leading: Vertical distance between consecutive baselines.
            max_chars: Wrap limit (characters) per rendered line.
        """
        self.page_width = page_width
        self.page_height = page_height
        self.margin_left = margin_left
        self.margin_top = margin_top
        self.margin_bottom = margin_bottom
        self.font_size = font_size
        self.leading = leading
        self.max_chars = max_chars

    def _lines_per_page(self) -> int:
        """Return how many text lines fit between the top and bottom margins."""
        usable = self.margin_top - self.margin_bottom
        return max(1, usable // self.leading)

    def _chunk_lines(self, lines: list[PdfLine]) -> list[list[PdfLine]]:
        """Wrap long lines, then split the line stream into per-page chunks."""
        chunks: list[list[PdfLine]] = []
        current: list[PdfLine] = []
        for line in lines:
            parts = self._wrap_preserve(line.text, self.max_chars)
            for part in parts:
                # Every wrapped fragment keeps the color of its source line.
                current.append(PdfLine(part, line.rgb))
                if len(current) >= self._lines_per_page():
                    chunks.append(current)
                    current = []
        if current:
            chunks.append(current)
        return chunks

    @staticmethod
    def _wrap_preserve(text: str, width: int) -> list[str]:
        """Wrap *text* to *width* characters, re-applying its leading
        indentation to every continuation line (so wrapped diff/source
        lines stay visually aligned)."""
        if len(text) <= width:
            return [text]

        leading_ws_len = len(text) - len(text.lstrip(" "))
        # Cap the indent at width-1 so at least one column remains usable.
        prefix = " " * min(leading_ws_len, width - 1)
        rest = text.lstrip(" ")
        wrapped = textwrap.wrap(
            rest,
            width=width - len(prefix),
            break_long_words=True,
            break_on_hyphens=False,
            drop_whitespace=False,
        )
        if not wrapped:
            # Defensive fallback (e.g. all-whitespace input): split manually.
            return [text[:width], prefix + text[width:]]
        out = [prefix + wrapped[0]]
        out.extend(prefix + w for w in wrapped[1:])
        return out

    def render(self, lines: list[PdfLine], out_path: Path) -> None:
        """Lay out *lines* and write a complete PDF document to *out_path*.

        Parent directories of *out_path* are created as needed.
        """
        # Guarantee at least one (possibly blank) page: a page tree with
        # /Count 0 is not a valid, viewable document.
        pages = self._chunk_lines(lines) or [[]]

        objects: list[bytes] = []

        def add_object(payload: bytes) -> int:
            # PDF object numbers are the 1-based positions in `objects`.
            objects.append(payload)
            return len(objects)

        font_obj = add_object(b"<< /Type /Font /Subtype /Type1 /BaseFont /Courier >>")

        content_objs: list[int] = []
        page_objs: list[int] = []

        # One content stream object per page.
        for page_lines in pages:
            stream = self._content_stream(page_lines).encode("utf-8")
            content_obj = add_object(
                b"<< /Length "
                + str(len(stream)).encode("ascii")
                + b" >>\nstream\n"
                + stream
                + b"\nendstream"
            )
            content_objs.append(content_obj)

        # Reserve an object number for the page tree now: each page must
        # reference its /Parent before the tree's payload can be built.
        pages_obj_placeholder = add_object(b"")
        pages_obj_num = pages_obj_placeholder

        for content_obj in content_objs:
            page_obj = add_object(
                (
                    f"<< /Type /Page /Parent {pages_obj_num} 0 R "
                    f"/MediaBox [0 0 {self.page_width} {self.page_height}] "
                    f"/Resources << /Font << /F1 {font_obj} 0 R >> >> "
                    f"/Contents {content_obj} 0 R >>"
                ).encode("ascii")
            )
            page_objs.append(page_obj)

        kids = " ".join(f"{n} 0 R" for n in page_objs)
        pages_obj = (
            f"<< /Type /Pages /Kids [{kids}] /Count {len(page_objs)} >>".encode("ascii")
        )
        # Back-patch the reserved slot with the real page-tree dictionary.
        objects[pages_obj_num - 1] = pages_obj

        catalog_obj = add_object(
            f"<< /Type /Catalog /Pages {pages_obj_num} 0 R >>".encode("ascii")
        )

        self._write_pdf(objects, catalog_obj, out_path)

    def _content_stream(self, page_lines: list[PdfLine]) -> str:
        """Build one page's content stream: a Tj per line, emitting a color
        (rg) operator only when the color differs from the previous line."""
        x = self.margin_left
        y = self.margin_top
        parts = [
            "BT",
            f"/F1 {self.font_size} Tf",
            f"{x} {y} Td",
        ]

        prev_rgb: tuple[float, float, float] | None = None
        first = True
        for line in page_lines:
            if first:
                first = False
            else:
                # Relative Td: move the text cursor down one leading.
                parts.append(f"0 -{self.leading} Td")

            if prev_rgb != line.rgb:
                r, g, b = line.rgb
                parts.append(f"{r:.3f} {g:.3f} {b:.3f} rg")
                prev_rgb = line.rgb

            parts.append(f"({escape_pdf_string(line.text)}) Tj")

        parts.append("ET")
        return "\n".join(parts)

    @staticmethod
    def _write_pdf(objects: list[bytes], root_obj_num: int, out_path: Path) -> None:
        """Serialize numbered *objects*, the xref table, and the trailer
        to *out_path*, creating parent directories as needed."""
        out = bytearray()
        # High-bit comment bytes after the header flag the file as binary
        # to transfer tools, per PDF convention.
        out += b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n"

        xref_positions = [0]  # slot 0 is the conventional free-list head
        for i, payload in enumerate(objects, start=1):
            xref_positions.append(len(out))
            out += f"{i} 0 obj\n".encode("ascii")
            out += payload + b"\nendobj\n"

        xref_start = len(out)
        out += f"xref\n0 {len(objects) + 1}\n".encode("ascii")
        # Each xref entry is exactly 20 bytes: 10-digit offset, 5-digit
        # generation, type flag, and a two-character terminator.
        out += b"0000000000 65535 f \n"
        for pos in xref_positions[1:]:
            out += f"{pos:010d} 00000 n \n".encode("ascii")

        out += (
            b"trailer\n"
            + f"<< /Size {len(objects) + 1} /Root {root_obj_num} 0 R >>\n".encode(
                "ascii"
            )
            + b"startxref\n"
            + str(xref_start).encode("ascii")
            + b"\n%%EOF\n"
        )

        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_bytes(out)
|
|
|
|
|
|
|
|
def diff_to_pdf_lines(old_root: Path, new_root: Path) -> list[PdfLine]:
    """Compare two directory trees and return a colored report as lines.

    The report has three sections: a header, a per-file summary
    (changed text/binary, added, removed — with sizes and short hashes
    for binaries and additions), and a unified diff for every changed
    text file.

    Fix: binary files that differ were previously read from disk twice
    (once while comparing, again while writing the summary); their
    sizes and hashes are now captured during the comparison pass.

    Args:
        old_root: Directory holding the old version.
        new_root: Directory holding the new version.

    Returns:
        PdfLine records ready for SimplePdf.render().
    """
    old_files = {p.relative_to(old_root) for p in old_root.rglob("*") if p.is_file()}
    new_files = {p.relative_to(new_root) for p in new_root.rglob("*") if p.is_file()}

    added = sorted(new_files - old_files)
    removed = sorted(old_files - new_files)
    common = sorted(old_files & new_files)

    changed_text: list[Path] = []
    # rel -> (old_size, old_hash, new_size, new_hash), captured while the
    # bytes are already in memory so the summary pass needs no re-reads.
    changed_bin: dict[Path, tuple[int, str, int, str]] = {}

    for rel in common:
        b1 = (old_root / rel).read_bytes()
        b2 = (new_root / rel).read_bytes()
        if b1 == b2:
            continue
        if is_probably_text(rel):
            changed_text.append(rel)
        else:
            changed_bin[rel] = (len(b1), sha256_short(b1), len(b2), sha256_short(b2))

    lines: list[PdfLine] = []

    def add_line(text: str, rgb: tuple[float, float, float] = (0, 0, 0)) -> None:
        # Helper so each section below reads as a sequence of emitted lines.
        lines.append(PdfLine(text, rgb))

    add_line(f"Diff: {old_root} -> {new_root}", (0.1, 0.1, 0.1))
    add_line("", (0, 0, 0))

    add_line("Summary", (0.1, 0.1, 0.6))
    add_line(f"- Changed text files: {len(changed_text)}", (0, 0, 0))
    for rel in changed_text:
        add_line(f"  - {rel}", (0, 0, 0))
    add_line(f"- Changed binary files: {len(changed_bin)}", (0, 0, 0))
    for rel, (old_size, old_hash, new_size, new_hash) in changed_bin.items():
        add_line(
            f"  - {rel} (old {old_size}B {old_hash}, new {new_size}B {new_hash})",
            (0, 0, 0),
        )
    add_line(f"- Added files: {len(added)}", (0, 0, 0))
    for rel in added:
        b = (new_root / rel).read_bytes()
        add_line(f"  - {rel} ({len(b)}B {sha256_short(b)})", (0, 0, 0))
    add_line(f"- Removed files: {len(removed)}", (0, 0, 0))
    for rel in removed:
        add_line(f"  - {rel}", (0, 0, 0))

    add_line("", (0, 0, 0))
    add_line("Unified diffs (text)", (0.1, 0.1, 0.6))
    add_line("", (0, 0, 0))

    def color_for_diff_line(s: str) -> tuple[float, float, float]:
        # Map unified-diff markers to colors: file headers gray, hunk
        # markers blue, additions green, deletions red, context black.
        if s.startswith(("diff ", "--- ", "+++ ")):
            return (0.2, 0.2, 0.2)
        if s.startswith("@@"):
            return (0.1, 0.1, 0.6)
        if s.startswith("+") and not s.startswith("+++"):
            return (0.0, 0.45, 0.0)
        if s.startswith("-") and not s.startswith("---"):
            return (0.75, 0.0, 0.0)
        return (0, 0, 0)

    for rel in changed_text:
        p1 = old_root / rel
        p2 = new_root / rel
        # errors="replace" keeps the diff usable even for mis-encoded text.
        old_lines = p1.read_text(encoding="utf-8", errors="replace").splitlines()
        new_lines = p2.read_text(encoding="utf-8", errors="replace").splitlines()
        add_line(f"File: {rel}", (0.2, 0.2, 0.2))
        diff_lines = difflib.unified_diff(
            old_lines,
            new_lines,
            fromfile=str(old_root / rel),
            tofile=str(new_root / rel),
            lineterm="",
            n=3,
        )
        for s in diff_lines:
            add_line(s, color_for_diff_line(s))
        add_line("", (0, 0, 0))

    return lines
|
|
|
|
|
|
|
|
def main() -> int:
    """CLI entry point: parse arguments, validate the two input
    directories, and write the diff PDF.

    Returns 0 on success; raises SystemExit with a message when either
    input path is not a directory.
    """
    parser = argparse.ArgumentParser(
        description="Generate a PDF containing unified diffs between two paper versions."
    )
    parser.add_argument("old_dir", type=Path)
    parser.add_argument("new_dir", type=Path)
    parser.add_argument(
        "-o",
        "--out",
        type=Path,
        default=Path("paper-sources/paper-301-vs-757-diff.pdf"),
        help="Output PDF path (default: paper-sources/paper-301-vs-757-diff.pdf)",
    )
    args = parser.parse_args()

    # Validate in argument order so the error always names the first
    # offending path.
    roots = [args.old_dir.resolve(), args.new_dir.resolve()]
    for root in roots:
        if not root.is_dir():
            raise SystemExit(f"Not a directory: {root}")

    report_lines = diff_to_pdf_lines(roots[0], roots[1])
    SimplePdf().render(report_lines, args.out)
    print(args.out)
    return 0
|
|
|
|
|
|
|
|
# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
|
|
|
|