Dataset-Maker / src /packager.py
arittrabag's picture
Deploy Dataset-Maker: torn-page non-overlapping dataset generator
a8784d9 verified
"""Package torn pieces + stitching ground-truth into a downloadable ZIP.
Layout inside the archive:
pieces/page_0001/piece_000.png ...
manifest.json # global summary + per-piece placement (x, y, w, h)
README.txt # how to reassemble
The manifest IS the dataset label: each piece's (x, y) offset on its page is the
exact stitching target. Reassembling = paste every piece at its offset.
"""
from __future__ import annotations
import io
import json
import zipfile
from datetime import datetime, timezone
from .optimizer import encode_piece
from .tearing import TornPage
def build_zip(
    pages: list[TornPage],
    *,
    source_name: str,
    dpi: int,
    noise_strength: float,
    noise_scale: float,
    lossy: bool,
) -> tuple[bytes, dict]:
    """Build the dataset archive in memory and return it with its manifest.

    The archive holds one encoded image per piece under
    ``pieces/page_NNNN/piece_KKK.png``, a ``manifest.json`` describing every
    page and piece, and a ``README.txt``.

    Args:
        pages: Torn pages to export, in output order.
        source_name: Recorded verbatim under the manifest's ``"source"`` key.
        dpi: Recorded verbatim under ``"dpi"``.
        noise_strength: Recorded verbatim under ``"noise_strength"``.
        noise_scale: Recorded verbatim under ``"noise_scale"``.
        lossy: Recorded under ``"lossy"`` and forwarded to ``encode_piece``
            for every piece.

    Returns:
        ``(zip_bytes, manifest)`` where ``manifest`` is the same dictionary
        that was serialized into ``manifest.json``.
    """
    manifest = {
        "generator": "Dataset-Maker",
        "created_utc": datetime.now(timezone.utc).isoformat(),
        "source": source_name,
        "dpi": dpi,
        "noise_strength": noise_strength,
        "noise_scale": noise_scale,
        "lossy": lossy,
        "pages": [],
        "total_pieces": 0,
    }
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for pi, page in enumerate(pages):
            # Directory names are 1-based; the manifest "index" stays 0-based.
            pdir = f"pieces/page_{pi + 1:04d}"
            page_entry = {
                "index": pi,
                # Coerced with int() like the adjacency indices below, so a
                # NumPy integer scalar (which json cannot serialize) does not
                # make json.dumps raise TypeError.
                "width": int(page.width),
                "height": int(page.height),
                # Undirected neighbor pairs (piece-index i, j) = which fragments
                # share a torn border. Positive pairs for pairwise/graph stitching
                # models; non-listed pairs are negatives.
                "adjacency": [[int(i), int(j)] for i, j in page.adjacency],
                "pieces": [],
            }
            for k, piece in enumerate(page.pieces):
                fname = f"{pdir}/piece_{k:03d}.png"
                zf.writestr(fname, encode_piece(piece.rgb, lossy=lossy))
                # The piece's extent is taken from its mask: (rows, cols).
                h, w = piece.mask.shape
                page_entry["pieces"].append(
                    {
                        "file": fname,
                        # Same int() coercion as above: keeps the placement
                        # offsets JSON-serializable.
                        "x": int(piece.x),
                        "y": int(piece.y),
                        "w": int(w),
                        "h": int(h),
                    }
                )
            manifest["total_pieces"] += len(page.pieces)
            manifest["pages"].append(page_entry)
        # Written last so the manifest reflects every page and piece above.
        zf.writestr("manifest.json", json.dumps(manifest, indent=2))
        zf.writestr("README.txt", _README)
    # Read the buffer only after the ZipFile is closed; closing is what
    # writes the archive's central directory.
    return buf.getvalue(), manifest
# Text written verbatim as README.txt into every archive produced by build_zip.
# NOTE(review): the `mask=...` in the sample snippet below is a placeholder for
# a "non-black pixels" mask, so the snippet is illustrative rather than
# copy-paste runnable.
_README = """Dataset-Maker export
=====================
Each page was torn into NON-OVERLAPPING fragments (a strict partition: every
pixel belongs to exactly one piece). Fragments sit on a black background.
Each page also carries `adjacency`: a list of [i, j] piece-index pairs that
share a torn border (4-connectivity, undirected, i < j). Use as positive pairs
for pairwise/graph-based stitching models; any pair not listed is a negative.
To reassemble a page (stitching ground truth):
import json
from PIL import Image
m = json.load(open("manifest.json"))
for page in m["pages"]:
canvas = Image.new("RGB", (page["width"], page["height"]))
for p in page["pieces"]:
piece = Image.open(p["file"])
canvas.paste(piece, (p["x"], p["y"]), mask=...) # non-black pixels
canvas.save(f"reassembled_{page['index']}.png")
"""