File size: 3,342 Bytes
a8784d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""Package torn pieces + stitching ground-truth into a downloadable ZIP.

Layout inside the archive:
    pieces/page_0001/piece_000.png ...
    manifest.json          # global summary + per-page adjacency + per-piece placement (x, y, w, h)
    README.txt             # how to reassemble

The manifest IS the dataset label: each piece's (x, y) offset on its page is the
exact stitching target. Reassembling = paste every piece at its offset.
"""
from __future__ import annotations

import io
import json
import zipfile
from datetime import datetime, timezone

from .optimizer import encode_piece
from .tearing import TornPage


def build_zip(
    pages: list[TornPage],
    *,
    source_name: str,
    dpi: int,
    noise_strength: float,
    noise_scale: float,
    lossy: bool,
) -> tuple[bytes, dict]:
    """Pack every torn piece and its placement labels into an in-memory ZIP.

    Args:
        pages: Torn pages to export. Each page is read for ``width``,
            ``height``, ``adjacency`` and ``pieces``; each piece is read for
            ``rgb``, ``mask``, ``x`` and ``y``.
        source_name: Stored in the manifest under ``"source"``.
        dpi: Stored in the manifest under ``"dpi"``.
        noise_strength: Stored in the manifest under ``"noise_strength"``.
        noise_scale: Stored in the manifest under ``"noise_scale"``.
        lossy: Stored in the manifest and forwarded to ``encode_piece``.

    Returns:
        A ``(zip_bytes, manifest)`` pair. ``manifest`` is the same dictionary
        that is serialized to ``manifest.json`` inside the archive.
    """
    # Timestamp is taken up front, before any encoding work happens.
    created = datetime.now(timezone.utc).isoformat()

    page_entries: list[dict] = []
    piece_total = 0

    archive = io.BytesIO()
    with zipfile.ZipFile(archive, "w", compression=zipfile.ZIP_DEFLATED) as bundle:
        # Folder names are 1-based; the manifest "index" stays 0-based.
        for page_no, torn in enumerate(pages, start=1):
            folder = f"pieces/page_{page_no:04d}"
            page_width = torn.width
            page_height = torn.height
            # Undirected neighbor pairs [i, j]: fragments sharing a torn
            # border. Listed pairs are positives for pairwise/graph stitching
            # models; every unlisted pair is a negative.
            neighbours = [[int(a), int(b)] for a, b in torn.adjacency]

            placements = []
            for piece_no, fragment in enumerate(torn.pieces):
                member = f"{folder}/piece_{piece_no:03d}.png"
                bundle.writestr(member, encode_piece(fragment.rgb, lossy=lossy))
                # mask.shape is unpacked as (h, w), i.e. height first.
                piece_h, piece_w = fragment.mask.shape
                placements.append(
                    {
                        "file": member,
                        "x": fragment.x,
                        "y": fragment.y,
                        "w": piece_w,
                        "h": piece_h,
                    }
                )

            piece_total += len(torn.pieces)
            page_entries.append(
                {
                    "index": page_no - 1,
                    "width": page_width,
                    "height": page_height,
                    "adjacency": neighbours,
                    "pieces": placements,
                }
            )

        manifest = {
            "generator": "Dataset-Maker",
            "created_utc": created,
            "source": source_name,
            "dpi": dpi,
            "noise_strength": noise_strength,
            "noise_scale": noise_scale,
            "lossy": lossy,
            "pages": page_entries,
            "total_pieces": piece_total,
        }
        bundle.writestr("manifest.json", json.dumps(manifest, indent=2))
        bundle.writestr("README.txt", _README)

    # The archive is finalized once the context manager exits.
    return archive.getvalue(), manifest


# Plain-text README that build_zip writes into the archive as "README.txt".
# It describes the export and shows an illustrative reassembly snippet.
# NOTE(review): the snippet's `mask=...` is a placeholder, so the reader must
# supply a real mask (the text suggests the non-black pixels) before running it.
_README = """Dataset-Maker export
=====================
Each page was torn into NON-OVERLAPPING fragments (a strict partition: every
pixel belongs to exactly one piece). Fragments sit on a black background.

Each page also carries `adjacency`: a list of [i, j] piece-index pairs that
share a torn border (4-connectivity, undirected, i < j). Use as positive pairs
for pairwise/graph-based stitching models; any pair not listed is a negative.

To reassemble a page (stitching ground truth):
    import json
    from PIL import Image
    m = json.load(open("manifest.json"))
    for page in m["pages"]:
        canvas = Image.new("RGB", (page["width"], page["height"]))
        for p in page["pieces"]:
            piece = Image.open(p["file"])
            canvas.paste(piece, (p["x"], p["y"]), mask=...)  # non-black pixels
        canvas.save(f"reassembled_{page['index']}.png")
"""