File size: 8,289 Bytes
a8784d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
"""Dataset-Maker - Gradio web app (HuggingFace Spaces ready).

Upload a PDF -> each page is rendered to A4, torn into NON-OVERLAPPING fragments
on a black background, and packaged as a ZIP with stitching ground truth.

Performance:
  * Gradio `.queue()` caps concurrent requests for the 2-vCPU free tier.
  * A priority queue (src/queue_manager.py) orders page jobs cheap-first.
  * NumPy/SciPy vectorized partition; PNG-optimized export.
"""
from __future__ import annotations

import gradio as gr

from src import config, workspace
from src.optimizer import encode_preview
from src.packager import build_zip
from src.pipeline import process_pdf, save_temp_pdf
from src.tearing import verify_partition


def _resolve_theme(name: str):
    """Build the Gradio theme object registered under ``name``.

    The registry entry is a ``(class name, kwargs)`` pair. Unknown names use
    the entry registered for ``config.DEFAULT_THEME``. If the installed Gradio
    does not expose the class (Ocean and Citrus only exist in Gradio 5+),
    ``gr.themes.Default`` is instantiated with the same kwargs instead of
    crashing at startup.
    """
    registry = config.THEME_REGISTRY
    default_entry = registry[config.DEFAULT_THEME]
    theme_cls_name, theme_kwargs = registry.get(name, default_entry)

    theme_cls = getattr(gr.themes, theme_cls_name, None)
    if not theme_cls:
        # Class missing from this Gradio build: degrade to the stock theme.
        theme_cls = getattr(gr.themes, "Default")
    return theme_cls(**theme_kwargs)


def available_themes() -> list[str]:
    """List the registry theme names whose class exists in this Gradio build."""
    usable: list[str] = []
    for theme_name, entry in config.THEME_REGISTRY.items():
        # Each entry is a (class name, kwargs) pair; only the class name matters.
        class_name, _kwargs = entry
        if getattr(gr.themes, class_name, None) is not None:
            usable.append(theme_name)
    return usable


def _pieces_gallery(pages, max_pieces: int = 60):
    """Collect downscaled previews of the first torn pieces for the gallery.

    Pages are walked in order and each piece becomes an
    ``(encoded preview, caption)`` pair, captioned ``p<page>·<piece>`` with a
    1-based page number and 0-based piece index. The cap is tested after each
    append, and collection stops as soon as ``max_pieces`` items are gathered.
    """
    previews = []
    for page_no, page in enumerate(pages, start=1):
        for piece_no, piece in enumerate(page.pieces):
            thumb = encode_preview(piece.rgb, 256)
            previews.append((thumb, f"p{page_no}·{piece_no}"))
            if len(previews) >= max_pieces:
                return previews
    return previews


def generate(
    pdf_file,
    dpi: int,
    n_pieces: int,
    noise_strength: float,
    noise_scale: float,
    lossy: bool,
    seed: int,
    progress=gr.Progress(),
):
    """Main event handler: PDF -> (gallery, zip path, status).

    Args:
        pdf_file: Filesystem path of the uploaded PDF, or None if nothing
            was uploaded.
        dpi: Render DPI (cast to int before use).
        n_pieces: Fragments per page (cast to int before use).
        noise_strength: Tear noise strength (cast to float before use).
        noise_scale: Tear noise scale (cast to float before use).
        lossy: Forwarded to ``build_zip`` unchanged.
        seed: Master seed (cast to int); None is treated as 0.
        progress: Gradio progress tracker used to report stage progress.

    Returns:
        A 3-tuple ``(gallery items, path of the written ZIP, status text)``,
        in that order.

    Raises:
        gr.Error: If no PDF was uploaded, the upload exceeds
            ``config.MAX_UPLOAD_MB``, the PDF produced no pages, or the
            partition check on the first page fails.
    """
    import os  # local import: only needed for the pre-read size check

    if pdf_file is None:
        raise gr.Error("Upload a PDF first.")

    # Drop temp files from the previous run so disk stays at steady state
    # (~1 ZIP) instead of growing every generate. HF free-tier disk is small.
    workspace.clear_all()

    progress(0.02, desc="Reading PDF…")
    # Check the size on disk first so an oversized upload is rejected without
    # being pulled into memory.
    if os.path.getsize(pdf_file) > config.MAX_UPLOAD_MB * 1024 * 1024:
        raise gr.Error(f"PDF exceeds {config.MAX_UPLOAD_MB} MB limit.")
    with open(pdf_file, "rb") as fh:
        pdf_bytes = fh.read()

    # NOTE(review): an emptied Number field is expected to arrive as None;
    # fall back to 0, the default value of the seed input.
    master_seed = 0 if seed is None else int(seed)

    tmp_pdf = save_temp_pdf(pdf_bytes)
    try:
        pages = process_pdf(
            tmp_pdf,
            dpi=int(dpi),
            n_pieces=int(n_pieces),
            noise_strength=float(noise_strength),
            noise_scale=float(noise_scale),
            master_seed=master_seed,
            progress=lambda f, m: progress(0.05 + 0.8 * f, desc=m),
        )
    finally:
        # The input PDF is no longer needed once rendering finished or failed;
        # discard it on both paths so a failed run does not leave it behind.
        workspace.discard(tmp_pdf)

    # Guard the pages[0] access below: report an empty result as a user-facing
    # error instead of an IndexError.
    if len(pages) == 0:
        raise gr.Error("PDF has no pages to process.")

    # Verify the no-overlap invariant on the first page (sanity gate).
    report = verify_partition(pages[0])
    if not report["is_partition"]:
        raise gr.Error(
            f"Partition check failed: overlap={report['max_overlap']}, "
            f"uncovered={report['uncovered_pixels']}"
        )

    progress(0.9, desc="Packaging ZIP…")
    zip_bytes, manifest = build_zip(
        pages,
        source_name="upload.pdf",
        dpi=int(dpi),
        noise_strength=float(noise_strength),
        noise_scale=float(noise_scale),
        lossy=lossy,
    )
    out_path = workspace.new_temp(suffix="_dataset.zip")
    with open(out_path, "wb") as fh:
        fh.write(zip_bytes)

    status = (
        f"✅ {len(pages)} pages · {manifest['total_pieces']} pieces · "
        f"no-overlap verified (max_overlap={report['max_overlap']}, "
        f"uncovered={report['uncovered_pixels']})"
    )
    progress(1.0, desc="Done")
    # Order: gallery, zip, status. Status is consumed by a chained follow-up
    # event with progress hidden, so no progress bar paints over the status
    # text strip.
    return _pieces_gallery(pages), out_path, status


def clear_all():
    """Purge the workspace's temp files and blank out the UI outputs.

    Returns one value per wired output, in the order
    ``(pdf_in, status, gallery, zip_out)``: the three widgets are reset to
    None and the status line reports how many files were removed.
    """
    n_removed = workspace.clear_all()
    message = f"🧹 Cleared {n_removed} temp file(s). Upload a PDF and hit **Generate**."
    return (None, message, None, None)


# Stylesheet for the preview gallery (element id "piece-gallery"): limit its
# height and make it scroll internally. In Gradio 4.44 the Gallery `height`
# only caps the root element; the inner thumbnail grid (.grid-wrap) spills
# past the page instead of scrolling, so overflow is set on the inner
# containers directly.
_GALLERY_CSS = (
    "\n"
    "#piece-gallery { max-height: 70vh; }\n"
    "#piece-gallery .grid-wrap,\n"
    "#piece-gallery .thumbnails {\n"
    "    max-height: 70vh;\n"
    "    overflow-y: auto;\n"
    "}\n"
)


def build_ui(theme_name: str = config.DEFAULT_THEME) -> gr.Blocks:
    """Assemble the Dataset-Maker Blocks app and wire its events.

    Layout: a left column with the PDF upload, the tearing controls and the
    Generate / Clear buttons; a right column with the status line, the piece
    preview gallery and the ZIP download.

    Args:
        theme_name: Registry key of the theme to apply, resolved through
            ``_resolve_theme``.

    Returns:
        The constructed ``gr.Blocks`` app; queueing and launching are left
        to the caller.
    """
    with gr.Blocks(
        theme=_resolve_theme(theme_name),
        title="Dataset-Maker · Torn-page stitching dataset",
        css=_GALLERY_CSS,
    ) as demo:
        gr.Markdown(
            "# 🧩 Dataset-Maker\n"
            "Tear PDF pages into **non-overlapping** torn fragments for "
            "image-stitching datasets. Every pixel lands in exactly one piece - "
            "guaranteed by a domain-warped Voronoi partition."
        )
        with gr.Row():
            with gr.Column(scale=1):
                pdf_in = gr.File(label="PDF", file_types=[".pdf"], type="filepath")
                n_pieces = gr.Slider(
                    config.MIN_PIECES, config.MAX_PIECES, config.DEFAULT_PIECES,
                    step=1, label="Pieces per page",
                )
                with gr.Accordion("Tearing controls", open=False):
                    noise_strength = gr.Slider(
                        0, 80, config.DEFAULT_NOISE_STRENGTH, step=1,
                        label="Tear jaggedness (px)",
                    )
                    noise_scale = gr.Slider(
                        8, 200, config.DEFAULT_NOISE_SCALE, step=1,
                        label="Tear smoothness (wavelength px)",
                    )
                    dpi = gr.Slider(
                        config.MIN_DPI, config.MAX_DPI, config.DEFAULT_DPI, step=1,
                        label="Render DPI",
                    )
                    seed = gr.Number(value=0, precision=0, label="Master seed")
                    lossy = gr.Checkbox(
                        value=False, label="Lossy palette PNG (smaller ZIP)"
                    )
                with gr.Row():
                    run = gr.Button("Generate dataset", variant="primary")
                    clear = gr.Button("Clear all", variant="secondary")
            with gr.Column(scale=2):
                status = gr.Markdown("Upload a PDF and hit **Generate**.")
                gallery = gr.Gallery(
                    label="Torn pieces (preview)", columns=6, height=420,
                    object_fit="contain", elem_id="piece-gallery",
                )
                zip_out = gr.File(label="Download dataset (.zip)")

        # Status flows through a State, then into the Markdown via a hidden-
        # progress follow-up event — keeps the progress bars on gallery + zip
        # only, not over the thin status text (4.44 has no per-output
        # show_progress).
        status_state = gr.State("")

        # The follow-up is .success() rather than .then(): .then() also fires
        # after a failed run and would copy the stale State (empty on a first
        # run, or an old success message after "Clear all") over the status
        # line. .success() only runs when generate returned normally.
        run.click(
            generate,
            inputs=[pdf_in, dpi, n_pieces, noise_strength, noise_scale, lossy, seed],
            outputs=[gallery, zip_out, status_state],
            concurrency_limit=config.WORKER_CONCURRENCY,  # heavy job throttle
        ).success(
            lambda s: s,
            inputs=status_state,
            outputs=status,
            show_progress="hidden",
        )
        clear.click(
            clear_all,
            inputs=None,
            outputs=[pdf_in, status, gallery, zip_out],
        )
        gr.Markdown(
            "Pieces sit on black backgrounds; `manifest.json` carries each "
            "piece's `(x, y)` offset = the stitching label."
        )
    return demo


# Module-level app object, built with the default theme. The request queue is
# bounded by config.QUEUE_MAX_SIZE, and events that set no limit of their own
# use config.WORKER_CONCURRENCY. Relies on Blocks.queue() returning the Blocks
# instance itself.
demo = build_ui().queue(
    max_size=config.QUEUE_MAX_SIZE,
    default_concurrency_limit=config.WORKER_CONCURRENCY,
)

if __name__ == "__main__":
    # NOTE(review): share=True requests a public share link when run as a
    # script — confirm that is wanted for local runs and has no effect on
    # the hosted deployment.
    demo.launch(share=True)