Spaces:
Running
Running
| """Dataset-Maker - Gradio web app (HuggingFace Spaces ready). | |
| Upload a PDF -> each page is rendered to A4, torn into NON-OVERLAPPING fragments | |
| on a black background, and packaged as a ZIP with stitching ground truth. | |
| Performance: | |
| * Gradio `.queue()` caps concurrent requests for the 2-vCPU free tier. | |
| * A priority queue (src/queue_manager.py) orders page jobs cheap-first. | |
| * NumPy/SciPy vectorized partition; PNG-optimized export. | |
| """ | |
| from __future__ import annotations | |
| import gradio as gr | |
| from src import config, workspace | |
| from src.optimizer import encode_preview | |
| from src.packager import build_zip | |
| from src.pipeline import process_pdf, save_temp_pdf | |
| from src.tearing import verify_partition | |
def _resolve_theme(name: str):
    """Build the Gradio theme instance registered under ``name``.

    Unknown names resolve to the registry entry for ``config.DEFAULT_THEME``.
    When the registered theme class is missing from the installed Gradio
    (Ocean and Citrus only exist in Gradio 5+), ``gr.themes.Default`` is
    instantiated instead so startup does not crash on older builds.

    Args:
        name: Key into ``config.THEME_REGISTRY``.

    Returns:
        A theme object constructed with the registry entry's keyword arguments.
    """
    registry = config.THEME_REGISTRY
    # Looked up unconditionally, exactly like a ``dict.get`` default argument.
    fallback_entry = registry[config.DEFAULT_THEME]
    theme_cls_name, theme_kwargs = registry.get(name, fallback_entry)

    theme_cls = getattr(gr.themes, theme_cls_name, None)
    if not theme_cls:
        # Class not shipped by this Gradio version: degrade to the stock theme.
        theme_cls = gr.themes.Default
    return theme_cls(**theme_kwargs)
def available_themes() -> list[str]:
    """List the registry theme names whose class exists in the installed Gradio.

    Returns:
        Names from ``config.THEME_REGISTRY``, in registry order, for which
        ``gr.themes`` exposes the registered class name.
    """
    usable = []
    for theme_name, entry in config.THEME_REGISTRY.items():
        # Each registry entry is a (class name, kwargs) pair; only the class
        # name matters for the availability check.
        class_name, _unused_kwargs = entry
        if getattr(gr.themes, class_name, None) is not None:
            usable.append(theme_name)
    return usable
def _pieces_gallery(pages, max_pieces: int = 60):
    """Flatten the first torn pieces into preview-gallery items (downscaled).

    Pages are walked in order and each page's pieces in order; collection stops
    as soon as ``max_pieces`` previews have been gathered.

    Args:
        pages: Iterable of page objects exposing a ``pieces`` sequence whose
            items carry an ``rgb`` attribute.
        max_pieces: Upper bound on the number of previews returned. A value
            of zero or less yields an empty gallery.

    Returns:
        A list of ``(preview, caption)`` tuples, where ``preview`` is whatever
        ``encode_preview(piece.rgb, 256)`` returns and ``caption`` is
        ``"p<page number, 1-based>·<piece index, 0-based>"``.
    """
    # The cap is only checked after an append below, so without this guard a
    # non-positive cap would still encode and return one preview.
    if max_pieces <= 0:
        return []

    previews = []
    for page_no, page in enumerate(pages, start=1):
        for piece_idx, piece in enumerate(page.pieces):
            previews.append(
                (encode_preview(piece.rgb, 256), f"p{page_no}·{piece_idx}")
            )
            if len(previews) >= max_pieces:
                return previews
    return previews
def generate(
    pdf_file,
    dpi: int,
    n_pieces: int,
    noise_strength: float,
    noise_scale: float,
    lossy: bool,
    seed: int,
    progress=gr.Progress(),
):
    """Main event handler: PDF -> (gallery, zip path, status).

    Args:
        pdf_file: Filesystem path of the uploaded PDF, or ``None`` when nothing
            has been uploaded.
        dpi: Render DPI; coerced with ``int()`` before use.
        n_pieces: Pieces per page; coerced with ``int()``.
        noise_strength: Tear jaggedness; coerced with ``float()``.
        noise_scale: Tear smoothness; coerced with ``float()``.
        lossy: Forwarded unchanged to ``build_zip``.
        seed: Master seed; coerced with ``int()``.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        ``(gallery_items, zip_path, status_text)`` in that order.

    Raises:
        gr.Error: If no PDF was uploaded, the upload exceeds
            ``config.MAX_UPLOAD_MB``, the PDF produced no pages, or the
            first page fails the partition check.
    """
    if pdf_file is None:
        raise gr.Error("Upload a PDF first.")

    # Drop temp files from the previous run so disk stays at steady state
    # (~1 ZIP) instead of growing every generate. HF free-tier disk is small.
    workspace.clear_all()

    progress(0.02, desc="Reading PDF…")
    max_bytes = int(config.MAX_UPLOAD_MB * 1024 * 1024)
    with open(pdf_file, "rb") as fh:
        # Read at most one byte past the limit: enough to detect an oversized
        # upload without pulling an arbitrarily large file into memory.
        pdf_bytes = fh.read(max_bytes + 1)
    if len(pdf_bytes) > max_bytes:
        raise gr.Error(f"PDF exceeds {config.MAX_UPLOAD_MB} MB limit.")

    tmp_pdf = save_temp_pdf(pdf_bytes)
    try:
        pages = process_pdf(
            tmp_pdf,
            dpi=int(dpi),
            n_pieces=int(n_pieces),
            noise_strength=float(noise_strength),
            noise_scale=float(noise_scale),
            master_seed=int(seed),
            # Map the pipeline's 0..1 fraction into the 5%..85% band of the bar.
            progress=lambda f, m: progress(0.05 + 0.8 * f, desc=m),
        )
    finally:
        # On success the input PDF is fully rendered into `pages`; on failure
        # it is useless. Either way free it now instead of leaving it on disk.
        workspace.discard(tmp_pdf)

    # A PDF that yields no pages would otherwise crash on pages[0] below.
    if len(pages) == 0:
        raise gr.Error("PDF contains no pages.")

    # Verify the no-overlap invariant on the first page (sanity gate).
    report = verify_partition(pages[0])
    if not report["is_partition"]:
        raise gr.Error(
            f"Partition check failed: overlap={report['max_overlap']}, "
            f"uncovered={report['uncovered_pixels']}"
        )

    progress(0.9, desc="Packaging ZIP…")
    zip_bytes, manifest = build_zip(
        pages,
        source_name="upload.pdf",
        dpi=int(dpi),
        noise_strength=float(noise_strength),
        noise_scale=float(noise_scale),
        lossy=lossy,
    )
    out_path = workspace.new_temp(suffix="_dataset.zip")
    with open(out_path, "wb") as fh:
        fh.write(zip_bytes)

    status = (
        f"✅ {len(pages)} pages · {manifest['total_pieces']} pieces · "
        f"no-overlap verified (max_overlap={report['max_overlap']}, "
        f"uncovered={report['uncovered_pixels']})"
    )
    progress(1.0, desc="Done")
    # Order: gallery, zip, status. Status is consumed by a chained follow-up
    # event with progress hidden, so no progress bar paints over the status
    # text strip.
    return _pieces_gallery(pages), out_path, status
def clear_all():
    """Clear the workspace's tracked temp files and blank the UI outputs.

    Returns:
        A 4-tuple matching the click handler's outputs order
        ``(pdf_in, status, gallery, zip_out)``: the three component slots are
        reset to ``None`` and the status slot reports how many files
        ``workspace.clear_all()`` said it removed.
    """
    n_removed = workspace.clear_all()
    return (
        None,  # pdf_in
        f"🧹 Cleared {n_removed} temp file(s). Upload a PDF and hit **Generate**.",
        None,  # gallery
        None,  # zip_out
    )
# Custom CSS handed to `gr.Blocks(css=...)` in `build_ui`. The `#piece-gallery`
# selector matches the `elem_id` given to the preview Gallery there.
#
# Purpose: cap the preview gallery and scroll *inside* it. Gradio 4.44's
# Gallery `height` caps the root but the inner thumbnail grid (.grid-wrap)
# overflows the page instead of scrolling, so force overflow on the inner
# container directly.
_GALLERY_CSS = """
#piece-gallery { max-height: 70vh; }
#piece-gallery .grid-wrap,
#piece-gallery .thumbnails {
max-height: 70vh;
overflow-y: auto;
}
"""
def build_ui(theme_name: str = config.DEFAULT_THEME) -> gr.Blocks:
    """Assemble the Gradio Blocks app: controls on the left, results on the right.

    Args:
        theme_name: Registry key passed to ``_resolve_theme``.

    Returns:
        The constructed ``gr.Blocks`` instance with the Generate and Clear
        handlers wired up. It is neither queued nor launched here.
    """
    with gr.Blocks(
        theme=_resolve_theme(theme_name),
        title="Dataset-Maker · Torn-page stitching dataset",
        css=_GALLERY_CSS,
    ) as demo:
        gr.Markdown(
            "# 🧩 Dataset-Maker\n"
            "Tear PDF pages into **non-overlapping** torn fragments for "
            "image-stitching datasets. Every pixel lands in exactly one piece - "
            "guaranteed by a domain-warped Voronoi partition."
        )
        with gr.Row():
            # Left column: upload and generation controls.
            with gr.Column(scale=1):
                pdf_in = gr.File(label="PDF", file_types=[".pdf"], type="filepath")
                n_pieces = gr.Slider(
                    config.MIN_PIECES, config.MAX_PIECES, config.DEFAULT_PIECES,
                    step=1, label="Pieces per page",
                )
                # NOTE(review): the accordion is assumed to hold every control
                # from the noise sliders through the lossy checkbox - confirm
                # against the intended layout.
                with gr.Accordion("Tearing controls", open=False):
                    noise_strength = gr.Slider(
                        0, 80, config.DEFAULT_NOISE_STRENGTH, step=1,
                        label="Tear jaggedness (px)",
                    )
                    noise_scale = gr.Slider(
                        8, 200, config.DEFAULT_NOISE_SCALE, step=1,
                        label="Tear smoothness (wavelength px)",
                    )
                    dpi = gr.Slider(
                        config.MIN_DPI, config.MAX_DPI, config.DEFAULT_DPI, step=1,
                        label="Render DPI",
                    )
                    seed = gr.Number(value=0, precision=0, label="Master seed")
                    lossy = gr.Checkbox(
                        value=False, label="Lossy palette PNG (smaller ZIP)"
                    )
                with gr.Row():
                    run = gr.Button("Generate dataset", variant="primary")
                    clear = gr.Button("Clear all", variant="secondary")
            # Right column: status line, preview gallery, download.
            with gr.Column(scale=2):
                status = gr.Markdown("Upload a PDF and hit **Generate**.")
                gallery = gr.Gallery(
                    label="Torn pieces (preview)", columns=6, height=420,
                    object_fit="contain", elem_id="piece-gallery",
                )
                zip_out = gr.File(label="Download dataset (.zip)")

        # Status flows through a State, then into the Markdown via a hidden-
        # progress follow-up event - keeps the progress bars on gallery + zip
        # only, not over the thin status text (4.44 has no per-output
        # show_progress).
        status_state = gr.State("")
        run.click(
            generate,
            inputs=[pdf_in, dpi, n_pieces, noise_strength, noise_scale, lossy, seed],
            outputs=[gallery, zip_out, status_state],
            concurrency_limit=config.WORKER_CONCURRENCY,  # heavy job throttle
        ).success(
            # `.success()` rather than `.then()`: a chained `.then()` also
            # fires when `generate` raises, which would copy the stale State
            # value (empty, or an earlier run's success line) into the status
            # text after a failed run.
            lambda s: s,
            inputs=status_state,
            outputs=status,
            show_progress="hidden",
        )
        clear.click(
            clear_all,
            inputs=None,
            outputs=[pdf_in, status, gallery, zip_out],
        )
        gr.Markdown(
            "Pieces sit on black backgrounds; `manifest.json` carries each "
            "piece's `(x, y)` offset = the stitching label."
        )
    return demo
# Build the app at import time so `demo` is available as a module-level name.
demo = build_ui()

# Enable Gradio's request queue: bound the number of waiting requests and set
# the default per-event concurrency from the shared config.
demo.queue(
    default_concurrency_limit=config.WORKER_CONCURRENCY,
    max_size=config.QUEUE_MAX_SIZE,
)

# Only start the server when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(share=True)