Arjunvir Singh committed on
Commit
4e3af73
·
1 Parent(s): 8bd8e7a

Frontend: zip + multi-file uploads, progress with stage labels, chunk detail tab

Browse files

- Upload widget now accepts .zip and multiple files (gr.File with
file_count='multiple'). Server-side _extract_uploads_to_parse(...)
unzips one level, filters by extension, applies path-traversal guard,
caps at MAX_BATCH_DOCS (default 20) so a malicious zip can't fan out.
Per-file MAX_UPLOAD_BYTES / MAX_PAGE_COUNT guards still apply.
- For batch input, summary.batch lists every parsed doc with its
headline metrics + an aggregate block (total_chunks, mean_quality_score,
etc.). The Markdown tab still shows the first doc; full set is in the
artifact zip.
- gr.Progress callback wired into parse_uploaded_document with stage
labels: 'Validating uploads', 'Parsing N/M: <name>', 'Bundling
artifacts', 'Done'. Per-doc progress derived from list position.
- New 'Chunks' tab with a richer payload:
- Total / parent / child / table-linked / figure-linked / visual-context counts.
- Per-strategy block: count, token_count_min/median/max, 3 sample
chunks with previews truncated to 240 chars.
- The legacy chunking plan (strategy ladder + reasons) is nested at
chunking_payload.plan.
- UI top-level instructions explain the upload modes and per-file caps.

4 new tests cover: zip extraction with two markdown docs (assert batch
metadata + aggregate), multi-file list upload, mixed-content zip
(.exe filtered out, .md preserved), and chunk-detail payload shape.

Test count: 254/254.

Files changed (2) hide show
  1. app.py +346 -43
  2. tests/test_app.py +117 -0
app.py CHANGED
@@ -5,8 +5,9 @@ from __future__ import annotations
5
  import os
6
  import shutil
7
  import tempfile
 
8
  from pathlib import Path
9
- from typing import Any
10
 
11
  try:
12
  import gradio as gr
@@ -45,6 +46,12 @@ LIVE_GPU_CONFIG = ROOT / "configs" / "live_gpu_repair.yaml"
45
  # Spaces or tighten further for public ones.
46
  MAX_UPLOAD_BYTES = int(os.environ.get("ZSGDP_MAX_UPLOAD_BYTES", str(50 * 1024 * 1024))) # 50 MB
47
  MAX_PAGE_COUNT = int(os.environ.get("ZSGDP_MAX_PAGE_COUNT", "200"))
 
 
 
 
 
 
48
 
49
 
50
  class UploadRejected(Exception):
@@ -128,57 +135,288 @@ def _empty_outputs(reason: str, source: Path | None, *, rejected: bool, runtime:
128
  return ("", summary, {}, {}, {}, runtime, [], {}, {}, None, [])
129
 
130
 
131
- def parse_uploaded_document(file_obj: Any, pipeline_mode: str):
132
- """Parse a document into Markdown, structured JSON, and retrieval-ready chunks.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
- Use when a user supplies a document (PDF, Markdown, plaintext, HTML) and
135
- wants either (a) the text reconstructed cleanly, (b) structured elements
136
- + tables + figures with bounding boxes, (c) chunks for downstream RAG, or
137
- (d) an audit trail showing which parsers ran and how the merger resolved
138
- conflicts. Output bundles include SHA-256-checksummed artifact manifests.
 
 
 
 
139
 
140
  Args:
141
- file_obj: Uploaded document. Max 50 MB and 200 pages by default
142
- (configurable per Space via ZSGDP_MAX_UPLOAD_BYTES /
143
- ZSGDP_MAX_PAGE_COUNT). Oversized uploads are rejected with a
144
- clear UI error before parsing starts.
145
- pipeline_mode: "Docling + PyMuPDF" runs Docling primary with PyMuPDF
146
- for the parser-disagreement signal; "Default lightweight" uses
147
- text + PyMuPDF only.
148
-
149
- Returns an 11-tuple matching the Gradio outputs: canonical markdown,
150
- summary JSON, quality report, parser metrics, chunking plan, GPU runtime
151
- status, planned GPU tasks, GPU task preflight report, artifact manifest
152
- validation, archive zip path, individual artifact file list.
153
  """
154
 
 
 
 
 
 
 
155
  if file_obj is None:
156
  return _empty_outputs("Upload a document first.", None, rejected=False, runtime={})
157
 
158
- source = Path(file_obj.name)
 
 
 
 
 
 
 
 
 
 
 
 
159
  work_dir = Path(tempfile.mkdtemp(prefix="zeroshotgpu_"))
160
- output_dir = work_dir / "parsed"
161
- config_path = _config_path_for_mode(pipeline_mode)
162
 
163
- try:
164
- _validate_upload(source)
165
- except UploadRejected as exc:
166
- _logger.warning(
167
- "space_upload_rejected",
168
- extra={"source_path": str(source), "reason": str(exc)},
169
- )
170
  runtime = runtime_status_for_mode(pipeline_mode)
171
- return _empty_outputs(str(exc), source, rejected=True, runtime=runtime)
 
 
 
 
 
172
 
173
- try:
174
- parsed = parse_document(source, output_dir, config_path=config_path)
175
- except Exception as exc: # pragma: no cover - surfaced in the Space UI.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  runtime = runtime_status_for_mode(pipeline_mode)
177
- return _empty_outputs(str(exc), source, rejected=False, runtime=runtime)
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
- artifact_validation = validate_artifact_manifest(output_dir)
180
- archive_path = shutil.make_archive(str(output_dir), "zip", output_dir)
181
- individual_files = _collect_artifact_files(output_dir)
182
  runtime = parsed.provenance.get("gpu_runtime", {})
183
  summary = {
184
  "doc_id": parsed.doc_id,
@@ -197,12 +435,41 @@ def parse_uploaded_document(file_obj: Any, pipeline_mode: str):
197
  "artifact_checked_count": artifact_validation.get("checked_count"),
198
  "individual_artifact_count": len(individual_files),
199
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  return (
201
  parsed.to_markdown(),
202
  summary,
203
  parsed.quality_report.to_dict(),
204
  parsed.provenance.get("parser_metrics", {}),
205
- parsed.provenance.get("chunking", {}),
206
  runtime,
207
  parsed.provenance.get("gpu_tasks", []),
208
  parsed.provenance.get("gpu_task_report", {}),
@@ -296,27 +563,63 @@ def run_benchmark_in_space() -> dict:
296
 
297
 
298
  with gr.Blocks(title="zeroshotGPU") as demo:
299
- gr.Markdown("# zeroshotGPU")
 
 
 
 
 
 
 
 
 
300
  with gr.Row():
301
- upload = gr.File(label="Document", file_types=[".pdf", ".md", ".txt", ".html"])
 
 
 
 
302
  with gr.Column():
303
  pipeline = gr.Dropdown(
304
  choices=["Docling + PyMuPDF", "Default lightweight", "Live GPU repair"],
305
  value="Docling + PyMuPDF",
306
  label="Pipeline",
307
- info="`Live GPU repair` enables repair.execute_gpu_escalations=true and dispatches malformed-table / OCR / figure / reading-order issues to Qwen2.5-VL on the configured GPU backend.",
308
  )
309
  parse_button = gr.Button("Parse", variant="primary")
310
  archive = gr.File(label="Artifacts (zip)")
311
  with gr.Tabs():
312
  with gr.Tab("Markdown"):
 
 
 
 
 
313
  markdown = gr.Markdown(label="Canonical Markdown")
314
  with gr.Tab("Run"):
 
 
 
 
 
315
  summary = gr.JSON(label="Summary")
316
  quality = gr.JSON(label="Quality Report")
317
  parser_metrics = gr.JSON(label="Parser Metrics")
318
- chunking = gr.JSON(label="Chunking Plan")
319
  artifact_validation = gr.JSON(label="Artifact Manifest Validation")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  with gr.Tab("Artifacts"):
321
  gr.Markdown(
322
  "Each top-level artifact is downloadable individually. "
 
5
  import os
6
  import shutil
7
  import tempfile
8
+ import zipfile
9
  from pathlib import Path
10
+ from typing import Any, Iterable
11
 
12
  try:
13
  import gradio as gr
 
46
  # Spaces or tighten further for public ones.
47
  MAX_UPLOAD_BYTES = int(os.environ.get("ZSGDP_MAX_UPLOAD_BYTES", str(50 * 1024 * 1024))) # 50 MB
48
  MAX_PAGE_COUNT = int(os.environ.get("ZSGDP_MAX_PAGE_COUNT", "200"))
49
+ # Cap on docs extracted from a single zip so a malicious archive can't
50
+ # fan out into thousands of parses. Each doc still goes through the
51
+ # per-file MAX_UPLOAD_BYTES / MAX_PAGE_COUNT guards.
52
+ MAX_BATCH_DOCS = int(os.environ.get("ZSGDP_MAX_BATCH_DOCS", "20"))
53
+
54
+ SUPPORTED_PARSE_EXTS = (".pdf", ".md", ".txt", ".html", ".htm")
55
 
56
 
57
  class UploadRejected(Exception):
 
135
  return ("", summary, {}, {}, {}, runtime, [], {}, {}, None, [])
136
 
137
 
138
def _build_chunk_detail(parsed) -> dict[str, Any]:
    """Summarise ``parsed.chunks`` for the UI's chunk-detail payload.

    The result holds document-wide counters, a per-strategy block (chunk
    count, min / median / max of ``token_count``, and up to three sample
    chunks whose text preview is limited to 240 characters), and the raw
    ``parsed.provenance["chunking"]`` plan under ``"plan"``.

    Args:
        parsed: Object exposing ``chunks`` (an iterable with a length) and a
            ``provenance`` mapping. Each chunk is read for ``strategy``,
            ``token_count``, ``text``, ``chunk_id``, ``page_start``,
            ``page_end``, ``section_path``, ``boundary_reason``,
            ``source_parser``, ``content_type``, ``parent_chunk_id``,
            ``table_ids``, ``figure_ids`` and ``requires_visual_context``.

    Returns:
        A dict containing only values taken from the chunks plus builtin
        containers built here.
    """

    def _preview(text: str) -> str:
        # Previews longer than 240 chars become 237 chars plus an ellipsis.
        trimmed = text.strip()
        return trimmed if len(trimmed) <= 240 else trimmed[:237] + "..."

    def _sample(chunk) -> dict[str, Any]:
        return {
            "chunk_id": chunk.chunk_id,
            "page_start": chunk.page_start,
            "page_end": chunk.page_end,
            "section_path": chunk.section_path,
            "boundary_reason": chunk.boundary_reason,
            "token_count": chunk.token_count,
            "source_parser": chunk.source_parser,
            "preview": _preview(chunk.text),
        }

    def _tally(predicate) -> int:
        return sum(1 for chunk in all_chunks if predicate(chunk))

    all_chunks = parsed.chunks

    # Group chunks by strategy, keeping their original relative order so the
    # samples are the first three chunks of each strategy.
    grouped: dict[str, list] = {}
    for chunk in all_chunks:
        grouped.setdefault(chunk.strategy, []).append(chunk)

    per_strategy: dict[str, dict[str, Any]] = {}
    for name in sorted(grouped):
        members = grouped[name]
        sizes = sorted(member.token_count for member in members)
        if sizes:
            # The "median" is the element at index len // 2 of the sorted
            # counts (the upper middle for an even number of chunks).
            smallest, middle, largest = sizes[0], sizes[len(sizes) // 2], sizes[-1]
        else:
            smallest = middle = largest = 0
        per_strategy[name] = {
            "count": len(members),
            "token_count_min": smallest,
            "token_count_median": middle,
            "token_count_max": largest,
            "samples": [_sample(member) for member in members[:3]],
        }

    return {
        "total_chunks": len(all_chunks),
        "parent_chunks": _tally(lambda c: c.content_type == "parent"),
        "child_chunks": _tally(lambda c: c.parent_chunk_id),
        "table_linked_chunks": _tally(lambda c: c.table_ids),
        "figure_linked_chunks": _tally(lambda c: c.figure_ids),
        "visual_context_required": _tally(lambda c: c.requires_visual_context),
        "strategies": per_strategy,
        "plan": parsed.provenance.get("chunking", {}),
    }
196
+
197
+
198
def _extract_uploads_to_parse(uploads: Iterable[Path], work_dir: Path) -> list[Path]:
    """Resolve a set of uploaded files (possibly zips) into individual docs.

    Each input is either:
      - A file whose extension is in SUPPORTED_PARSE_EXTS — kept as-is.
      - A .zip archive — supported members are extracted into a fresh
        temporary directory under ``work_dir`` and added to the list. Nested
        zips are not opened (one level only); members with other extensions,
        ``__MACOSX`` entries and dot-files are dropped.
      - Anything else — logged and skipped.

    The number of resolved docs is capped at MAX_BATCH_DOCS. The cap is
    enforced while extracting, so an archive with a huge member count is not
    written to disk in full before being truncated.

    Args:
        uploads: Paths of the uploaded files.
        work_dir: Existing directory under which per-zip extraction
            directories are created.

    Returns:
        At most MAX_BATCH_DOCS paths, in upload / archive order. Paths taken
        from archives are resolved paths inside the extraction directory.
    """

    resolved: list[Path] = []
    for upload in uploads:
        if len(resolved) >= MAX_BATCH_DOCS:
            break

        ext = upload.suffix.lower()
        if ext == ".zip":
            # Resolve once so containment checks compare like with like
            # (e.g. when the temp dir sits behind a symlink).
            extract_dir = Path(tempfile.mkdtemp(prefix="zsgdp_zip_", dir=work_dir)).resolve()
            try:
                with zipfile.ZipFile(upload) as zf:
                    for member in zf.namelist():
                        # Stop before extracting anything past the batch cap.
                        if len(resolved) >= MAX_BATCH_DOCS:
                            break
                        # Skip directories, unsupported extensions and nested zips.
                        if member.endswith("/"):
                            continue
                        member_lower = member.lower()
                        if not member_lower.endswith(SUPPORTED_PARSE_EXTS):
                            continue
                        # Skip macOS resource forks and hidden files at any depth.
                        basename = member.rsplit("/", 1)[-1]
                        if (
                            "__MACOSX" in member
                            or member_lower.startswith(".")
                            or basename.startswith(".")
                        ):
                            continue
                        # Path traversal guard: compare path components, not
                        # string prefixes — a prefix test would accept a
                        # sibling directory such as "<extract_dir>_evil".
                        target = (extract_dir / member).resolve()
                        try:
                            target.relative_to(extract_dir)
                        except ValueError:
                            continue
                        target.parent.mkdir(parents=True, exist_ok=True)
                        # NOTE(review): the uncompressed size is not bounded
                        # here; size limits are only applied by the caller
                        # after extraction — confirm that is acceptable for
                        # highly compressible (zip-bomb) members.
                        with zf.open(member) as source, open(target, "wb") as out:
                            shutil.copyfileobj(source, out)
                        resolved.append(target)
            except zipfile.BadZipFile:
                _logger.warning("space_zip_corrupt", extra={"path": str(upload)})
                continue
            except (RuntimeError, NotImplementedError) as exc:
                # zipfile raises these for encrypted members and unsupported
                # compression methods; keep what was extracted so far instead
                # of failing the whole request.
                _logger.warning(
                    "space_zip_unreadable",
                    extra={"path": str(upload), "reason": str(exc)},
                )
                continue
        elif ext in SUPPORTED_PARSE_EXTS:
            resolved.append(upload)
        else:
            _logger.info("space_upload_skipped", extra={"path": str(upload), "reason": "unsupported_extension"})

    return resolved[:MAX_BATCH_DOCS]
247
+
248
+
249
def _parse_one_doc(
    source: Path,
    output_dir: Path,
    pipeline_mode: str,
) -> dict[str, Any]:
    """Run the parser on one document and package the outcome as a record.

    No exception is caught here: whatever ``parse_document`` (or the manifest
    helpers) raise propagates to the caller, which is how the batch driver
    learns that this particular document failed.

    Args:
        source: Document to parse.
        output_dir: Directory handed to ``parse_document`` and then inspected
            for the artifact manifest and individual artifact files.
        pipeline_mode: UI pipeline label, mapped to a config path via
            ``_config_path_for_mode``.

    Returns:
        A dict of headline metrics. Keys starting with ``_`` carry the parsed
        document, the manifest validation result, the artifact file list and
        the output directory (as a string) for internal use.
    """

    document = parse_document(
        source,
        output_dir,
        config_path=_config_path_for_mode(pipeline_mode),
    )
    manifest_check = validate_artifact_manifest(output_dir)
    artifact_files = _collect_artifact_files(output_dir)

    # Public, JSON-friendly metrics first ...
    record: dict[str, Any] = {
        "source_path": str(source),
        "doc_id": document.doc_id,
        "file_type": document.file_type,
        "elements": len(document.elements),
        "tables": len(document.tables),
        "figures": len(document.figures),
        "chunks": len(document.chunks),
        "quality_score": document.quality_report.score,
        "blocking": document.quality_report.has_blocking_failures,
        "artifact_manifest_valid": manifest_check.get("valid"),
        "individual_artifact_count": len(artifact_files),
    }
    # ... then the underscore-prefixed internals, in the same key order.
    record["_parsed"] = document
    record["_artifact_validation"] = manifest_check
    record["_individual_files"] = artifact_files
    record["_output_dir"] = str(output_dir)
    return record
281
+
282
+
283
+ def parse_uploaded_document(file_obj: Any, pipeline_mode: str, progress: Any = None):
284
+ """Parse one or more documents into Markdown, structured JSON, and chunks.
285
+
286
+ Accepts either a single file or a list of files (Gradio's `file_count="multiple"`
287
+ semantics). `.zip` uploads are extracted on the server side and each
288
+ supported file inside is parsed; total docs are capped at
289
+ MAX_BATCH_DOCS (default 20) to bound the worst-case work per request.
290
 
291
+ For multi-doc inputs the Markdown tab shows the first document's
292
+ output; the Summary tab includes a `batch` block listing every doc's
293
+ headline metrics; the Artifacts zip contains every per-doc directory.
294
+
295
+ Use when a user supplies one or many documents and wants either
296
+ (a) the text reconstructed cleanly, (b) structured elements + tables
297
+ + figures with bounding boxes, (c) chunks for downstream RAG, or
298
+ (d) an audit trail showing which parsers ran and how the merger
299
+ resolved conflicts.
300
 
301
  Args:
302
+ file_obj: Uploaded file(s). Single `.pdf` / `.md` / `.txt` /
303
+ `.html`, or a `.zip` of those, or a list of any of the above.
304
+ Per-file caps of 50 MB and 200 pages apply (configurable via
305
+ ZSGDP_MAX_UPLOAD_BYTES / ZSGDP_MAX_PAGE_COUNT).
306
+ pipeline_mode: "Docling + PyMuPDF" / "Default lightweight" /
307
+ "Live GPU repair". The third dispatches malformed-table,
308
+ OCR-coverage, figure, and reading-order issues to the
309
+ configured GPU backend (Qwen2.5-VL by default).
310
+ progress: optional Gradio Progress object (auto-injected by the
311
+ Gradio click handler; leave None for direct API calls).
 
 
312
  """
313
 
314
+ if progress is None:
315
+ # When called via /gradio_api/call, no progress is wired; use a no-op
316
+ # so the function signature stays consistent.
317
+ def progress(value, *, desc=""): # type: ignore[no-redef]
318
+ return None
319
+
320
  if file_obj is None:
321
  return _empty_outputs("Upload a document first.", None, rejected=False, runtime={})
322
 
323
+ progress(0.0, desc="Validating uploads...")
324
+
325
+ # Normalise to a list of Path. Gradio passes a single FileData when
326
+ # file_count='single' and a list when 'multiple'.
327
+ if isinstance(file_obj, list):
328
+ upload_paths = [Path(item.name if hasattr(item, "name") else item) for item in file_obj if item is not None]
329
+ elif hasattr(file_obj, "name"):
330
+ upload_paths = [Path(file_obj.name)]
331
+ else:
332
+ upload_paths = [Path(str(file_obj))]
333
+ if not upload_paths:
334
+ return _empty_outputs("Upload a document first.", None, rejected=False, runtime={})
335
+
336
  work_dir = Path(tempfile.mkdtemp(prefix="zeroshotgpu_"))
337
+ docs_to_parse = _extract_uploads_to_parse(upload_paths, work_dir)
 
338
 
339
+ if not docs_to_parse:
 
 
 
 
 
 
340
  runtime = runtime_status_for_mode(pipeline_mode)
341
+ return _empty_outputs(
342
+ "No supported documents found in the upload (accepted: pdf/md/txt/html, optionally inside a zip).",
343
+ upload_paths[0],
344
+ rejected=True,
345
+ runtime=runtime,
346
+ )
347
 
348
+ # Per-file abuse guard.
349
+ for doc in docs_to_parse:
350
+ try:
351
+ _validate_upload(doc)
352
+ except UploadRejected as exc:
353
+ _logger.warning(
354
+ "space_upload_rejected",
355
+ extra={"source_path": str(doc), "reason": str(exc)},
356
+ )
357
+ runtime = runtime_status_for_mode(pipeline_mode)
358
+ return _empty_outputs(str(exc), doc, rejected=True, runtime=runtime)
359
+
360
+ progress(0.05, desc=f"Parsing {len(docs_to_parse)} document(s)...")
361
+
362
+ output_root = work_dir / "parsed"
363
+ output_root.mkdir(parents=True, exist_ok=True)
364
+ per_doc_results: list[dict[str, Any]] = []
365
+ used_names: set[str] = set()
366
+
367
+ for index, doc in enumerate(docs_to_parse, start=1):
368
+ # Stable per-doc subdir.
369
+ stem = doc.stem or f"doc_{index}"
370
+ candidate = stem
371
+ suffix = 2
372
+ while candidate in used_names:
373
+ candidate = f"{stem}_{suffix}"
374
+ suffix += 1
375
+ used_names.add(candidate)
376
+ doc_out = output_root / candidate
377
+
378
+ progress(
379
+ 0.05 + 0.85 * (index - 1) / max(1, len(docs_to_parse)),
380
+ desc=f"Parsing {index}/{len(docs_to_parse)}: {doc.name}",
381
+ )
382
+ try:
383
+ result = _parse_one_doc(doc, doc_out, pipeline_mode)
384
+ per_doc_results.append(result)
385
+ except Exception as exc: # pragma: no cover - surfaced in UI
386
+ _logger.warning(
387
+ "space_parse_failed",
388
+ extra={"source_path": str(doc), "error": str(exc)},
389
+ )
390
+ per_doc_results.append(
391
+ {
392
+ "source_path": str(doc),
393
+ "error": str(exc),
394
+ "doc_id": None,
395
+ "_parsed": None,
396
+ }
397
+ )
398
+
399
+ progress(0.92, desc="Bundling artifacts...")
400
+
401
+ # Pick the first successful parse as the primary doc shown in the UI.
402
+ successful = [r for r in per_doc_results if r.get("_parsed") is not None]
403
+ if not successful:
404
  runtime = runtime_status_for_mode(pipeline_mode)
405
+ first_error = next((r.get("error") for r in per_doc_results if r.get("error")), "All parses failed.")
406
+ return _empty_outputs(first_error, upload_paths[0], rejected=False, runtime=runtime)
407
+
408
+ primary = successful[0]
409
+ parsed = primary["_parsed"]
410
+ artifact_validation = primary["_artifact_validation"]
411
+ individual_files = primary["_individual_files"]
412
+
413
+ # If batch, the archive bundles the whole output_root; otherwise just the
414
+ # single doc's dir. Always returns a single zip path.
415
+ if len(per_doc_results) > 1:
416
+ archive_path = shutil.make_archive(str(output_root), "zip", output_root)
417
+ else:
418
+ archive_path = shutil.make_archive(str(Path(primary["_output_dir"])), "zip", primary["_output_dir"])
419
 
 
 
 
420
  runtime = parsed.provenance.get("gpu_runtime", {})
421
  summary = {
422
  "doc_id": parsed.doc_id,
 
435
  "artifact_checked_count": artifact_validation.get("checked_count"),
436
  "individual_artifact_count": len(individual_files),
437
  }
438
+
439
+ if len(per_doc_results) > 1:
440
+ successful_count = sum(1 for r in per_doc_results if r.get("_parsed") is not None)
441
+ summary["batch"] = {
442
+ "input_count": len(docs_to_parse),
443
+ "successful_count": successful_count,
444
+ "failed_count": len(per_doc_results) - successful_count,
445
+ "documents": [
446
+ {key: value for key, value in record.items() if not key.startswith("_")}
447
+ for record in per_doc_results
448
+ ],
449
+ "aggregate": {
450
+ "total_elements": sum(r.get("elements", 0) for r in per_doc_results if r.get("elements") is not None),
451
+ "total_tables": sum(r.get("tables", 0) for r in per_doc_results if r.get("tables") is not None),
452
+ "total_figures": sum(r.get("figures", 0) for r in per_doc_results if r.get("figures") is not None),
453
+ "total_chunks": sum(r.get("chunks", 0) for r in per_doc_results if r.get("chunks") is not None),
454
+ "mean_quality_score": (
455
+ sum(r.get("quality_score", 0.0) for r in per_doc_results if r.get("quality_score") is not None)
456
+ / max(1, successful_count)
457
+ ),
458
+ },
459
+ }
460
+
461
+ chunking_payload = {
462
+ "plan": parsed.provenance.get("chunking", {}),
463
+ "detail": _build_chunk_detail(parsed),
464
+ }
465
+ progress(1.0, desc="Done")
466
+
467
  return (
468
  parsed.to_markdown(),
469
  summary,
470
  parsed.quality_report.to_dict(),
471
  parsed.provenance.get("parser_metrics", {}),
472
+ chunking_payload,
473
  runtime,
474
  parsed.provenance.get("gpu_tasks", []),
475
  parsed.provenance.get("gpu_task_report", {}),
 
563
 
564
 
565
  with gr.Blocks(title="zeroshotGPU") as demo:
566
+ gr.Markdown(
567
+ "# zeroshotGPU\n\n"
568
+ "Self-hosted agentic document parser. Upload a single document, multiple "
569
+ "documents, or a `.zip` of documents (PDF / Markdown / plaintext / HTML). "
570
+ "Each parse emits canonical markdown, structured JSON, retrieval-ready "
571
+ "chunks (multi-strategy), a quality report with GT-comparison metrics "
572
+ "where applicable, and a SHA-256-checksummed artifact manifest. "
573
+ f"Per-file caps: {MAX_UPLOAD_BYTES // (1024 * 1024)} MB / "
574
+ f"{MAX_PAGE_COUNT} pages. Batch cap: {MAX_BATCH_DOCS} docs per request."
575
+ )
576
  with gr.Row():
577
+ upload = gr.File(
578
+ label="Document(s) — single file, multi-select, or .zip",
579
+ file_types=[".pdf", ".md", ".txt", ".html", ".htm", ".zip"],
580
+ file_count="multiple",
581
+ )
582
  with gr.Column():
583
  pipeline = gr.Dropdown(
584
  choices=["Docling + PyMuPDF", "Default lightweight", "Live GPU repair"],
585
  value="Docling + PyMuPDF",
586
  label="Pipeline",
587
+ info="`Docling + PyMuPDF` runs both for the disagreement signal. `Default lightweight` is text + PyMuPDF only. `Live GPU repair` enables repair.execute_gpu_escalations=true and dispatches malformed-table / OCR / figure / reading-order issues to Qwen2.5-VL.",
588
  )
589
  parse_button = gr.Button("Parse", variant="primary")
590
  archive = gr.File(label="Artifacts (zip)")
591
  with gr.Tabs():
592
  with gr.Tab("Markdown"):
593
+ gr.Markdown(
594
+ "_Canonical markdown reconstruction of the parsed document. "
595
+ "For batch uploads, this shows the first document; the full "
596
+ "set is in the artifacts zip._"
597
+ )
598
  markdown = gr.Markdown(label="Canonical Markdown")
599
  with gr.Tab("Run"):
600
+ gr.Markdown(
601
+ "_Summary, quality report, parser metrics, and artifact "
602
+ "validation. For batch uploads, `Summary.batch` lists every "
603
+ "document parsed in the request._"
604
+ )
605
  summary = gr.JSON(label="Summary")
606
  quality = gr.JSON(label="Quality Report")
607
  parser_metrics = gr.JSON(label="Parser Metrics")
 
608
  artifact_validation = gr.JSON(label="Artifact Manifest Validation")
609
+ with gr.Tab("Chunks"):
610
+ gr.Markdown(
611
+ "_Per-strategy chunk breakdown: counts, token-count "
612
+ "distribution (min / median / max), and three sample chunks "
613
+ "with previews per strategy. The full chunks.jsonl is in the "
614
+ "Artifacts tab and inside the zip._\n\n"
615
+ "Strategies emitted by default: `fixed_token_baseline`, "
616
+ "`recursive_structure`, `parent_child` (with linked parent / "
617
+ "child IDs), `page_level`, plus `table` / `figure` chunks "
618
+ "with provenance. `semantic`, `late`, `vision_guided`, and "
619
+ "`agentic_proposition` are config-gated stubs that emit "
620
+ "deterministic candidates marked for backend replacement."
621
+ )
622
+ chunking = gr.JSON(label="Chunking plan + per-strategy detail")
623
  with gr.Tab("Artifacts"):
624
  gr.Markdown(
625
  "Each top-level artifact is downloadable individually. "
tests/test_app.py CHANGED
@@ -137,5 +137,122 @@ class UploadGuardTests(unittest.TestCase):
137
  self.assertNotIn("rejected", summary)
138
 
139
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  if __name__ == "__main__":
141
  unittest.main()
 
137
  self.assertNotIn("rejected", summary)
138
 
139
 
140
class BatchAndZipUploadTests(unittest.TestCase):
    """Batch-mode coverage for ``parse_uploaded_document``.

    Exercises zip uploads, multi-file list uploads, extension filtering
    inside a zip, and the shape of the chunk-detail payload.
    """

    def setUp(self):
        # Every test in this class needs the Space app module to be importable.
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)

    @staticmethod
    def _write_markdown(path, text):
        """Write ``text`` to ``path`` as UTF-8 and return the path."""
        path.write_text(text, encoding="utf-8")
        return path

    @staticmethod
    def _zip_members(zip_path, members):
        """Create ``zip_path`` containing ``members`` stored under their file names."""
        import zipfile

        with zipfile.ZipFile(zip_path, "w") as archive:
            for member in members:
                archive.write(member, arcname=member.name)

    def test_zip_upload_extracts_and_parses_each_doc(self):
        with tempfile.TemporaryDirectory() as tmp:
            folder = Path(tmp)
            # Build a small zip with two markdown docs.
            members = [
                self._write_markdown(folder / "a.md", "# Doc A\n\nFirst.\n"),
                self._write_markdown(folder / "b.md", "# Doc B\n\nSecond.\n"),
            ]
            bundle = folder / "batch.zip"
            self._zip_members(bundle, members)

            outputs = space_app.parse_uploaded_document(
                _UploadedFile(str(bundle)), "Default lightweight"
            )

            # Tuple width unchanged.
            self.assertEqual(len(outputs), 11)
            batch = outputs[1].get("batch")
            # Batch metadata recorded.
            self.assertIn("batch", outputs[1])
            self.assertEqual(batch["input_count"], 2)
            self.assertEqual(batch["successful_count"], 2)
            self.assertEqual(batch["failed_count"], 0)
            self.assertEqual(len(batch["documents"]), 2)
            # Aggregate metrics populated.
            aggregate = batch["aggregate"]
            self.assertGreater(aggregate["total_chunks"], 0)
            self.assertGreater(aggregate["mean_quality_score"], 0.0)

    def test_multiple_files_uploaded_as_list(self):
        with tempfile.TemporaryDirectory() as tmp:
            folder = Path(tmp)
            first = self._write_markdown(folder / "one.md", "# One\n\nFirst doc.\n")
            second = self._write_markdown(folder / "two.md", "# Two\n\nSecond doc.\n")

            outputs = space_app.parse_uploaded_document(
                [_UploadedFile(str(first)), _UploadedFile(str(second))],
                "Default lightweight",
            )

            run_summary = outputs[1]
            self.assertIn("batch", run_summary)
            self.assertEqual(run_summary["batch"]["input_count"], 2)

    def test_zip_with_unsupported_files_filtered_out(self):
        with tempfile.TemporaryDirectory() as tmp:
            folder = Path(tmp)
            bundle = folder / "mixed.zip"
            members = [
                self._write_markdown(folder / "first.md", "# First\n\nContent A.\n"),
                self._write_markdown(folder / "second.md", "# Second\n\nContent B.\n"),
            ]
            junk = folder / "ignore.exe"
            junk.write_bytes(b"\x00\x01")
            members.append(junk)
            self._zip_members(bundle, members)

            outputs = space_app.parse_uploaded_document(
                _UploadedFile(str(bundle)), "Default lightweight"
            )

            run_summary = outputs[1]
            # The two .md files parsed; the .exe was filtered out before parsing.
            self.assertIn("batch", run_summary)
            self.assertEqual(run_summary["batch"]["input_count"], 2)
            self.assertEqual(run_summary["batch"]["successful_count"], 2)

    def test_chunk_detail_payload_present(self):
        body = "".join(
            [
                "# Rich Doc\n\n",
                "First paragraph with some prose to chunk.\n\n",
                "Second paragraph with different content for variety.\n\n",
                "| A | B |\n| --- | --- |\n| 1 | 2 |\n",
            ]
        )
        with tempfile.TemporaryDirectory() as tmp:
            doc = self._write_markdown(Path(tmp) / "rich.md", body)
            outputs = space_app.parse_uploaded_document(
                _UploadedFile(str(doc)), "Default lightweight"
            )

            chunking_payload = outputs[4]
            for top_level_key in ("plan", "detail"):
                self.assertIn(top_level_key, chunking_payload)
            detail = chunking_payload["detail"]
            self.assertGreater(detail["total_chunks"], 0)
            self.assertIn("strategies", detail)
            # Each strategy block has the expected shape.
            for strategy_block in detail["strategies"].values():
                self.assertIn("count", strategy_block)
                self.assertIn("samples", strategy_block)
                self.assertIn("token_count_min", strategy_block)
255
+
256
+
257
  if __name__ == "__main__":
258
  unittest.main()